mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Cleaning the code a bit, fix URL loading bug with multiple fragments in URL
This commit is contained in:
parent
5dd61387bf
commit
3fd48e0e53
@ -3,7 +3,6 @@ package nu.marginalia.wmsa.edge.converting.processor;
|
|||||||
import com.google.common.base.Strings;
|
import com.google.common.base.Strings;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
|
||||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
|
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||||
@ -110,22 +109,6 @@ public class DomainProcessor {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private double getAverageQuality(List<ProcessedDocument> documents) {
|
|
||||||
int n = 0;
|
|
||||||
double q = 0.;
|
|
||||||
for (var doc : documents) {
|
|
||||||
if (doc.quality().isPresent()) {
|
|
||||||
n++;
|
|
||||||
q += doc.quality().getAsDouble();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (n > 0) {
|
|
||||||
return q / n;
|
|
||||||
}
|
|
||||||
return -5.;
|
|
||||||
}
|
|
||||||
|
|
||||||
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
||||||
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
|
return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
|
||||||
case OK -> EdgeDomainIndexingState.ACTIVE;
|
case OK -> EdgeDomainIndexingState.ACTIVE;
|
||||||
|
@ -41,24 +41,35 @@ public class EdgeUrl implements WideHashable {
|
|||||||
|
|
||||||
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
private static Pattern badCharPattern = Pattern.compile("[ \t\n\"<>\\[\\]()',|]");
|
||||||
|
|
||||||
|
/* Java's URI parser is a bit too strict in throwing exceptions when there's an error.
|
||||||
|
|
||||||
|
Here on the Internet, standards are like the picture on the box of the frozen pizza,
|
||||||
|
and what you get is more like what's on the inside, we try to patch things instead,
|
||||||
|
just give it a best-effort attempt at cleaning out broken or unnecessary constructions
|
||||||
|
like bad or missing URLEncoding
|
||||||
|
*/
|
||||||
public static String urlencodeFixer(String url) throws URISyntaxException {
|
public static String urlencodeFixer(String url) throws URISyntaxException {
|
||||||
var s = new StringBuilder();
|
var s = new StringBuilder();
|
||||||
String goodChars = "&.?:/-;+$#";
|
String goodChars = "&.?:/-;+$#";
|
||||||
String hexChars = "0123456789abcdefABCDEF";
|
String hexChars = "0123456789abcdefABCDEF";
|
||||||
|
|
||||||
int pathIdx = findPathIdx(url);
|
int pathIdx = findPathIdx(url);
|
||||||
if (pathIdx < 0) {
|
if (pathIdx < 0) { // url looks like http://marginalia.nu
|
||||||
return url;
|
return url + "/";
|
||||||
}
|
}
|
||||||
s.append(url, 0, pathIdx);
|
s.append(url, 0, pathIdx);
|
||||||
|
|
||||||
for (int i = pathIdx; i < url.length(); i++) {
|
// We don't want the fragment, and multiple fragments break the Java URI parser for some reason
|
||||||
|
int end = url.indexOf("#");
|
||||||
|
if (end < 0) end = url.length();
|
||||||
|
|
||||||
|
for (int i = pathIdx; i < end; i++) {
|
||||||
int c = url.charAt(i);
|
int c = url.charAt(i);
|
||||||
|
|
||||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||||
s.appendCodePoint(c);
|
s.appendCodePoint(c);
|
||||||
}
|
}
|
||||||
else if (c == '%' && i+2<url.length()) {
|
else if (c == '%' && i+2<end) {
|
||||||
int cn = url.charAt(i+1);
|
int cn = url.charAt(i+1);
|
||||||
int cnn = url.charAt(i+2);
|
int cnn = url.charAt(i+2);
|
||||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
||||||
|
@ -27,6 +27,7 @@ class EdgeUrlTest {
|
|||||||
}
|
}
|
||||||
@Test
|
@Test
|
||||||
void urlencodeFixer() throws URISyntaxException {
|
void urlencodeFixer() throws URISyntaxException {
|
||||||
|
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/#heredoc"));
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%-sign"));
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/%22-sign"));
|
||||||
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
System.out.println(EdgeUrl.urlencodeFixer("https://www.example.com/\n \"huh\""));
|
||||||
|
Loading…
Reference in New Issue
Block a user