Processor fixes: Excluding phpinfo() pages and Mastodon feeds from full processing.

This commit is contained in:
vlofgren 2022-09-16 18:05:54 +02:00
parent 13c8305dc2
commit 179d54d50a
3 changed files with 33 additions and 5 deletions

View File

@ -167,6 +167,7 @@ public class DocumentKeywordExtractor {
if (lc.length() > 6
&& lc.indexOf('@') > 0
&& mailLikePattern.matcher(lc).matches()) {
reps.add(lc);
String domain = lc.substring(lc.indexOf('@'));

View File

@ -176,6 +176,8 @@ public class DocumentProcessor {
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
}
final EdgeUrl url = new EdgeUrl(crawledDocument.url);
Document prunedDoc = doc.clone();
prunedDoc.getElementsByTag("svg").remove();
@ -194,10 +196,12 @@ public class DocumentProcessor {
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
EdgePageWordSet words;
if (doSimpleProcessing) {
if (shouldDoSimpleProcessing(url, ret)) {
/* Some documents we'll index, but only superficially. This is a compromise
that keeps them discoverable while making sure they only show up for
specific queries. This also saves a lot of processing power.
*/
ret.features = Set.of(HtmlFeature.UNKNOWN);
words = keywordExtractor.extractKeywordsMinimal(dld);
ret.description = "";
@ -208,7 +212,6 @@ public class DocumentProcessor {
ret.description = getDescription(doc);
}
var url = new EdgeUrl(crawledDocument.url);
addMetaWords(ret, url, crawledDomain, words);
getLinks(url, ret, doc, words);
@ -216,6 +219,29 @@ public class DocumentProcessor {
return new DetailsWithWords(ret, words);
}
/**
 * Decides whether a document should only receive the cheap, superficial
 * keyword extraction instead of full processing.
 *
 * @param url the document's URL; only its {@code path} is inspected
 * @param ret the details gathered so far; {@code quality} and {@code title} are read
 * @return {@code true} if the quality is below {@code minDocumentQuality},
 *         the title is exactly {@code "phpinfo()"}, or the path has the
 *         form {@code /@name} with an optional trailing slash
 */
private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) {
    if (ret.quality < minDocumentQuality) {
        return true;
    }

    // These pages shouldn't be publicly accessible
    if ("phpinfo()".equals(ret.title)) {
        return true;
    }

    // Urls that look like /@foo are typically Mastodon or other twitter-like feeds,
    // we don't want to index them because they change so rapidly; subdirectories are
    // fine though
    return isTopLevelAtPath(url.path);
}

/**
 * Tests whether {@code path} consists of {@code /@}, followed by one or more
 * non-slash characters, followed by at most one trailing slash.
 *
 * <p>Accepts exactly the strings that fully match {@code ^/@[^/]+/?$}, but
 * uses plain index checks so no regular expression has to be compiled on
 * each call.
 */
private static boolean isTopLevelAtPath(String path) {
    if (!path.startsWith("/@")) {
        return false;
    }

    final int firstSlash = path.indexOf('/', 2);
    if (firstSlash < 0) {
        // No further slash: need at least one character after "/@"
        return path.length() > 2;
    }

    // A slash directly after "/@" leaves the name empty; any slash that is not
    // the final character means the path points into a subdirectory.
    return firstSlash > 2 && firstSlash == path.length() - 1;
}
private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
List<String> tagWords = new ArrayList<>();

View File

@ -4,7 +4,8 @@ import nu.marginalia.util.language.LanguageFilter;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
class LanguageFilterTest {