From 179d54d50aaca563e230b81822ac72c1c5136827 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Fri, 16 Sep 2022 18:05:54 +0200 Subject: [PATCH] Processor fixes: Excluding phpinfo()-pages, mastodon feeds. --- .../processing/DocumentKeywordExtractor.java | 1 + .../processor/DocumentProcessor.java | 34 ++++++++++++++++--- .../edge/crawling/LanguageFilterTest.java | 3 +- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 58b7c198..e3da3db8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -167,6 +167,7 @@ public class DocumentKeywordExtractor { if (lc.length() > 6 && lc.indexOf('@') > 0 && mailLikePattern.matcher(lc).matches()) { + reps.add(lc); String domain = lc.substring(lc.indexOf('@')); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index 99c344cd..871185f0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -176,6 +176,8 @@ public class DocumentProcessor { throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); } + final EdgeUrl url = new EdgeUrl(crawledDocument.url); + Document prunedDoc = doc.clone(); prunedDoc.getElementsByTag("svg").remove(); @@ -194,10 +196,12 @@ public class DocumentProcessor { ret.quality = documentValuator.getQuality(ret.standard, doc, dld); ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong(); - final boolean 
doSimpleProcessing = ret.quality < minDocumentQuality; - EdgePageWordSet words; - if (doSimpleProcessing) { + if (shouldDoSimpleProcessing(url, ret)) { + /* Some documents we'll index, but only superficially. This is a compromise + to allow them to be discoverable, without having them show up without specific + queries. This also saves a lot of processing power. + */ ret.features = Set.of(HtmlFeature.UNKNOWN); words = keywordExtractor.extractKeywordsMinimal(dld); ret.description = ""; @@ -208,7 +212,6 @@ public class DocumentProcessor { ret.description = getDescription(doc); } - var url = new EdgeUrl(crawledDocument.url); addMetaWords(ret, url, crawledDomain, words); getLinks(url, ret, doc, words); @@ -216,6 +219,29 @@ public class DocumentProcessor { return new DetailsWithWords(ret, words); } + private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) { + if (ret.quality < minDocumentQuality) { + return true; + } + + // These pages shouldn't be publicly accessible + if ("phpinfo()".equals(ret.title)) { + return true; + } + + // Urls that look like /@foo are typically Mastodon or other twitter-like feeds, + // we don't want to index them because they change so rapidly; subdirectories are + // fine though + // + // The first startsWith criterion is a performance optimization; even with a compiled + // pattern, it is something like 50x faster + if (url.path.startsWith("/@") && url.path.matches("^/@[^/]+/?$")) { + return true; + } + + return false; + } + private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) { List tagWords = new ArrayList<>(); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java index 93171310..7a0abab3 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java +++
b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java @@ -4,7 +4,8 @@ import nu.marginalia.util.language.LanguageFilter; import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; class LanguageFilterTest {