Processor fixes: index phpinfo() pages and Mastodon-style feeds only superficially.

2025-02-23 13:09:00 +00:00 · 2022-09-16 18:05:54 +02:00 · 2022-09-16 18:05:54 +02:00 · 179d54d50a
commit 179d54d50a
parent 13c8305dc2
3 changed files with 33 additions and 5 deletions
--- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java
@ -167,6 +167,7 @@ public class DocumentKeywordExtractor {
                if (lc.length() > 6
                    && lc.indexOf('@') > 0
                    && mailLikePattern.matcher(lc).matches()) {
+
                    reps.add(lc);

                    String domain = lc.substring(lc.indexOf('@'));
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java
@ -176,6 +176,8 @@ public class DocumentProcessor {
            throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
        }

+        final EdgeUrl url = new EdgeUrl(crawledDocument.url);
+
        Document prunedDoc = doc.clone();

        prunedDoc.getElementsByTag("svg").remove();
@ -194,10 +196,12 @@ public class DocumentProcessor {
        ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
        ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

-        final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
-
        EdgePageWordSet words;
-        if (doSimpleProcessing) {
+        if (shouldDoSimpleProcessing(url, ret)) {
+            /* Some documents we'll index, but only superficially. This is a compromise
+               that keeps them discoverable through specific queries without letting them
+               show up in more general searches. It also saves a lot of processing power.
+             */
            ret.features = Set.of(HtmlFeature.UNKNOWN);
            words = keywordExtractor.extractKeywordsMinimal(dld);
            ret.description = "";
@ -208,7 +212,6 @@ public class DocumentProcessor {
            ret.description = getDescription(doc);
        }

-        var url = new EdgeUrl(crawledDocument.url);
        addMetaWords(ret, url, crawledDomain, words);

        getLinks(url, ret, doc, words);
@ -216,6 +219,29 @@ public class DocumentProcessor {
        return new DetailsWithWords(ret, words);
    }

+    private boolean shouldDoSimpleProcessing(EdgeUrl url, ProcessedDocumentDetails ret) {
+        if (ret.quality < minDocumentQuality) { // quality is below the configured minimum
+            return true;
+        }
+
+        // Pages titled exactly "phpinfo()" shouldn't be publicly accessible in the first place
+        if ("phpinfo()".equals(ret.title)) { // constant-first equals: safe if title is null
+            return true;
+        }
+
+        // URLs that look like /@foo are typically Mastodon or other Twitter-like feeds;
+        // we don't want to index them fully because they change so rapidly. Deeper
+        // paths such as /@foo/bar do not match the pattern below and are fine, though.
+        //
+        // The leading startsWith check is a performance optimization: even compared with
+        // a compiled pattern it is something like 50x faster.
+        if (url.path.startsWith("/@") && url.path.matches("^/@[^/]+/?$")) {
+            return true;
+        }
+
+        return false; // no rule matched: the caller takes the regular processing branch
+    }
+
    private void addMetaWords(ProcessedDocumentDetails ret, EdgeUrl url, CrawledDomain domain, EdgePageWordSet words) {
        List<String> tagWords = new ArrayList<>();

--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/LanguageFilterTest.java
@ -4,7 +4,8 @@ import nu.marginalia.util.language.LanguageFilter;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;

-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;

 class LanguageFilterTest {