(converter) Refactor sideloaders to improve feature handling and keyword logic

Centralized HTML feature handling via `applyFeatures` in StackexchangeSideloader and added dynamic synthetic-term generation. Improved the generated HTML structure in RedditSideloader and enhanced metadata processing with feature-based keywords. Updated DomainLinks to compute link counts correctly by summing individual link occurrences rather than counting list entries.
This commit is contained in:
Viktor Lofgren 2024-12-11 16:01:38 +01:00
parent 73861e613f
commit 5002870d1f
3 changed files with 49 additions and 26 deletions

View File

@ -41,7 +41,13 @@ public class DomainLinks {
/** Returns the number of links to the given url. */ /** Returns the number of links to the given url. */
public int countForUrl(EdgeUrl url) { public int countForUrl(EdgeUrl url) {
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
return links.getOrDefault(key, List.of()).size();
int cnt = 0;
for (var link : links.getOrDefault(key, List.of())) {
cnt += link.count();
}
return cnt;
} }
@Override @Override

View File

@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit;
import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.util.ProcessingIterator; import nu.marginalia.util.ProcessingIterator;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class); private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
private final List<Path> dbFiles; private final List<Path> dbFiles;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final AnchorTextKeywords anchorTextKeywords; private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing; private final SideloaderProcessing sideloaderProcessing;
public RedditSideloader(List<Path> listToDbFiles, public RedditSideloader(List<Path> listToDbFiles,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords, AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) { SideloaderProcessing sideloaderProcessing) {
this.dbFiles = listToDbFiles; this.dbFiles = listToDbFiles;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.anchorTextKeywords = anchorTextKeywords; this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing; this.sideloaderProcessing = sideloaderProcessing;
} }
@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC) .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
.getYear(); .getYear();
String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n"; String fullHtml = """
<!DOCTYPE html>
<html>
<head>
<title>%s</title>
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
</head>
<body>
<h1>%s</h1>
<h2>reddit r/%s %s</h2>
<article>
<p>%s</p>
</article>
</body>
</html>
""".formatted(title, title, subreddit, subreddit, body);
List<String> extraKeywords = new ArrayList<>(); List<String> extraKeywords = new ArrayList<>();
extraKeywords.add("reddit");
extraKeywords.add(subreddit);
extraKeywords.add("r/" + subreddit);
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) { if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
extraKeywords.add(author); extraKeywords.add(author);
} }
@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {
if (doc.isProcessedFully()) { if (doc.isProcessedFully()) {
for (var keyword : extraKeywords) { // Insert topology information
doc.words.addMeta(keyword, WordFlags.Subjects.asBit()); if (doc.details != null) {
doc.details.metadata.withSizeAndTopology(50_000_000, score);
} }
// Insert topology information if (doc.words != null) {
doc.details.metadata.withSizeAndTopology(50_000_000, score); doc.words.addAllSyntheticTerms(List.of("generator:forum",
HtmlFeature.COOKIES.getKeyword(),
HtmlFeature.JS.getKeyword(),
HtmlFeature.TRACKING_ADTECH.getKeyword()
));
}
} }

View File

@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays; import java.util.*;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
private final DocumentKeywordExtractor keywordExtractor; private final DocumentKeywordExtractor keywordExtractor;
private final String domainName; private final String domainName;
private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
private final Path dbFile; private final Path dbFile;
public StackexchangeSideloader(Path pathToDbFile, public StackexchangeSideloader(Path pathToDbFile,
@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {
ret.url = url; ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url); ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
ret.words.addAllSyntheticTerms(List.of(
"site:" + domainName, List<String> syntheticTerms = new ArrayList<>(
List.of("site:" + domainName,
"site:" + url.domain.topDomain, "site:" + url.domain.topDomain,
url.domain.topDomain, url.domain.topDomain,
domainName domainName)
)); );
for (HtmlFeature feature : applyFeatures) {
syntheticTerms.add(feature.getKeyword());
}
ret.words.addAllSyntheticTerms(syntheticTerms);
if (!post.tags().isBlank()) { if (!post.tags().isBlank()) {
List<String> subjects = Arrays.asList(post.tags().split(",")); List<String> subjects = Arrays.asList(post.tags().split(","));
@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
PubDate.toYearByte(ret.details.pubYear), PubDate.toYearByte(ret.details.pubYear),
(int) -ret.details.quality, (int) -ret.details.quality,
EnumSet.of(DocumentFlags.GeneratorDocs)); EnumSet.of(DocumentFlags.GeneratorDocs));
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); ret.details.features = applyFeatures;
ret.details.metadata.withSizeAndTopology(10000, 0); ret.details.metadata.withSizeAndTopology(10000, 0);