diff --git a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java index 14e6ad99..0d6d8a8d 100644 --- a/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java @@ -41,7 +41,13 @@ public class DomainLinks { /** Returns the number of links to the given url. */ public int countForUrl(EdgeUrl url) { String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); - return links.getOrDefault(key, List.of()).size(); + + int cnt = 0; + for (var link : links.getOrDefault(key, List.of())) { + cnt += link.count(); + } + + return cnt; } @Override diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java index 61ccf09f..61fc9e32 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java @@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit; import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; @@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; @@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource { private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class); private final List dbFiles; - private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final AnchorTextKeywords anchorTextKeywords; private final SideloaderProcessing sideloaderProcessing; public RedditSideloader(List listToDbFiles, - AnchorTagsSourceFactory anchorTagsSourceFactory, AnchorTextKeywords anchorTextKeywords, SideloaderProcessing sideloaderProcessing) { this.dbFiles = listToDbFiles; - this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.anchorTextKeywords = anchorTextKeywords; this.sideloaderProcessing = sideloaderProcessing; } @@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource { .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC) .getYear(); - String fullHtml = "\n\n\n " + title + "\n \n\n\n

" + title + "

\n
\n

" + body + "

\n
\n\n\n"; + String fullHtml = """ + + + + %s + + + +

%s

+

reddit r/%s %s

+
+

%s

+
+ + + """.formatted(title, title, subreddit, subreddit, body); List extraKeywords = new ArrayList<>(); - extraKeywords.add("reddit"); - extraKeywords.add(subreddit); - extraKeywords.add("r/" + subreddit); - if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) { extraKeywords.add(author); } @@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource { if (doc.isProcessedFully()) { - for (var keyword : extraKeywords) { - doc.words.addMeta(keyword, WordFlags.Subjects.asBit()); + // Insert topology information + if (doc.details != null) { + doc.details.metadata.withSizeAndTopology(50_000_000, score); } - // Insert topology information - doc.details.metadata.withSizeAndTopology(50_000_000, score); + if (doc.words != null) { + doc.words.addAllSyntheticTerms(List.of("generator:forum", + HtmlFeature.COOKIES.getKeyword(), + HtmlFeature.JS.getKeyword(), + HtmlFeature.TRACKING_ADTECH.getKeyword() + )); + } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index bf4d21f1..c42443b3 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import java.nio.file.Path; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.Iterator; -import java.util.List; +import java.util.*; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.TimeUnit; @@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource { private final DocumentKeywordExtractor keywordExtractor; private final String domainName; + private final EnumSet applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); + private final Path dbFile; public StackexchangeSideloader(Path pathToDbFile, @@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource { ret.url = url; ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url); - ret.words.addAllSyntheticTerms(List.of( - "site:" + domainName, - "site:" + url.domain.topDomain, - url.domain.topDomain, - domainName - )); + + List syntheticTerms = new ArrayList<>( + List.of("site:" + domainName, + "site:" + url.domain.topDomain, + url.domain.topDomain, + domainName) + ); + for (HtmlFeature feature : applyFeatures) { + syntheticTerms.add(feature.getKeyword()); + } + ret.words.addAllSyntheticTerms(syntheticTerms); if (!post.tags().isBlank()) { List subjects = Arrays.asList(post.tags().split(",")); @@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource { PubDate.toYearByte(ret.details.pubYear), (int) -ret.details.quality, EnumSet.of(DocumentFlags.GeneratorDocs)); - ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); + ret.details.features = applyFeatures; ret.details.metadata.withSizeAndTopology(10000, 0);