(converter) Refactor sideloaders to improve feature handling and keyword logic

Centralized HTML feature handling via `applyFeatures` in StackexchangeSideloader and added dynamic synthetic-term generation. Improved the generated HTML structure in RedditSideloader and enhanced metadata processing with feature-based keywords. Updated DomainLinks to compute link counts correctly by summing individual link occurrences rather than counting list entries.
This commit is contained in:
Viktor Lofgren 2024-12-11 16:01:38 +01:00
parent 73861e613f
commit 5002870d1f
3 changed files with 49 additions and 26 deletions

View File

@ -41,7 +41,13 @@ public class DomainLinks {
/** Returns the number of links to the given url. */ /** Returns the number of links to the given url. */
public int countForUrl(EdgeUrl url) { public int countForUrl(EdgeUrl url) {
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param); String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
return links.getOrDefault(key, List.of()).size();
int cnt = 0;
for (var link : links.getOrDefault(key, List.of())) {
cnt += link.count();
}
return cnt;
} }
@Override @Override

View File

@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit;
import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.util.ProcessingIterator; import nu.marginalia.util.ProcessingIterator;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class); private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
private final List<Path> dbFiles; private final List<Path> dbFiles;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final AnchorTextKeywords anchorTextKeywords; private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing; private final SideloaderProcessing sideloaderProcessing;
public RedditSideloader(List<Path> listToDbFiles, public RedditSideloader(List<Path> listToDbFiles,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords, AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) { SideloaderProcessing sideloaderProcessing) {
this.dbFiles = listToDbFiles; this.dbFiles = listToDbFiles;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.anchorTextKeywords = anchorTextKeywords; this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing; this.sideloaderProcessing = sideloaderProcessing;
} }
@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC) .ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
.getYear(); .getYear();
String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n"; String fullHtml = """
<!DOCTYPE html>
<html>
<head>
<title>%s</title>
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
</head>
<body>
<h1>%s</h1>
<h2>reddit r/%s %s</h2>
<article>
<p>%s</p>
</article>
</body>
</html>
""".formatted(title, title, subreddit, subreddit, body);
List<String> extraKeywords = new ArrayList<>(); List<String> extraKeywords = new ArrayList<>();
extraKeywords.add("reddit");
extraKeywords.add(subreddit);
extraKeywords.add("r/" + subreddit);
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) { if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
extraKeywords.add(author); extraKeywords.add(author);
} }
@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {
if (doc.isProcessedFully()) { if (doc.isProcessedFully()) {
for (var keyword : extraKeywords) { // Insert topology information
doc.words.addMeta(keyword, WordFlags.Subjects.asBit()); if (doc.details != null) {
doc.details.metadata.withSizeAndTopology(50_000_000, score);
} }
// Insert topology information if (doc.words != null) {
doc.details.metadata.withSizeAndTopology(50_000_000, score); doc.words.addAllSyntheticTerms(List.of("generator:forum",
HtmlFeature.COOKIES.getKeyword(),
HtmlFeature.JS.getKeyword(),
HtmlFeature.TRACKING_ADTECH.getKeyword()
));
}
} }

View File

@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Arrays; import java.util.*;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
private final DocumentKeywordExtractor keywordExtractor; private final DocumentKeywordExtractor keywordExtractor;
private final String domainName; private final String domainName;
private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
private final Path dbFile; private final Path dbFile;
public StackexchangeSideloader(Path pathToDbFile, public StackexchangeSideloader(Path pathToDbFile,
@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {
ret.url = url; ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url); ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
ret.words.addAllSyntheticTerms(List.of(
"site:" + domainName, List<String> syntheticTerms = new ArrayList<>(
List.of("site:" + domainName,
"site:" + url.domain.topDomain, "site:" + url.domain.topDomain,
url.domain.topDomain, url.domain.topDomain,
domainName domainName)
)); );
for (HtmlFeature feature : applyFeatures) {
syntheticTerms.add(feature.getKeyword());
}
ret.words.addAllSyntheticTerms(syntheticTerms);
if (!post.tags().isBlank()) { if (!post.tags().isBlank()) {
List<String> subjects = Arrays.asList(post.tags().split(",")); List<String> subjects = Arrays.asList(post.tags().split(","));
@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
PubDate.toYearByte(ret.details.pubYear), PubDate.toYearByte(ret.details.pubYear),
(int) -ret.details.quality, (int) -ret.details.quality,
EnumSet.of(DocumentFlags.GeneratorDocs)); EnumSet.of(DocumentFlags.GeneratorDocs));
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING); ret.details.features = applyFeatures;
ret.details.metadata.withSizeAndTopology(10000, 0); ret.details.metadata.withSizeAndTopology(10000, 0);