(converter) Refactor sideloaders to improve feature handling and keyword logic

Centralized HTML feature handling with `applyFeatures` in StackexchangeSideloader and added dynamic synthetic term generation. Improved HTML structure in RedditSideloader and enhanced metadata processing with feature-based keywords. Updated DomainLinks to correctly compute link counts using individual link occurrences.
Viktor Lofgren 2024-12-11 16:01:38 +01:00
parent 73861e613f
commit 5002870d1f
3 changed files with 49 additions and 26 deletions


@@ -41,7 +41,13 @@ public class DomainLinks {
/** Returns the number of links to the given url. */
public int countForUrl(EdgeUrl url) {
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
return links.getOrDefault(key, List.of()).size();
int cnt = 0;
for (var link : links.getOrDefault(key, List.of())) {
cnt += link.count();
}
return cnt;
}
@Override
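
The behavioural difference is easiest to see with a worked example. Below is a minimal, self-contained sketch of the new counting logic; the Link record and the sample data are illustrative stand-ins for the actual types stored in DomainLinks, not the real Marginalia classes.

import java.util.List;
import java.util.Map;

// Sketch only: Link is a hypothetical stand-in for the record type kept in
// DomainLinks' link map; it is not the real Marginalia class.
class LinkCountExample {
    record Link(String source, int count) {}

    static final Map<String, List<Link>> links = Map.of(
            "example.com/page", List.of(new Link("a.com", 3), new Link("b.com", 2)));

    static int countForUrl(String key) {
        int cnt = 0;
        // Sum each link's own occurrence count instead of taking the list size,
        // so repeated occurrences of a link are all reflected in the total.
        for (var link : links.getOrDefault(key, List.of())) {
            cnt += link.count();
        }
        return cnt;
    }

    public static void main(String[] args) {
        // The old .size()-based logic would report 2 here; the new logic reports 5.
        System.out.println(countForUrl("example.com/page"));
    }
}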


@@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
@@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.util.ProcessingIterator;
import org.apache.commons.lang3.StringUtils;
@@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
private final List<Path> dbFiles;
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
private final AnchorTextKeywords anchorTextKeywords;
private final SideloaderProcessing sideloaderProcessing;
public RedditSideloader(List<Path> listToDbFiles,
AnchorTagsSourceFactory anchorTagsSourceFactory,
AnchorTextKeywords anchorTextKeywords,
SideloaderProcessing sideloaderProcessing) {
this.dbFiles = listToDbFiles;
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
this.anchorTextKeywords = anchorTextKeywords;
this.sideloaderProcessing = sideloaderProcessing;
}
@@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
.getYear();
String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n";
String fullHtml = """
<!DOCTYPE html>
<html>
<head>
<title>%s</title>
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
</head>
<body>
<h1>%s</h1>
<h2>reddit r/%s %s</h2>
<article>
<p>%s</p>
</article>
</body>
</html>
""".formatted(title, title, subreddit, subreddit, body);
List<String> extraKeywords = new ArrayList<>();
extraKeywords.add("reddit");
extraKeywords.add(subreddit);
extraKeywords.add("r/" + subreddit);
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
extraKeywords.add(author);
}
@@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {
if (doc.isProcessedFully()) {
for (var keyword : extraKeywords) {
doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
// Insert topology information
if (doc.details != null) {
doc.details.metadata.withSizeAndTopology(50_000_000, score);
}
// Insert topology information
doc.details.metadata.withSizeAndTopology(50_000_000, score);
if (doc.words != null) {
doc.words.addAllSyntheticTerms(List.of("generator:forum",
HtmlFeature.COOKIES.getKeyword(),
HtmlFeature.JS.getKeyword(),
HtmlFeature.TRACKING_ADTECH.getKeyword()
));
}
}
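
For context, the new Reddit document assembly reduces to a small pattern: a Java text block filled in with String.formatted(), plus a list of extra keywords alongside the extracted ones. The sketch below only illustrates that shape; the sample values and the surrounding class are invented for the example and are not the real sideloader code.

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch of the synthetic page and keyword handling; the sample
// values are made up for the example, not taken from real Reddit data.
class RedditPageSketch {
    public static void main(String[] args) {
        String title = "Example post";
        String subreddit = "example";
        String author = "someuser";
        String body = "Post body text.";

        // A text block with %s placeholders, filled in via String.formatted().
        String fullHtml = """
                <!DOCTYPE html>
                <html>
                <head>
                  <title>%s</title>
                </head>
                <body>
                  <h1>%s</h1>
                  <h2>reddit r/%s %s</h2>
                  <article><p>%s</p></article>
                </body>
                </html>
                """.formatted(title, title, subreddit, subreddit, body);

        // Keywords attached to the document in addition to the extracted ones.
        List<String> extraKeywords = new ArrayList<>();
        extraKeywords.add("reddit");
        extraKeywords.add(subreddit);
        extraKeywords.add("r/" + subreddit);
        if (!author.isBlank() && !author.equals("[deleted]")) {
            extraKeywords.add(author);
        }

        System.out.println(fullHtml);
        System.out.println(extraKeywords);
    }
}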


@@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.*;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
@@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
private final DocumentKeywordExtractor keywordExtractor;
private final String domainName;
private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
private final Path dbFile;
public StackexchangeSideloader(Path pathToDbFile,
@@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {
ret.url = url;
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
ret.words.addAllSyntheticTerms(List.of(
"site:" + domainName,
"site:" + url.domain.topDomain,
url.domain.topDomain,
domainName
));
List<String> syntheticTerms = new ArrayList<>(
List.of("site:" + domainName,
"site:" + url.domain.topDomain,
url.domain.topDomain,
domainName)
);
for (HtmlFeature feature : applyFeatures) {
syntheticTerms.add(feature.getKeyword());
}
ret.words.addAllSyntheticTerms(syntheticTerms);
if (!post.tags().isBlank()) {
List<String> subjects = Arrays.asList(post.tags().split(","));
@@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
PubDate.toYearByte(ret.details.pubYear),
(int) -ret.details.quality,
EnumSet.of(DocumentFlags.GeneratorDocs));
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
ret.details.features = applyFeatures;
ret.details.metadata.withSizeAndTopology(10000, 0);
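
Taken together, the StackexchangeSideloader changes follow one pattern: declare the applicable features once in an EnumSet and derive both the synthetic search terms and the document's feature set from it. Below is a reduced sketch of that pattern; the Feature enum and its keyword strings are hypothetical stand-ins for HtmlFeature and its getKeyword() values, not the actual Marginalia definitions.

import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;

// Sketch of the centralized feature-handling pattern; Feature and its keyword
// strings are invented for the example and do not mirror HtmlFeature exactly.
class FeatureTermsSketch {
    enum Feature {
        JS("special:scripts"),
        TRACKING("special:tracking");

        private final String keyword;
        Feature(String keyword) { this.keyword = keyword; }
        String getKeyword() { return keyword; }
    }

    // Declared once, then reused for both synthetic terms and document features.
    static final EnumSet<Feature> applyFeatures = EnumSet.of(Feature.JS, Feature.TRACKING);

    static List<String> syntheticTerms(String domainName) {
        List<String> terms = new ArrayList<>(List.of("site:" + domainName, domainName));
        for (Feature feature : applyFeatures) {
            terms.add(feature.getKeyword());
        }
        return terms;
    }

    public static void main(String[] args) {
        // Both the term list and the feature set come from the same EnumSet,
        // so adding a feature in one place keeps the two consistent.
        System.out.println(syntheticTerms("superuser.com"));
        System.out.println(applyFeatures);
    }
}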