mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(converter) Refactor sideloaders to improve feature handling and keyword logic
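Centralized HTML feature handling in StackexchangeSideloader with a shared `applyFeatures` set, and built its synthetic terms dynamically so that each applied feature's keyword is included. Improved the HTML structure in RedditSideloader and enhanced its metadata processing with feature-based keywords. Updated `DomainLinks.countForUrl` to compute link counts correctly by summing each link's individual occurrence count (`link.count()`) instead of returning the number of link entries.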
This commit is contained in:
parent
73861e613f
commit
5002870d1f
@ -41,7 +41,13 @@ public class DomainLinks {
|
||||
/** Returns the number of links to the given url. */
|
||||
public int countForUrl(EdgeUrl url) {
|
||||
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
|
||||
return links.getOrDefault(key, List.of()).size();
|
||||
|
||||
int cnt = 0;
|
||||
for (var link : links.getOrDefault(key, List.of())) {
|
||||
cnt += link.count();
|
||||
}
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit;
|
||||
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.util.ProcessingIterator;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
|
||||
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
|
||||
|
||||
private final List<Path> dbFiles;
|
||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
||||
private final AnchorTextKeywords anchorTextKeywords;
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
|
||||
public RedditSideloader(List<Path> listToDbFiles,
|
||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
||||
AnchorTextKeywords anchorTextKeywords,
|
||||
SideloaderProcessing sideloaderProcessing) {
|
||||
this.dbFiles = listToDbFiles;
|
||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
||||
this.anchorTextKeywords = anchorTextKeywords;
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
}
|
||||
@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
|
||||
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
|
||||
.getYear();
|
||||
|
||||
String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n";
|
||||
String fullHtml = """
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>%s</title>
|
||||
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
|
||||
</head>
|
||||
<body>
|
||||
<h1>%s</h1>
|
||||
<h2>reddit r/%s %s</h2>
|
||||
<article>
|
||||
<p>%s</p>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
""".formatted(title, title, subreddit, subreddit, body);
|
||||
|
||||
List<String> extraKeywords = new ArrayList<>();
|
||||
|
||||
extraKeywords.add("reddit");
|
||||
extraKeywords.add(subreddit);
|
||||
extraKeywords.add("r/" + subreddit);
|
||||
|
||||
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
|
||||
extraKeywords.add(author);
|
||||
}
|
||||
@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {
|
||||
|
||||
|
||||
if (doc.isProcessedFully()) {
|
||||
for (var keyword : extraKeywords) {
|
||||
doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
|
||||
// Insert topology information
|
||||
if (doc.details != null) {
|
||||
doc.details.metadata.withSizeAndTopology(50_000_000, score);
|
||||
}
|
||||
|
||||
// Insert topology information
|
||||
doc.details.metadata.withSizeAndTopology(50_000_000, score);
|
||||
if (doc.words != null) {
|
||||
doc.words.addAllSyntheticTerms(List.of("generator:forum",
|
||||
HtmlFeature.COOKIES.getKeyword(),
|
||||
HtmlFeature.JS.getKeyword(),
|
||||
HtmlFeature.TRACKING_ADTECH.getKeyword()
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.ArrayBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
private final DocumentKeywordExtractor keywordExtractor;
|
||||
private final String domainName;
|
||||
|
||||
private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
|
||||
|
||||
private final Path dbFile;
|
||||
|
||||
public StackexchangeSideloader(Path pathToDbFile,
|
||||
@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
|
||||
ret.url = url;
|
||||
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
|
||||
ret.words.addAllSyntheticTerms(List.of(
|
||||
"site:" + domainName,
|
||||
"site:" + url.domain.topDomain,
|
||||
url.domain.topDomain,
|
||||
domainName
|
||||
));
|
||||
|
||||
List<String> syntheticTerms = new ArrayList<>(
|
||||
List.of("site:" + domainName,
|
||||
"site:" + url.domain.topDomain,
|
||||
url.domain.topDomain,
|
||||
domainName)
|
||||
);
|
||||
for (HtmlFeature feature : applyFeatures) {
|
||||
syntheticTerms.add(feature.getKeyword());
|
||||
}
|
||||
ret.words.addAllSyntheticTerms(syntheticTerms);
|
||||
|
||||
if (!post.tags().isBlank()) {
|
||||
List<String> subjects = Arrays.asList(post.tags().split(","));
|
||||
@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
|
||||
PubDate.toYearByte(ret.details.pubYear),
|
||||
(int) -ret.details.quality,
|
||||
EnumSet.of(DocumentFlags.GeneratorDocs));
|
||||
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
|
||||
ret.details.features = applyFeatures;
|
||||
|
||||
ret.details.metadata.withSizeAndTopology(10000, 0);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user