mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Refactor sideloaders to improve feature handling and keyword logic
Centralized HTML feature handling with `applyFeatures` in StackexchangeSideloader and added dynamic synthetic term generation. Improved HTML structure in RedditSideloader and enhanced metadata processing with feature-based keywords. Updated DomainLinks.countForUrl to compute link counts by summing each link's own `count()` instead of returning the number of link entries in the list.
This commit is contained in:
parent
73861e613f
commit
5002870d1f
@ -41,7 +41,13 @@ public class DomainLinks {
|
|||||||
/** Returns the number of links to the given url. */
|
/** Returns the number of links to the given url. */
|
||||||
public int countForUrl(EdgeUrl url) {
|
public int countForUrl(EdgeUrl url) {
|
||||||
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
|
String key = url.domain.toString() + url.path + (url.param == null ? "" : "?" + url.param);
|
||||||
return links.getOrDefault(key, List.of()).size();
|
|
||||||
|
int cnt = 0;
|
||||||
|
for (var link : links.getOrDefault(key, List.of())) {
|
||||||
|
cnt += link.count();
|
||||||
|
}
|
||||||
|
|
||||||
|
return cnt;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -2,7 +2,6 @@ package nu.marginalia.converting.sideload.reddit;
|
|||||||
|
|
||||||
import nu.marginalia.atags.AnchorTextKeywords;
|
import nu.marginalia.atags.AnchorTextKeywords;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
|
||||||
import nu.marginalia.converting.model.GeneratorType;
|
import nu.marginalia.converting.model.GeneratorType;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
@ -13,7 +12,7 @@ import nu.marginalia.integration.reddit.db.RedditDb;
|
|||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.util.ProcessingIterator;
|
import nu.marginalia.util.ProcessingIterator;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
@ -30,16 +29,13 @@ public class RedditSideloader implements SideloadSource {
|
|||||||
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
|
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(RedditSideloader.class);
|
||||||
|
|
||||||
private final List<Path> dbFiles;
|
private final List<Path> dbFiles;
|
||||||
private final AnchorTagsSourceFactory anchorTagsSourceFactory;
|
|
||||||
private final AnchorTextKeywords anchorTextKeywords;
|
private final AnchorTextKeywords anchorTextKeywords;
|
||||||
private final SideloaderProcessing sideloaderProcessing;
|
private final SideloaderProcessing sideloaderProcessing;
|
||||||
|
|
||||||
public RedditSideloader(List<Path> listToDbFiles,
|
public RedditSideloader(List<Path> listToDbFiles,
|
||||||
AnchorTagsSourceFactory anchorTagsSourceFactory,
|
|
||||||
AnchorTextKeywords anchorTextKeywords,
|
AnchorTextKeywords anchorTextKeywords,
|
||||||
SideloaderProcessing sideloaderProcessing) {
|
SideloaderProcessing sideloaderProcessing) {
|
||||||
this.dbFiles = listToDbFiles;
|
this.dbFiles = listToDbFiles;
|
||||||
this.anchorTagsSourceFactory = anchorTagsSourceFactory;
|
|
||||||
this.anchorTextKeywords = anchorTextKeywords;
|
this.anchorTextKeywords = anchorTextKeywords;
|
||||||
this.sideloaderProcessing = sideloaderProcessing;
|
this.sideloaderProcessing = sideloaderProcessing;
|
||||||
}
|
}
|
||||||
@ -116,14 +112,25 @@ public class RedditSideloader implements SideloadSource {
|
|||||||
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
|
.ofInstant(Instant.ofEpochSecond(createdUtc), ZoneOffset.UTC)
|
||||||
.getYear();
|
.getYear();
|
||||||
|
|
||||||
String fullHtml = "<!DOCTYPE html>\n<html>\n<head>\n <title>" + title + "</title>\n <script src=\"https://www.example.com/dummy.js\" type=\"text/javascript\"></script>\n</head>\n<body>\n <h1>" + title + "</h1>\n <article>\n <p>" + body + "</p>\n </article>\n</body>\n</html>\n";
|
String fullHtml = """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>%s</title>
|
||||||
|
<script src="https://www.example.com/dummy.js" type="text/javascript"></script>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>%s</h1>
|
||||||
|
<h2>reddit r/%s %s</h2>
|
||||||
|
<article>
|
||||||
|
<p>%s</p>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
""".formatted(title, title, subreddit, subreddit, body);
|
||||||
|
|
||||||
List<String> extraKeywords = new ArrayList<>();
|
List<String> extraKeywords = new ArrayList<>();
|
||||||
|
|
||||||
extraKeywords.add("reddit");
|
|
||||||
extraKeywords.add(subreddit);
|
|
||||||
extraKeywords.add("r/" + subreddit);
|
|
||||||
|
|
||||||
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
|
if (!StringUtils.isBlank(author) && !author.equals("[deleted]")) {
|
||||||
extraKeywords.add(author);
|
extraKeywords.add(author);
|
||||||
}
|
}
|
||||||
@ -147,12 +154,18 @@ public class RedditSideloader implements SideloadSource {
|
|||||||
|
|
||||||
|
|
||||||
if (doc.isProcessedFully()) {
|
if (doc.isProcessedFully()) {
|
||||||
for (var keyword : extraKeywords) {
|
// Insert topology information
|
||||||
doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
|
if (doc.details != null) {
|
||||||
|
doc.details.metadata.withSizeAndTopology(50_000_000, score);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Insert topology information
|
if (doc.words != null) {
|
||||||
doc.details.metadata.withSizeAndTopology(50_000_000, score);
|
doc.words.addAllSyntheticTerms(List.of("generator:forum",
|
||||||
|
HtmlFeature.COOKIES.getKeyword(),
|
||||||
|
HtmlFeature.JS.getKeyword(),
|
||||||
|
HtmlFeature.TRACKING_ADTECH.getKeyword()
|
||||||
|
));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -24,10 +24,7 @@ import org.apache.commons.lang3.StringUtils;
|
|||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Arrays;
|
import java.util.*;
|
||||||
import java.util.EnumSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.ArrayBlockingQueue;
|
import java.util.concurrent.ArrayBlockingQueue;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
@ -36,6 +33,8 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
private final DocumentKeywordExtractor keywordExtractor;
|
private final DocumentKeywordExtractor keywordExtractor;
|
||||||
private final String domainName;
|
private final String domainName;
|
||||||
|
|
||||||
|
private final EnumSet<HtmlFeature> applyFeatures = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
|
||||||
|
|
||||||
private final Path dbFile;
|
private final Path dbFile;
|
||||||
|
|
||||||
public StackexchangeSideloader(Path pathToDbFile,
|
public StackexchangeSideloader(Path pathToDbFile,
|
||||||
@ -133,12 +132,17 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
|
|
||||||
ret.url = url;
|
ret.url = url;
|
||||||
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
|
ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url);
|
||||||
ret.words.addAllSyntheticTerms(List.of(
|
|
||||||
"site:" + domainName,
|
List<String> syntheticTerms = new ArrayList<>(
|
||||||
"site:" + url.domain.topDomain,
|
List.of("site:" + domainName,
|
||||||
url.domain.topDomain,
|
"site:" + url.domain.topDomain,
|
||||||
domainName
|
url.domain.topDomain,
|
||||||
));
|
domainName)
|
||||||
|
);
|
||||||
|
for (HtmlFeature feature : applyFeatures) {
|
||||||
|
syntheticTerms.add(feature.getKeyword());
|
||||||
|
}
|
||||||
|
ret.words.addAllSyntheticTerms(syntheticTerms);
|
||||||
|
|
||||||
if (!post.tags().isBlank()) {
|
if (!post.tags().isBlank()) {
|
||||||
List<String> subjects = Arrays.asList(post.tags().split(","));
|
List<String> subjects = Arrays.asList(post.tags().split(","));
|
||||||
@ -152,7 +156,7 @@ public class StackexchangeSideloader implements SideloadSource {
|
|||||||
PubDate.toYearByte(ret.details.pubYear),
|
PubDate.toYearByte(ret.details.pubYear),
|
||||||
(int) -ret.details.quality,
|
(int) -ret.details.quality,
|
||||||
EnumSet.of(DocumentFlags.GeneratorDocs));
|
EnumSet.of(DocumentFlags.GeneratorDocs));
|
||||||
ret.details.features = EnumSet.of(HtmlFeature.JS, HtmlFeature.TRACKING);
|
ret.details.features = applyFeatures;
|
||||||
|
|
||||||
ret.details.metadata.withSizeAndTopology(10000, 0);
|
ret.details.metadata.withSizeAndTopology(10000, 0);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user