(converter) Experimental support for searching by URL

Add up to 128 synthetic keywords per document, corresponding to links to other websites.
This commit is contained in:
Viktor Lofgren 2024-05-23 17:10:57 +02:00
parent 89aae93e60
commit f83f777fff
2 changed files with 14 additions and 2 deletions

View File

@ -32,6 +32,10 @@ public class LinkProcessor {
ret.feedLinks = new ArrayList<>();
}
/**
 * Exposes the URLs collected in this processor's {@code seenUrls} field.
 *
 * @return the {@code seenUrls} set itself; no copy is made
 */
public Set<EdgeUrl> getSeenUrls() {
    return this.seenUrls;
}
/**
 * Exposes the domains collected in this processor's {@code foreignDomains} field.
 *
 * @return the {@code foreignDomains} set itself; no copy is made
 */
public Set<EdgeDomain> getForeignDomains() {
    return this.foreignDomains;
}

View File

@ -295,10 +295,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
words.addAllSyntheticTerms(createLinkKeywords(lp));
words.addAllSyntheticTerms(createLinkKeywords(lp, domain));
}
private Set<String> createLinkKeywords(LinkProcessor lp) {
private Set<String> createLinkKeywords(LinkProcessor lp, EdgeDomain domain) {
final Set<String> linkTerms = new HashSet<>();
for (var fd : lp.getForeignDomains()) {
@ -306,6 +306,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
}
// Add unprefixed keyword terms for external links; stops once linkTerms (which also holds the links: terms above) has more than 128 entries
for (EdgeUrl link : lp.getSeenUrls()) {
if (linkTerms.size() > 128) break;
if (domain.hasSameTopDomain(link.domain)) continue;
linkTerms.add(link.toString());
}
return linkTerms;
}