mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(converter) Experimental support for searching by URL
Add up to 128 synthetic keywords per document, corresponding to links to other websites.
This commit is contained in:
parent
89aae93e60
commit
f83f777fff
@ -32,6 +32,10 @@ public class LinkProcessor {
|
||||
ret.feedLinks = new ArrayList<>();
|
||||
}
|
||||
|
||||
/**
 * Returns the {@code seenUrls} set tracked by this link processor.
 *
 * <p>The set is returned directly rather than copied, so changes made by the
 * caller are visible to this processor and vice versa.
 * NOTE(review): presumably these are the URLs encountered while processing a
 * document's links; confirm against where {@code seenUrls} is populated.
 *
 * @return the {@code seenUrls} set
 */
public Set<EdgeUrl> getSeenUrls() {
|
||||
return seenUrls;
|
||||
}
|
||||
|
||||
/**
 * Returns the {@code foreignDomains} set tracked by this link processor.
 *
 * <p>The set is returned directly rather than copied, so changes made by the
 * caller are visible to this processor and vice versa.
 * NOTE(review): presumably these are domains other than the document's own that
 * it links to; confirm against where {@code foreignDomains} is populated.
 *
 * @return the {@code foreignDomains} set
 */
public Set<EdgeDomain> getForeignDomains() {
|
||||
return foreignDomains;
|
||||
}
|
||||
|
@ -295,10 +295,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
|
||||
words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
|
||||
words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
|
||||
words.addAllSyntheticTerms(createLinkKeywords(lp));
|
||||
words.addAllSyntheticTerms(createLinkKeywords(lp, domain));
|
||||
}
|
||||
|
||||
private Set<String> createLinkKeywords(LinkProcessor lp) {
|
||||
private Set<String> createLinkKeywords(LinkProcessor lp, EdgeDomain domain) {
|
||||
final Set<String> linkTerms = new HashSet<>();
|
||||
|
||||
for (var fd : lp.getForeignDomains()) {
|
||||
@ -306,6 +306,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
||||
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
|
||||
}
|
||||
|
||||
// Add keyword terms for the first 128 external links, with no prefix
|
||||
for (EdgeUrl link : lp.getSeenUrls()) {
|
||||
if (linkTerms.size() > 128) break;
|
||||
if (domain.hasSameTopDomain(link.domain)) continue;
|
||||
|
||||
linkTerms.add(link.toString());
|
||||
}
|
||||
|
||||
return linkTerms;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user