mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(converter) Experimental support for searching by URL
Add up to 128 synthetic keywords per document, corresponding to links to other websites.
This commit is contained in:
parent
89aae93e60
commit
f83f777fff
@ -32,6 +32,10 @@ public class LinkProcessor {
|
|||||||
ret.feedLinks = new ArrayList<>();
|
ret.feedLinks = new ArrayList<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Set<EdgeUrl> getSeenUrls() {
|
||||||
|
return seenUrls;
|
||||||
|
}
|
||||||
|
|
||||||
public Set<EdgeDomain> getForeignDomains() {
|
public Set<EdgeDomain> getForeignDomains() {
|
||||||
return foreignDomains;
|
return foreignDomains;
|
||||||
}
|
}
|
||||||
|
@ -295,10 +295,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
|
|
||||||
words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
|
words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
|
||||||
words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
|
words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
|
||||||
words.addAllSyntheticTerms(createLinkKeywords(lp));
|
words.addAllSyntheticTerms(createLinkKeywords(lp, domain));
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<String> createLinkKeywords(LinkProcessor lp) {
|
private Set<String> createLinkKeywords(LinkProcessor lp, EdgeDomain domain) {
|
||||||
final Set<String> linkTerms = new HashSet<>();
|
final Set<String> linkTerms = new HashSet<>();
|
||||||
|
|
||||||
for (var fd : lp.getForeignDomains()) {
|
for (var fd : lp.getForeignDomains()) {
|
||||||
@ -306,6 +306,14 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
|
|||||||
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
|
linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Add unprefixed keyword terms for external links; stops once linkTerms (including the "links:" terms above) holds more than 128 entries
|
||||||
|
for (EdgeUrl link : lp.getSeenUrls()) {
|
||||||
|
if (linkTerms.size() > 128) break;
|
||||||
|
if (domain.hasSameTopDomain(link.domain)) continue;
|
||||||
|
|
||||||
|
linkTerms.add(link.toString());
|
||||||
|
}
|
||||||
|
|
||||||
return linkTerms;
|
return linkTerms;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user