diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java
index 7623dccf..66213de6 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java
@@ -32,6 +32,10 @@ public class LinkProcessor {
         ret.feedLinks = new ArrayList<>();
     }
 
+    public Set<EdgeUrl> getSeenUrls() {
+        return seenUrls;
+    }
+
     public Set<EdgeDomain> getForeignDomains() {
         return foreignDomains;
     }
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index 43c2952a..5514fee9 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -295,10 +295,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
 
         words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain));
         words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc));
-        words.addAllSyntheticTerms(createLinkKeywords(lp));
+        words.addAllSyntheticTerms(createLinkKeywords(lp, domain));
     }
 
-    private Set<String> createLinkKeywords(LinkProcessor lp) {
+    private Set<String> createLinkKeywords(LinkProcessor lp, EdgeDomain domain) {
         final Set<String> linkTerms = new HashSet<>();
 
         for (var fd : lp.getForeignDomains()) {
@@ -306,6 +306,16 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             linkTerms.add("links:"+fd.getTopDomain().toLowerCase());
         }
 
+        // Add un-prefixed keyword terms for at most 128 external links, taken in the iteration
+        // order of getSeenUrls(). The cap is relative to the "links:" terms already collected.
+        final int maxTerms = linkTerms.size() + 128;
+        for (EdgeUrl link : lp.getSeenUrls()) {
+            if (linkTerms.size() >= maxTerms) break;
+            if (domain.hasSameTopDomain(link.domain)) continue;
+
+            linkTerms.add(link.toString());
+        }
+
         return linkTerms;
     }
 