From c5d657ef9800db68ade4819a8e04f0ea3bc1073e Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Tue, 10 Dec 2024 13:42:10 +0100
Subject: [PATCH] (live-crawler) Flag live crawled documents with a special keyword

---
Reviewer notes (this section is not recorded in the commit):

* DocumentDecorator gains addTerms(Collection<String>), a bulk counterpart to
  addTerm(String) that forwards to extraSearchTerms.addAll().
* DomainProcessor gains a three-argument sideloadProcessing() overload and a
  matching SideloadProcessing constructor.  The existing two-argument
  constructor now delegates with List.of(), so existing callers add no extra
  keywords.  The keywords are handed to the DocumentDecorator before
  processDomain() is called.
* LiveCrawlerMain now passes Set.of("special:live") for every data stream it
  sideloads.
* Like the existing overload, the new sideloadProcessing() logs a warning and
  returns null when construction throws.  NOTE(review): LiveCrawlerMain passes
  that result straight to writer.write() -- confirm a null argument is
  handled there.
* NOTE(review): DomainProcessor.java uses Collection and List but this patch
  adds no import to that file -- presumably both are already in scope; confirm.
* The Collection parameters are declared as Collection<String> rather than as
  raw types, so the addAll() call and the callers stay type-checked.

 .../converting/processor/DocumentDecorator.java |  4 ++++
 .../converting/processor/DomainProcessor.java   | 16 +++++++++++++++-
 .../marginalia/livecrawler/LiveCrawlerMain.java |  3 ++-
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java
index 2a4fbcb1..2eb073b9 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor;
 
 import nu.marginalia.converting.model.ProcessedDocument;
 
+import java.util.Collection;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -14,6 +15,9 @@ public class DocumentDecorator {
     public void addTerm(String term) {
         extraSearchTerms.add(term);
     }
+    public void addTerms(Collection<String> terms) {
+        extraSearchTerms.addAll(terms);
+    }
 
     public void apply(ProcessedDocument doc) {
         if (doc == null)
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
index c0999c96..d31195f8 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -66,6 +66,16 @@ public class DomainProcessor {
         return fullProcessing(domain);
     }
 
+    public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
+        try {
+            return new SideloadProcessing(dataStream, sizeHint, extraKeywords);
+        }
+        catch (Exception ex) {
+            logger.warn("Failed to process domain sideload", ex);
+            return null;
+        }
+    }
+
     public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
         try {
             return new SideloadProcessing(dataStream, sizeHint);
@@ -74,7 +84,6 @@ public class DomainProcessor {
             logger.warn("Failed to process domain sideload", ex);
             return null;
         }
-
     }
 
     public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
@@ -89,6 +98,10 @@ public class DomainProcessor {
         );
 
         SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
+            this(dataStream, sizeHint, List.of());
+        }
+
+        SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
             this.dataStream = dataStream;
 
             if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
@@ -100,6 +113,7 @@ public class DomainProcessor {
             domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
 
             documentDecorator = new DocumentDecorator();
+            documentDecorator.addTerms(extraKeywords);
 
             processDomain(crawledDomain, domain, documentDecorator);
 
diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
index d05925bb..f8af9267 100644
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
@@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;
 
@@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
                 writer.setOrdinalOffset(67_000_000);
 
                 for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
-                    writer.write(domainProcessor.sideloadProcessing(stream, 0));
+                    writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));
                 }
             }
 