(live-crawler) Flag live crawled documents with a special keyword

This commit is contained in:
Viktor Lofgren 2024-12-10 13:42:10 +01:00
parent 3c2bb566da
commit c5d657ef98
3 changed files with 21 additions and 2 deletions

View File

@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor;
import nu.marginalia.converting.model.ProcessedDocument;
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
@@ -14,6 +15,9 @@ public class DocumentDecorator {
/** Registers a single extra search term to be attached to processed documents via {@code apply}. */
public void addTerm(String term) {
extraSearchTerms.add(term);
}
/**
 * Registers several extra search terms at once; equivalent to calling
 * {@code addTerm} for each element of the collection.
 */
public void addTerms(Collection<String> terms) {
    terms.forEach(extraSearchTerms::add);
}
public void apply(ProcessedDocument doc) {
if (doc == null)

View File

@@ -66,6 +66,16 @@ public class DomainProcessor {
return fullProcessing(domain);
}
/**
 * Sets up sideload processing for a crawl data stream, decorating every
 * document in the domain with the given extra keywords.
 *
 * @param dataStream    the crawl data to process
 * @param sizeHint      advisory size of the domain; semantics defined by SideloadProcessing
 * @param extraKeywords keywords added to each processed document
 * @return the processing handle, or {@code null} if construction failed
 */
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
try {
return new SideloadProcessing(dataStream, sizeHint, extraKeywords);
}
catch (Exception ex) {
// NOTE(review): broad catch swallows the failure and returns null;
// callers must tolerate a null result — verify call sites do.
logger.warn("Failed to process domain sideload", ex);
return null;
}
}
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
try {
return new SideloadProcessing(dataStream, sizeHint);
@@ -74,7 +84,6 @@ public class DomainProcessor {
logger.warn("Failed to process domain sideload", ex);
return null;
}
}
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
@@ -89,6 +98,10 @@ public class DomainProcessor {
);
/** Convenience constructor: sideload processing with no extra keywords. */
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
this(dataStream, sizeHint, List.of());
}
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
this.dataStream = dataStream;
if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
@@ -100,6 +113,7 @@ public class DomainProcessor {
domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
documentDecorator = new DocumentDecorator();
documentDecorator.addTerms(extraKeywords);
processDomain(crawledDomain, domain, documentDecorator);

View File

@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;
@@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
writer.setOrdinalOffset(67_000_000);
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
writer.write(domainProcessor.sideloadProcessing(stream, 0));
writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));
}
}