Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)
(live-crawler) Flag live crawled documents with a special keyword
parent 3c2bb566da
commit c5d657ef98
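In effect, the change lets a caller pass extra search keywords into DomainProcessor.sideloadProcessing; they are forwarded to DocumentDecorator.addTerms and presumably attached, via DocumentDecorator.apply, to every document processed from that sideloaded stream. The live crawler uses this to flag its documents with "special:live". A minimal sketch of the new call path, using only names that appear in this diff (the surrounding stream, writer, and heartbeat wiring is assumed from LiveCrawlerMain):

    // Caller side (LiveCrawlerMain): tag all documents from a live-crawled stream.
    writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));

    // Inside DomainProcessor.SideloadProcessing: the keywords reach the decorator,
    // which later applies them to each processed document alongside its other terms.
    documentDecorator = new DocumentDecorator();
    documentDecorator.addTerms(extraKeywords);   // e.g. ["special:live"]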
@@ -2,6 +2,7 @@ package nu.marginalia.converting.processor;
 
 import nu.marginalia.converting.model.ProcessedDocument;
 
+import java.util.Collection;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -14,6 +15,9 @@ public class DocumentDecorator {
     public void addTerm(String term) {
         extraSearchTerms.add(term);
     }
+    public void addTerms(Collection<String> terms) {
+        extraSearchTerms.addAll(terms);
+    }
 
     public void apply(ProcessedDocument doc) {
         if (doc == null)
@@ -66,6 +66,16 @@ public class DomainProcessor {
         return fullProcessing(domain);
     }
 
+    public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
+        try {
+            return new SideloadProcessing(dataStream, sizeHint, extraKeywords);
+        }
+        catch (Exception ex) {
+            logger.warn("Failed to process domain sideload", ex);
+            return null;
+        }
+    }
+
     public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
         try {
             return new SideloadProcessing(dataStream, sizeHint);
@@ -74,7 +84,6 @@ public class DomainProcessor {
             logger.warn("Failed to process domain sideload", ex);
             return null;
         }
-
     }
 
     public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
@@ -89,6 +98,10 @@ public class DomainProcessor {
         );
 
         SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
+            this(dataStream, sizeHint, List.of());
+        }
+
+        SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
             this.dataStream = dataStream;
 
             if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
@@ -100,6 +113,7 @@ public class DomainProcessor {
             domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
 
             documentDecorator = new DocumentDecorator();
+            documentDecorator.addTerms(extraKeywords);
 
             processDomain(crawledDomain, domain, documentDecorator);
 
@@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;
 
@@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
                 writer.setOrdinalOffset(67_000_000);
 
                 for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
-                    writer.write(domainProcessor.sideloadProcessing(stream, 0));
+                    writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));
                 }
             }
 