mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(live-crawler) Flag live crawled documents with a special keyword
This commit is contained in:
parent
3c2bb566da
commit
c5d657ef98
@ -2,6 +2,7 @@ package nu.marginalia.converting.processor;
|
|||||||
|
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
@ -14,6 +15,9 @@ public class DocumentDecorator {
|
|||||||
/**
 * Adds a single term to {@code extraSearchTerms}.
 *
 * @param term the term to add
 */
public void addTerm(String term) {
|
public void addTerm(String term) {
|
||||||
extraSearchTerms.add(term);
|
extraSearchTerms.add(term);
|
||||||
}
|
}
|
||||||
|
/**
 * Adds every element of {@code terms} to {@code extraSearchTerms} in one call.
 *
 * @param terms the terms to add
 */
public void addTerms(Collection<String> terms) {
|
||||||
|
extraSearchTerms.addAll(terms);
|
||||||
|
}
|
||||||
|
|
||||||
public void apply(ProcessedDocument doc) {
|
public void apply(ProcessedDocument doc) {
|
||||||
if (doc == null)
|
if (doc == null)
|
||||||
|
@ -66,6 +66,16 @@ public class DomainProcessor {
|
|||||||
return fullProcessing(domain);
|
return fullProcessing(domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Creates a {@link SideloadProcessing} for the given crawl data stream, passing
 * the extra keywords through to its constructor.
 * <p>
 * Any {@link Exception} thrown while constructing it is logged at warn level
 * and {@code null} is returned instead of propagating the failure, so callers
 * need to be prepared for a {@code null} result.
 *
 * @param dataStream    the crawl data stream to process
 * @param sizeHint      size hint forwarded to the {@code SideloadProcessing} constructor
 * @param extraKeywords keywords forwarded to the {@code SideloadProcessing} constructor
 * @return the new {@code SideloadProcessing}, or {@code null} if construction failed
 */
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) {
|
||||||
|
try {
|
||||||
|
return new SideloadProcessing(dataStream, sizeHint, extraKeywords);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.warn("Failed to process domain sideload", ex);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
|
public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) {
|
||||||
try {
|
try {
|
||||||
return new SideloadProcessing(dataStream, sizeHint);
|
return new SideloadProcessing(dataStream, sizeHint);
|
||||||
@ -74,7 +84,6 @@ public class DomainProcessor {
|
|||||||
logger.warn("Failed to process domain sideload", ex);
|
logger.warn("Failed to process domain sideload", ex);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
|
public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource {
|
||||||
@ -89,6 +98,10 @@ public class DomainProcessor {
|
|||||||
);
|
);
|
||||||
|
|
||||||
/**
 * Convenience constructor: delegates to the three-argument constructor with
 * an empty list of extra keywords.
 *
 * @param dataStream the crawl data stream to process
 * @param sizeHint   size hint forwarded to the three-argument constructor
 * @throws IOException declared by this constructor; propagated from the delegated constructor
 */
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
|
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException {
|
||||||
|
this(dataStream, sizeHint, List.of());
|
||||||
|
}
|
||||||
|
|
||||||
|
SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException {
|
||||||
this.dataStream = dataStream;
|
this.dataStream = dataStream;
|
||||||
|
|
||||||
if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
|
if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain))
|
||||||
@ -100,6 +113,7 @@ public class DomainProcessor {
|
|||||||
domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
|
domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
|
||||||
|
|
||||||
documentDecorator = new DocumentDecorator();
|
documentDecorator = new DocumentDecorator();
|
||||||
|
documentDecorator.addTerms(extraKeywords);
|
||||||
|
|
||||||
processDomain(crawledDomain, domain, documentDecorator);
|
processDomain(crawledDomain, domain, documentDecorator);
|
||||||
|
|
||||||
|
@ -41,6 +41,7 @@ import java.time.temporal.ChronoUnit;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;
|
import static nu.marginalia.mqapi.ProcessInboxNames.LIVE_CRAWLER_INBOX;
|
||||||
|
|
||||||
@ -196,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass {
|
|||||||
writer.setOrdinalOffset(67_000_000);
|
writer.setOrdinalOffset(67_000_000);
|
||||||
|
|
||||||
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
|
for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) {
|
||||||
writer.write(domainProcessor.sideloadProcessing(stream, 0));
|
writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live")));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user