mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Optimize sideload-loading
Use ProcessingIterator to fan out processing of documents across more cores, instead of doing all of it in the writer thread blocking everything else with slow single-threaded processing.
This commit is contained in:
parent
b5fc9673d9
commit
e7dd28b926
@ -20,6 +20,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
|
|||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||||
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
|
||||||
|
import nu.marginalia.util.ProcessingIterator;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -110,26 +111,21 @@ public class DomainProcessor {
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Iterator<ProcessedDocument> getDocumentsStream() {
|
public Iterator<ProcessedDocument> getDocumentsStream() {
|
||||||
return new DocumentsIterator();
|
return new ProcessingIterator<>(24, 16, (taskConsumer) -> {
|
||||||
}
|
while (dataStream.hasNext())
|
||||||
|
{
|
||||||
|
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||||
|
continue;
|
||||||
|
if (doc.url == null || !processedUrls.add(doc.url))
|
||||||
|
continue;
|
||||||
|
|
||||||
class DocumentsIterator implements Iterator<ProcessedDocument> {
|
|
||||||
ProcessedDocument next = null;
|
|
||||||
@Override
|
|
||||||
public boolean hasNext() {
|
|
||||||
try {
|
|
||||||
while (next == null
|
|
||||||
&& dataStream.hasNext())
|
|
||||||
{
|
|
||||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
|
||||||
continue;
|
|
||||||
if (doc.url == null || !processedUrls.add(doc.url))
|
|
||||||
continue;
|
|
||||||
|
|
||||||
|
taskConsumer.accept(() -> {
|
||||||
var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator);
|
var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator);
|
||||||
|
|
||||||
deduplicator.markIfDuplicate(processedDoc);
|
synchronized (deduplicator) {
|
||||||
next = processedDoc;
|
deduplicator.markIfDuplicate(processedDoc);
|
||||||
|
}
|
||||||
|
|
||||||
if (processedDoc.isProcessedFully()) {
|
if (processedDoc.isProcessedFully()) {
|
||||||
// This is a bit sketchy, but we need to set the size and topology to something
|
// This is a bit sketchy, but we need to set the size and topology to something
|
||||||
@ -137,26 +133,10 @@ public class DomainProcessor {
|
|||||||
10_000, externalDomainLinks.countForUrl(processedDoc.url));
|
10_000, externalDomainLinks.countForUrl(processedDoc.url));
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return processedDoc;
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
catch (IOException ex) {
|
});
|
||||||
logger.warn("Failed to process domain sideload", ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public ProcessedDocument next() {
|
|
||||||
try {
|
|
||||||
if (next == null && !hasNext())
|
|
||||||
throw new NoSuchElementException();
|
|
||||||
return next;
|
|
||||||
} finally {
|
|
||||||
next = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
Loading…
Reference in New Issue
Block a user