(converter) Improve simple processing performance

The recent slop migration changes introduced a performance regression in the simple conversion track. This commit fixes that regression.
This commit is contained in:
Viktor Lofgren 2025-01-21 14:13:33 +01:00
parent 270cab874b
commit 088310e998

View File

@ -193,13 +193,16 @@ public class DomainProcessor {
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
return dataStream.map((next) -> {
if (!(next instanceof CrawledDocument doc))
return Optional.empty();
return iteratorFactory.create((taskConsumer) -> {
while (dataStream.hasNext())
{
if (!(dataStream.next() instanceof CrawledDocument doc))
continue;
if (doc.url == null || !processedUrls.add(doc.url))
return Optional.empty();
continue;
taskConsumer.accept(() -> {
var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator);
synchronized (deduplicator) {
@ -212,7 +215,9 @@ public class DomainProcessor {
10_000, externalDomainLinks.countForUrl(processedDoc.url));
}
return Optional.of(processedDoc);
return processedDoc;
});
}
});
}