(converter) Clean up fullProcessing()

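This function made some fragile, implicit assumptions about the order of the records in an iterable. These assumptions are still made, but they are now explicit: the first record must be the domain record (otherwise an IllegalStateException is thrown), and the remaining records are processed as documents.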
This commit is contained in:
Viktor Lofgren 2023-12-30 13:36:18 +01:00
parent 7ba296ccdf
commit 70c83b60a1

View File

@ -99,6 +99,7 @@ public class DomainProcessor {
domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint; domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;
documentDecorator = new DocumentDecorator(anchorTextKeywords); documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, domain, documentDecorator); processDomain(crawledDomain, domain, documentDecorator);
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
@ -164,43 +165,36 @@ public class DomainProcessor {
return null; return null;
} }
ProcessedDomain ret = new ProcessedDomain();
List<ProcessedDocument> docs = new ArrayList<>(); List<ProcessedDocument> docs = new ArrayList<>();
Set<String> processedUrls = new HashSet<>(); Set<String> processedUrls = new HashSet<>();
DomainLinks externalDomainLinks = null; if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
DocumentDecorator documentDecorator = null;
try (var deduplicator = new LshDocumentDeduplicator()){
while (dataStream.hasNext()) {
var data = dataStream.next();
// Do a lazy load of the external domain links since we don't know the domain
// until we see the first document
if (externalDomainLinks == null) {
var domain = data.getDomain();
if (domain != null) {
externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
}
} }
if (data instanceof CrawledDomain crawledDomain) { DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
documentDecorator = new DocumentDecorator(anchorTextKeywords); DocumentDecorator documentDecorator = new DocumentDecorator(anchorTextKeywords);
// Process Domain Record
ProcessedDomain ret = new ProcessedDomain();
processDomain(crawledDomain, ret, documentDecorator); processDomain(crawledDomain, ret, documentDecorator);
ret.documents = docs; ret.documents = docs;
} else if (data instanceof CrawledDocument doc) { // Process Documents
try {
if (doc.url == null || !processedUrls.add(doc.url)) try (var deduplicator = new LshDocumentDeduplicator()) {
while (dataStream.hasNext()) {
if (!(dataStream.next() instanceof CrawledDocument doc))
continue;
if (doc.url == null)
continue;
if (!processedUrls.add(doc.url))
continue; continue;
try {
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator); var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
deduplicator.markIfDuplicate(processedDoc); deduplicator.markIfDuplicate(processedDoc);
docs.add(processedDoc); docs.add(processedDoc);
} catch (Exception ex) { } catch (Exception ex) {
logger.warn("Failed to process " + doc.url, ex); logger.warn("Failed to process " + doc.url, ex);
@ -208,8 +202,6 @@ public class DomainProcessor {
} }
} }
}
// Add late keywords and features from domain-level information // Add late keywords and features from domain-level information
calculateStatistics(ret, externalDomainLinks); calculateStatistics(ret, externalDomainLinks);