Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
(converter) Clean up fullProcessing()
This function made some very flimsy-looking assumptions about the order of an iterable. These are still made, but more explicitly so.
parent 7ba296ccdf
commit 70c83b60a1
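The refactored fullProcessing() makes the stream's ordering contract explicit: the first record must be the CrawledDomain, and everything after it is treated as a CrawledDocument. Below is a minimal sketch of that contract; CrawlRecord, DomainRecord and DocumentRecord are simplified stand-ins for illustration, not the real crawl-data classes.

import java.util.Iterator;
import java.util.List;

// Simplified stand-ins for the real crawl-data record types (illustrative only)
sealed interface CrawlRecord permits DomainRecord, DocumentRecord {}
record DomainRecord(String domain) implements CrawlRecord {}
record DocumentRecord(String url) implements CrawlRecord {}

class OrderingContractSketch {
    // The stream is assumed to yield exactly one domain record first,
    // followed by zero or more document records; anything else fails fast.
    static void process(Iterator<CrawlRecord> stream) {
        if (!stream.hasNext())
            return;

        if (!(stream.next() instanceof DomainRecord domain))
            throw new IllegalStateException("First record must be a domain");

        System.out.println("domain: " + domain.domain());

        while (stream.hasNext()) {
            // Non-document records are skipped, mirroring the continue-based
            // filtering in the refactored loop
            if (stream.next() instanceof DocumentRecord doc)
                System.out.println("document: " + doc.url());
        }
    }

    public static void main(String[] args) {
        process(List.<CrawlRecord>of(
                new DomainRecord("www.example.com"),
                new DocumentRecord("https://www.example.com/a"),
                new DocumentRecord("https://www.example.com/b")).iterator());
    }
}

Failing fast on a malformed stream surfaces the ordering assumption at the point of violation, rather than leaving it implied by a lazy initialization that happens to run on the first record.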
@@ -99,6 +99,7 @@ public class DomainProcessor {
         domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint;

         documentDecorator = new DocumentDecorator(anchorTextKeywords);

         processDomain(crawledDomain, domain, documentDecorator);

         externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
@@ -164,50 +165,41 @@ public class DomainProcessor {
             return null;
         }

-        ProcessedDomain ret = new ProcessedDomain();
         List<ProcessedDocument> docs = new ArrayList<>();
         Set<String> processedUrls = new HashSet<>();

-        DomainLinks externalDomainLinks = null;
-
-        DocumentDecorator documentDecorator = null;
-
-        try (var deduplicator = new LshDocumentDeduplicator()){
-            while (dataStream.hasNext()) {
-                var data = dataStream.next();
-
-                // Do a lazy load of the external domain links since we don't know the domain
-                // until we see the first document
-                if (externalDomainLinks == null) {
-                    var domain = data.getDomain();
-
-                    if (domain != null) {
-                        externalDomainLinks = anchorTagsSource.getAnchorTags(domain);
-                    }
-                }
-
-                if (data instanceof CrawledDomain crawledDomain) {
-                    documentDecorator = new DocumentDecorator(anchorTextKeywords);
-
-                    processDomain(crawledDomain, ret, documentDecorator);
-                    ret.documents = docs;
-
-                } else if (data instanceof CrawledDocument doc) {
-                    try {
-                        if (doc.url == null || !processedUrls.add(doc.url))
-                            continue;
-
-                        var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
-                        deduplicator.markIfDuplicate(processedDoc);
-                        docs.add(processedDoc);
-                    } catch (Exception ex) {
-                        logger.warn("Failed to process " + doc.url, ex);
-                    }
-                }
+        if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
+            throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
+        }
+
+        DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
+        DocumentDecorator documentDecorator = new DocumentDecorator(anchorTextKeywords);
+
+        // Process Domain Record
+
+        ProcessedDomain ret = new ProcessedDomain();
+        processDomain(crawledDomain, ret, documentDecorator);
+        ret.documents = docs;
+
+        // Process Documents
+
+        try (var deduplicator = new LshDocumentDeduplicator()) {
+            while (dataStream.hasNext()) {
+                if (!(dataStream.next() instanceof CrawledDocument doc))
+                    continue;
+                if (doc.url == null)
+                    continue;
+                if (!processedUrls.add(doc.url))
+                    continue;
+
+                try {
+                    var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
+
+                    deduplicator.markIfDuplicate(processedDoc);
+
+                    docs.add(processedDoc);
+                } catch (Exception ex) {
+                    logger.warn("Failed to process " + doc.url, ex);
+                }
             }
         }

         // Add late keywords and features from domain-level information