(converter) Fix NPE in converter

This commit is contained in:
Viktor Lofgren 2023-12-28 19:52:26 +01:00
parent bcecc93e39
commit c488599879
3 changed files with 12 additions and 11 deletions

View File

@ -10,18 +10,16 @@ import java.util.Set;
public class DocumentDecorator {
private final Set<String> extraSearchTerms = new HashSet<>();
private final AnchorTextKeywords keywords;
private final DomainLinks externalDomainLinks;
public DocumentDecorator(AnchorTextKeywords keywords, DomainLinks externalDomainLinks) {
public DocumentDecorator(AnchorTextKeywords keywords) {
this.keywords = keywords;
this.externalDomainLinks = externalDomainLinks;
}
/**
 * Registers an additional search term with this decorator.
 *
 * <p>The term is stored in the decorator's set of extra search terms, so
 * adding the same term more than once has no further effect.
 *
 * @param term the search term to record
 */
public void addTerm(String term) {
    this.extraSearchTerms.add(term);
}
public void apply(ProcessedDocument doc) {
public void apply(ProcessedDocument doc, DomainLinks externalDomainLinks) {
if (doc == null)
return;
if (doc.words == null)

View File

@ -39,7 +39,9 @@ public class DocumentProcessor {
processorPlugins.add(plainTextDocumentProcessorPlugin);
}
public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks, DocumentDecorator documentDecorator) {
public ProcessedDocument process(CrawledDocument crawledDocument,
DomainLinks externalDomainLinks,
DocumentDecorator documentDecorator) {
ProcessedDocument ret = new ProcessedDocument();
try {
@ -52,7 +54,7 @@ public class DocumentProcessor {
default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
};
processDocument(crawledDocument, documentClass, documentDecorator, ret);
processDocument(crawledDocument, documentClass, documentDecorator, externalDomainLinks, ret);
}
catch (DisqualifiedException ex) {
ret.state = UrlIndexingState.DISQUALIFIED;
@ -68,7 +70,7 @@ public class DocumentProcessor {
return ret;
}
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, DomainLinks externalDomainLinks, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
if (crawlerStatus != CrawlerDocumentStatus.OK) {
@ -92,7 +94,7 @@ public class DocumentProcessor {
ret.details = detailsWithWords.details();
ret.words = detailsWithWords.words();
documentDecorator.apply(ret);
documentDecorator.apply(ret, externalDomainLinks);
if (Boolean.TRUE.equals(crawledDocument.hasCookies)
&& ret.details != null

View File

@ -105,10 +105,11 @@ public class DomainProcessor {
domain = new ProcessedDomain();
domain.sizeloadSizeAdvice = 10_000;
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks);
documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, domain, documentDecorator);
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
}
@Override
@ -215,7 +216,7 @@ public class DomainProcessor {
}
if (data instanceof CrawledDomain crawledDomain) {
documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks);
documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, ret, documentDecorator);
ret.documents = docs;