(converter) Fix NPE in converter

This commit is contained in:
Viktor Lofgren 2023-12-28 19:52:26 +01:00
parent bcecc93e39
commit c488599879
3 changed files with 12 additions and 11 deletions

View File

@ -10,18 +10,16 @@ import java.util.Set;
public class DocumentDecorator { public class DocumentDecorator {
private final Set<String> extraSearchTerms = new HashSet<>(); private final Set<String> extraSearchTerms = new HashSet<>();
private final AnchorTextKeywords keywords; private final AnchorTextKeywords keywords;
private final DomainLinks externalDomainLinks;
public DocumentDecorator(AnchorTextKeywords keywords, DomainLinks externalDomainLinks) { public DocumentDecorator(AnchorTextKeywords keywords) {
this.keywords = keywords; this.keywords = keywords;
this.externalDomainLinks = externalDomainLinks;
} }
public void addTerm(String term) { public void addTerm(String term) {
extraSearchTerms.add(term); extraSearchTerms.add(term);
} }
public void apply(ProcessedDocument doc) { public void apply(ProcessedDocument doc, DomainLinks externalDomainLinks) {
if (doc == null) if (doc == null)
return; return;
if (doc.words == null) if (doc.words == null)

View File

@ -39,7 +39,9 @@ public class DocumentProcessor {
processorPlugins.add(plainTextDocumentProcessorPlugin); processorPlugins.add(plainTextDocumentProcessorPlugin);
} }
public ProcessedDocument process(CrawledDocument crawledDocument, DomainLinks externalDomainLinks, DocumentDecorator documentDecorator) { public ProcessedDocument process(CrawledDocument crawledDocument,
DomainLinks externalDomainLinks,
DocumentDecorator documentDecorator) {
ProcessedDocument ret = new ProcessedDocument(); ProcessedDocument ret = new ProcessedDocument();
try { try {
@ -52,7 +54,7 @@ public class DocumentProcessor {
default -> DocumentClass.EXTERNALLY_LINKED_MULTI; default -> DocumentClass.EXTERNALLY_LINKED_MULTI;
}; };
processDocument(crawledDocument, documentClass, documentDecorator, ret); processDocument(crawledDocument, documentClass, documentDecorator, externalDomainLinks, ret);
} }
catch (DisqualifiedException ex) { catch (DisqualifiedException ex) {
ret.state = UrlIndexingState.DISQUALIFIED; ret.state = UrlIndexingState.DISQUALIFIED;
@ -68,7 +70,7 @@ public class DocumentProcessor {
return ret; return ret;
} }
private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, DomainLinks externalDomainLinks, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
if (crawlerStatus != CrawlerDocumentStatus.OK) { if (crawlerStatus != CrawlerDocumentStatus.OK) {
@ -92,7 +94,7 @@ public class DocumentProcessor {
ret.details = detailsWithWords.details(); ret.details = detailsWithWords.details();
ret.words = detailsWithWords.words(); ret.words = detailsWithWords.words();
documentDecorator.apply(ret); documentDecorator.apply(ret, externalDomainLinks);
if (Boolean.TRUE.equals(crawledDocument.hasCookies) if (Boolean.TRUE.equals(crawledDocument.hasCookies)
&& ret.details != null && ret.details != null

View File

@ -105,10 +105,11 @@ public class DomainProcessor {
domain = new ProcessedDomain(); domain = new ProcessedDomain();
domain.sizeloadSizeAdvice = 10_000; domain.sizeloadSizeAdvice = 10_000;
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks);
documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, domain, documentDecorator); processDomain(crawledDomain, domain, documentDecorator);
externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain);
} }
@Override @Override
@ -215,7 +216,7 @@ public class DomainProcessor {
} }
if (data instanceof CrawledDomain crawledDomain) { if (data instanceof CrawledDomain crawledDomain) {
documentDecorator = new DocumentDecorator(anchorTextKeywords, externalDomainLinks); documentDecorator = new DocumentDecorator(anchorTextKeywords);
processDomain(crawledDomain, ret, documentDecorator); processDomain(crawledDomain, ret, documentDecorator);
ret.documents = docs; ret.documents = docs;