(keyword-extraction) Correct behavior when loading spans so that they are not loaded twice, which caused errors

This commit is contained in:
Viktor Lofgren 2024-08-06 11:16:56 +02:00
parent f01267bc6b
commit 680ad19c7d

View File

@ -118,10 +118,6 @@ public class DocumentKeywordExtractor {
} }
for (DocumentSentence sent : dld) { for (DocumentSentence sent : dld) {
if (wordsBuilder.size() > 1500)
break;
for (var word : sent) { for (var word : sent) {
pos++; pos++;
@ -156,8 +152,13 @@ public class DocumentKeywordExtractor {
for (var recorder : spanRecorders) { for (var recorder : spanRecorders) {
wordsBuilder.addSpans(recorder.finish(pos)); wordsBuilder.addSpans(recorder.finish(pos));
// reset the recorder, so we can use it again without adding the same positions twice
recorder.reset();
} }
// Next add synthetic positions to the document for anchor texts
pos += 2; // add some padding to the end of the document before we start adding a-tag words pos += 2; // add some padding to the end of the document before we start adding a-tag words
for (var linkText : linkTexts) { for (var linkText : linkTexts) {
@ -180,7 +181,6 @@ public class DocumentKeywordExtractor {
} }
// add some padding between separate link texts so we don't match across their boundaries // add some padding between separate link texts so we don't match across their boundaries
pos+=2; pos+=2;
} }
@ -247,7 +247,7 @@ public class DocumentKeywordExtractor {
else { else {
if (start > 0) { if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
start = -1; start = 0;
} }
} }
} }
@ -255,8 +255,14 @@ public class DocumentKeywordExtractor {
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) { public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
if (start > 0) { if (start > 0) {
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
start = 0;
} }
return spans; return spans;
} }
/**
 * Puts this recorder back into its idle state so it can be reused.
 *
 * Sets {@code start} to 0, which the {@code start > 0} checks in this class
 * treat as "no span currently open", and removes every span collected so far.
 *
 * NOTE(review): finish() returns the very same {@code spans} list that is
 * cleared here, so a caller holding on to that reference (rather than copying
 * its contents) would see it emptied -- confirm consumers copy before reset().
 */
public void reset() {
    start = 0;
    spans.clear();
}
} }
} }