mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(keyword-extraction) Correct behavior when loading spans so that they are not double-loaded causing errors
This commit is contained in:
parent
f01267bc6b
commit
680ad19c7d
@ -118,10 +118,6 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
for (DocumentSentence sent : dld) {
|
||||
|
||||
if (wordsBuilder.size() > 1500)
|
||||
break;
|
||||
|
||||
for (var word : sent) {
|
||||
pos++;
|
||||
|
||||
@ -156,8 +152,13 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
for (var recorder : spanRecorders) {
|
||||
wordsBuilder.addSpans(recorder.finish(pos));
|
||||
|
||||
// reset the recorder, so we can use it again without adding the same positions twice
|
||||
recorder.reset();
|
||||
}
|
||||
|
||||
// Next add synthetic positions to the document for anchor texts
|
||||
|
||||
pos += 2; // add some padding to the end of the document before we start adding a-tag words
|
||||
|
||||
for (var linkText : linkTexts) {
|
||||
@ -180,7 +181,6 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
// add some padding between separate link texts so we don't match across their boundaries
|
||||
|
||||
pos+=2;
|
||||
}
|
||||
|
||||
@ -247,7 +247,7 @@ public class DocumentKeywordExtractor {
|
||||
else {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos));
|
||||
start = -1;
|
||||
start = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -255,8 +255,14 @@ public class DocumentKeywordExtractor {
|
||||
public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) {
|
||||
if (start > 0) {
|
||||
spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length));
|
||||
start = 0;
|
||||
}
|
||||
return spans;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
spans.clear();
|
||||
start = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user