From 680ad19c7d51fdff31113bec2bec2f650a767f3a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 6 Aug 2024 11:16:56 +0200 Subject: [PATCH] (keyword-extraction) Correct behavior when loading spans so that they are not double-loaded causing errors --- .../keyword/DocumentKeywordExtractor.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index c6f87dd0..9559d246 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -118,10 +118,6 @@ public class DocumentKeywordExtractor { } for (DocumentSentence sent : dld) { - - if (wordsBuilder.size() > 1500) - break; - for (var word : sent) { pos++; @@ -156,8 +152,13 @@ public class DocumentKeywordExtractor { for (var recorder : spanRecorders) { wordsBuilder.addSpans(recorder.finish(pos)); + + // reset the recorder, so we can use it again without adding the same positions twice + recorder.reset(); } + // Next add synthetic positions to the document for anchor texts + pos += 2; // add some padding to the end of the document before we start adding a-tag words for (var linkText : linkTexts) { @@ -180,7 +181,6 @@ public class DocumentKeywordExtractor { } // add some padding between separate link texts so we don't match across their boundaries - pos+=2; } @@ -247,7 +247,7 @@ public class DocumentKeywordExtractor { else { if (start > 0) { spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); - start = -1; + start = 0; } } } @@ -255,8 +255,14 @@ public class DocumentKeywordExtractor { public List finish(int length) { if (start > 0) { spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); + start = 0; } return spans; } + + public void reset() { + spans.clear(); + start = 0; + } } }