mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(keyword-extraction) Update the upper limit on the number of positions per word
After real-world testing, it was determined that 256 was still a bit too low, but 512 seems like it will only truncate outlier cases like assembly code and certain tabulations.
This commit is contained in:
parent
a6e15cb338
commit
e8ab1e14e0
@ -25,7 +25,7 @@ public class DocumentKeywordsBuilder {
|
||||
// granted, some of these words are word n-grams, but 64 ought to
|
||||
// be plenty. The lexicon writer has another limit that's higher.
|
||||
private final int MAX_WORD_LENGTH = 64;
|
||||
private final int MAX_POSITIONS_PER_WORD = 256;
|
||||
private final int MAX_POSITIONS_PER_WORD = 512;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class);
|
||||
|
||||
@ -49,7 +49,6 @@ public class DocumentKeywordsBuilder {
|
||||
var posList = wordToPos.getOrDefault(entry.getKey(), IntList.of());
|
||||
|
||||
if (posList.size() > MAX_POSITIONS_PER_WORD) {
|
||||
logger.info("Truncating positions for word '{}', count was {}", entry.getKey(), posList.size());
|
||||
posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear();
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user