From a18edad04cf54225192e727327eaadb16109a638 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 15 Aug 2024 09:36:50 +0200 Subject: [PATCH] (index) Remove stopword list from converter We want to index all words in the document; stopword handling is moved to the index, where we change the semantics to elide inclusion checks in query construction for a very short list of words tentatively hard-coded in SearchTerms. --- .../index/index/CombinedIndexReader.java | 19 ++++++-- .../marginalia/index/model/SearchTerms.java | 9 ++++ .../marginalia/language/WordDictionary.java | 46 ------------------- .../nu/marginalia/language/WordPatterns.java | 22 +-------- .../resources/dictionary/en-stopwords | 2 - 5 files changed, 26 insertions(+), 72 deletions(-) delete mode 100644 code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java delete mode 100644 code/libraries/language-processing/resources/dictionary/en-stopwords diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index de52d1c5..216192cf 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -111,11 +111,22 @@ public class CombinedIndexReader { return 0; }); - var head = findFullWord(elements.getLong(0)); - for (int i = 1; i < elements.size(); i++) { - head.addInclusionFilter(hasWordFull(elements.getLong(i))); + if (!SearchTerms.stopWords.contains(elements.getLong(0))) { + var head = findFullWord(elements.getLong(0)); + + for (int i = 1; i < elements.size(); i++) { + long termId = elements.getLong(i); + + // if a stop word is present in the query, skip the step of requiring it to be in the document, + // we'll assume it's there and save IO + if (SearchTerms.stopWords.contains(termId)) { + continue; + } + + head.addInclusionFilter(hasWordFull(termId)); + } + queryHeads.add(head); } - 
queryHeads.add(head); // If there are few paths, we can afford to check the priority index as well if (paths.size() < 4) { diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index 832d22b7..019832b2 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -1,6 +1,7 @@ package nu.marginalia.index.model; import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; @@ -19,6 +20,14 @@ public final class SearchTerms { private final List coherencesMandatory; private final List coherencesOptional; + public static final LongArraySet stopWords = new LongArraySet( + new long[] { + getWordId("a"), + getWordId("an"), + getWordId("the"), + } + ); + private final CompiledQueryLong compiledQueryIds; public SearchTerms(SearchQuery query, diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java b/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java deleted file mode 100644 index 622c3b8c..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.language; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.HashSet; -import java.util.Objects; -import java.util.Set; - -public class WordDictionary { - private final Set words; - private static final Logger logger = LoggerFactory.getLogger(WordDictionary.class); - - private WordDictionary(Set words) { - this.words = words; - } - - public static WordDictionary fromClasspathResource(String 
resourceName) { - var set = new HashSet(200, 0.5f); - - try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(resourceName), - "Could not load word frequency table"); - var br = new BufferedReader(new InputStreamReader(resource)) - ) { - while (true) { - String s = br.readLine(); - - if (s == null) break; - if (s.isBlank()) continue; - - set.add(s.trim()); - } - } catch (IOException e) { - logger.warn("Failed to load resource " + resourceName, e); - } - - return new WordDictionary(set); - } - - public boolean contains(String str) { - return words.contains(str); - } -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java index dbc8c9c8..9f137ddc 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java @@ -1,7 +1,5 @@ package nu.marginalia.language; -import org.apache.commons.lang3.StringUtils; - /** Logic for deciding which words are eligible to be keywords. *

* This is in dire need of oversight. Here be towering dragons with names, @@ -14,8 +12,6 @@ public class WordPatterns { public static final int MAX_WORD_LENGTH = 64; public static final String WORD_TOKEN_JOINER = "_"; - private static final WordDictionary stopWords = - WordDictionary.fromClasspathResource("dictionary/en-stopwords"); /** Run checks on the word and exclude terms with too many special characters */ @@ -57,27 +53,13 @@ public class WordPatterns { return true; } + // Stopword exclusion has been moved to the index. We just filter out + // junk words here now. public static boolean isStopWord(String s) { - if (s.length() < MIN_WORD_LENGTH) { - return true; - } - if (!isNotJunkWord(s)) { return true; } - String sLc; - if (StringUtils.isAllLowerCase(s)) { - sLc = s; - } - else { - sLc = s.toLowerCase(); - } - - if (stopWords.contains(sLc)) { - return true; - } - return false; } diff --git a/code/libraries/language-processing/resources/dictionary/en-stopwords b/code/libraries/language-processing/resources/dictionary/en-stopwords deleted file mode 100644 index f19a4788..00000000 --- a/code/libraries/language-processing/resources/dictionary/en-stopwords +++ /dev/null @@ -1,2 +0,0 @@ -a -the \ No newline at end of file