(index) Remove stopword list from converter

We want to index all words in the document. Stopword handling moves to the index, where the semantics change: inclusion checks are elided during query construction for a very short list of words, tentatively hard-coded in SearchTerms.
Viktor Lofgren 2024-08-15 09:36:50 +02:00
parent 92522e8d97
commit a18edad04c
5 changed files with 26 additions and 72 deletions
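To make the intended semantics concrete before the diffs: below is a minimal, hedged sketch of the elision idea, not code from this commit. LongArraySet is the real fastutil class the commit uses; TermFilter, inclusionFilters, and documentContains are hypothetical stand-ins for the index internals.

    import it.unimi.dsi.fastutil.longs.LongArraySet;

    import java.util.ArrayList;
    import java.util.List;

    class StopWordElisionSketch {
        // Tiny hard-coded id set, standing in for SearchTerms.stopWords
        static final LongArraySet STOP_WORDS = new LongArraySet(new long[] { 1L, 2L, 3L });

        interface TermFilter { boolean test(long docId); }

        // Build inclusion filters for a query, skipping stop-listed term ids:
        // the term is still indexed, we just assume it's present and save the IO
        static List<TermFilter> inclusionFilters(long[] queryTermIds) {
            List<TermFilter> filters = new ArrayList<>();
            for (long termId : queryTermIds) {
                if (STOP_WORDS.contains(termId))
                    continue;
                filters.add(docId -> documentContains(docId, termId));
            }
            return filters;
        }

        static boolean documentContains(long docId, long termId) {
            return true; // placeholder for a real index lookup
        }
    }

The trade-off is that a document is assumed to contain the stop word whether or not it does, trading a little precision on words that match nearly everything for less IO per query.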


@@ -111,11 +111,22 @@ public class CombinedIndexReader
                     return 0;
                 });
 
-        var head = findFullWord(elements.getLong(0));
-        for (int i = 1; i < elements.size(); i++) {
-            head.addInclusionFilter(hasWordFull(elements.getLong(i)));
+        if (!SearchTerms.stopWords.contains(elements.getLong(0))) {
+            var head = findFullWord(elements.getLong(0));
+
+            for (int i = 1; i < elements.size(); i++) {
+                long termId = elements.getLong(i);
+
+                // if a stop word is present in the query, skip the step of requiring it to be in the document,
+                // we'll assume it's there and save IO
+                if (SearchTerms.stopWords.contains(termId)) {
+                    continue;
+                }
+
+                head.addInclusionFilter(hasWordFull(termId));
+            }
+            queryHeads.add(head);
         }
-        queryHeads.add(head);
 
         // If there are few paths, we can afford to check the priority index as well
         if (paths.size() < 4) {


@@ -1,6 +1,7 @@
 package nu.marginalia.index.model;
 
 import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongArraySet;
 import it.unimi.dsi.fastutil.longs.LongComparator;
 import it.unimi.dsi.fastutil.longs.LongList;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
@@ -19,6 +20,14 @@ public final class SearchTerms
     private final List<LongList> coherencesMandatory;
     private final List<LongList> coherencesOptional;
 
+    public static final LongArraySet stopWords = new LongArraySet(
+            new long[] {
+                    getWordId("a"),
+                    getWordId("an"),
+                    getWordId("the"),
+            }
+    );
+
     private final CompiledQueryLong compiledQueryIds;
 
     public SearchTerms(SearchQuery query,
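A note on the data structure choice: fastutil's LongArraySet backs the set with a plain long[] and scans it linearly on contains(), which for a three-element set is cheaper than any hash structure and avoids boxing. A trivial demo with placeholder ids (the real set is built from getWordId("a"/"an"/"the")):

    import it.unimi.dsi.fastutil.longs.LongArraySet;

    class StopWordSetDemo {
        public static void main(String[] args) {
            LongArraySet stopWords = new LongArraySet(new long[] { 11L, 22L, 33L });
            System.out.println(stopWords.contains(22L)); // true
            System.out.println(stopWords.contains(44L)); // false
        }
    }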


@@ -1,46 +0,0 @@
-package nu.marginalia.language;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.HashSet;
-import java.util.Objects;
-import java.util.Set;
-
-public class WordDictionary {
-    private final Set<String> words;
-    private static final Logger logger = LoggerFactory.getLogger(WordDictionary.class);
-
-    private WordDictionary(Set<String> words) {
-        this.words = words;
-    }
-
-    public static WordDictionary fromClasspathResource(String resourceName) {
-        var set = new HashSet<String>(200, 0.5f);
-
-        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(resourceName),
-                "Could not load word frequency table");
-             var br = new BufferedReader(new InputStreamReader(resource))
-        ) {
-            while (true) {
-                String s = br.readLine();
-
-                if (s == null) break;
-                if (s.isBlank()) continue;
-
-                set.add(s.trim());
-            }
-        } catch (IOException e) {
-            logger.warn("Failed to load resource " + resourceName, e);
-        }
-
-        return new WordDictionary(set);
-    }
-
-    public boolean contains(String str) {
-        return words.contains(str);
-    }
-}


@@ -1,7 +1,5 @@
 package nu.marginalia.language;
 
-import org.apache.commons.lang3.StringUtils;
-
 /** Logic for deciding which words are eligible to be keywords.
  * <p/>
  * This is in dire need of oversight. Here be towering dragons with names,
@@ -14,8 +12,6 @@ public class WordPatterns
     public static final int MAX_WORD_LENGTH = 64;
     public static final String WORD_TOKEN_JOINER = "_";
 
-    private static final WordDictionary stopWords =
-            WordDictionary.fromClasspathResource("dictionary/en-stopwords");
 
     /** Run checks on the word and exclude terms with too many special characters
      */
@@ -57,27 +53,13 @@ public class WordPatterns
         return true;
     }
 
+    // Stopword exclusion has been moved to the index. We just filter out
+    // junk words here now.
     public static boolean isStopWord(String s) {
         if (s.length() < MIN_WORD_LENGTH) {
             return true;
         }
         if (!isNotJunkWord(s)) {
             return true;
         }
-
-        String sLc;
-        if (StringUtils.isAllLowerCase(s)) {
-            sLc = s;
-        }
-        else {
-            sLc = s.toLowerCase();
-        }
-
-        if (stopWords.contains(sLc)) {
-            return true;
-        }
-
         return false;
     }
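With the dictionary lookup gone, isStopWord is now effectively a junk-word gate rather than a stopword test: dictionary words like "the" pass through and get indexed. As a sketch (not part of the commit), the remaining checks collapse to a single expression, assuming the existing WordPatterns members MIN_WORD_LENGTH and isNotJunkWord:

    public static boolean isStopWord(String s) {
        // only length and junk filtering remain
        return s.length() < MIN_WORD_LENGTH || !isNotJunkWord(s);
    }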