diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index de52d1c5..216192cf 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -111,11 +111,22 @@ public class CombinedIndexReader { return 0; }); - var head = findFullWord(elements.getLong(0)); - for (int i = 1; i < elements.size(); i++) { - head.addInclusionFilter(hasWordFull(elements.getLong(i))); + if (!SearchTerms.stopWords.contains(elements.getLong(0))) { + var head = findFullWord(elements.getLong(0)); + + for (int i = 1; i < elements.size(); i++) { + long termId = elements.getLong(i); + + // if a stop word is present in the query, skip the step of requiring it to be in the document, + // we'll assume it's there and save IO + if (SearchTerms.stopWords.contains(termId)) { + continue; + } + + head.addInclusionFilter(hasWordFull(termId)); + } + queryHeads.add(head); } - queryHeads.add(head); // If there are few paths, we can afford to check the priority index as well if (paths.size() < 4) { diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index 832d22b7..019832b2 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -1,6 +1,7 @@ package nu.marginalia.index.model; import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; @@ -19,6 +20,14 @@ public final class SearchTerms { private final List coherencesMandatory; private final List coherencesOptional; + public static final LongArraySet stopWords = new LongArraySet( + new long[] { + getWordId("a"), + getWordId("an"), + getWordId("the"), + } + ); + private final CompiledQueryLong compiledQueryIds; public SearchTerms(SearchQuery query, diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java b/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java deleted file mode 100644 index 622c3b8c..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.language; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.HashSet; -import java.util.Objects; -import java.util.Set; - -public class WordDictionary { - private final Set words; - private static final Logger logger = LoggerFactory.getLogger(WordDictionary.class); - - private WordDictionary(Set words) { - this.words = words; - } - - public static WordDictionary fromClasspathResource(String resourceName) { - var set = new HashSet(200, 0.5f); - - try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(resourceName), - "Could not load word frequency table"); - var br = new BufferedReader(new InputStreamReader(resource)) - ) { - while (true) { - String s = br.readLine(); - - if (s == null) break; - if (s.isBlank()) continue; - - set.add(s.trim()); - } - } catch (IOException e) { - logger.warn("Failed to load resource " + resourceName, e); - } - - return new WordDictionary(set); - } - - public boolean contains(String str) { - return words.contains(str); - } -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java index dbc8c9c8..9f137ddc 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java @@ -1,7 +1,5 @@ package nu.marginalia.language; -import org.apache.commons.lang3.StringUtils; - /** Logic for deciding which words are eligible to be keywords. *

* This is in dire need of oversight. Here be towering dragons with names, @@ -14,8 +12,6 @@ public class WordPatterns { public static final int MAX_WORD_LENGTH = 64; public static final String WORD_TOKEN_JOINER = "_"; - private static final WordDictionary stopWords = - WordDictionary.fromClasspathResource("dictionary/en-stopwords"); /** Run checks on the word and exclude terms with too many special characters */ @@ -57,27 +53,13 @@ public class WordPatterns { return true; } + // Stopword exclusion has been moved to the index. We just filter out + // junk words here now. public static boolean isStopWord(String s) { - if (s.length() < MIN_WORD_LENGTH) { - return true; - } - if (!isNotJunkWord(s)) { return true; } - String sLc; - if (StringUtils.isAllLowerCase(s)) { - sLc = s; - } - else { - sLc = s.toLowerCase(); - } - - if (stopWords.contains(sLc)) { - return true; - } - return false; } diff --git a/code/libraries/language-processing/resources/dictionary/en-stopwords b/code/libraries/language-processing/resources/dictionary/en-stopwords deleted file mode 100644 index f19a4788..00000000 --- a/code/libraries/language-processing/resources/dictionary/en-stopwords +++ /dev/null @@ -1,2 +0,0 @@ -a -the \ No newline at end of file