diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
index b8d1f062..c36f410e 100644
--- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
+++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
@@ -25,6 +25,7 @@ public class QueryExpansion {
             this::joinDashes,
             this::splitWordNum,
             this::joinTerms,
+            this::categoryKeywords,
             this::ngramAll
     );
 
@@ -98,6 +99,24 @@ public class QueryExpansion {
         }
     }
 
+    // Category keyword substitution, e.g. guitar wiki -> guitar generator:wiki
+    public void categoryKeywords(QWordGraph graph) {
+        // Works in place: matching words get an extra variant via graph.addVariant(); nothing is returned
+        for (var qw : graph) {
+
+            // Ensure we only perform the substitution on the last word in the query, i.e. the one whose next original word reports isEnd()
+            if (!graph.getNextOriginal(qw).getFirst().isEnd()) {
+                continue;
+            }
+            // Exact match on qw.word(); words not listed below are left without a category variant
+            switch (qw.word()) {
+                case "recipe", "recipes" -> graph.addVariant(qw, "category:food");
+                case "forum" -> graph.addVariant(qw, "generator:forum");
+                case "wiki" -> graph.addVariant(qw, "generator:wiki");
+            }
+        }
+    }
+
     // Turn 'lawn chair' into 'lawnchair'
     public void joinTerms(QWordGraph graph) {
         QWord prev = null;
diff --git a/code/functions/search-query/java/nu/marginalia/util/language/EnglishDictionary.java b/code/functions/search-query/java/nu/marginalia/util/language/EnglishDictionary.java
deleted file mode 100644
index 56e90701..00000000
--- a/code/functions/search-query/java/nu/marginalia/util/language/EnglishDictionary.java
+++ /dev/null
@@ -1,165 +0,0 @@
-package nu.marginalia.util.language;
-
-import com.google.inject.Inject;
-import nu.marginalia.term_frequency_dict.TermFrequencyDict;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedReader;
-import java.io.InputStreamReader;
-import java.util.*;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-public class EnglishDictionary {
-    private final Set englishWords = new HashSet<>();
-    private final TermFrequencyDict tfDict;
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-
-    @Inject
-    public EnglishDictionary(TermFrequencyDict tfDict) {
-        this.tfDict = tfDict;
-        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
-                "Could not load word frequency table");
-             var br = new BufferedReader(new InputStreamReader(resource))
-        ) {
-            for (;;) {
-                String s = br.readLine();
-                if (s == null) {
-                    break;
-                }
-                englishWords.add(s.toLowerCase());
-            }
-        }
-        catch (Exception ex) {
-            throw new RuntimeException(ex);
-        }
-    }
-
-    public boolean isWord(String word) {
-        return englishWords.contains(word);
-    }
-
-    private static final Pattern ingPattern = Pattern.compile(".*(\\w)\\1ing$");
-
-    public Collection getWordVariants(String s) {
-        var variants = findWordVariants(s);
-
-        var ret = variants.stream()
-                .filter(var -> tfDict.getTermFreq(var) > 100)
-                .collect(Collectors.toList());
-
-        if (s.equals("recipe") || s.equals("recipes")) {
-            ret.add("category:food");
-        }
-
-        return ret;
-    }
-
-
-    public Collection findWordVariants(String s) {
-        int sl = s.length();
-
-        if (sl < 2) {
-            return Collections.emptyList();
-        }
-        if (s.endsWith("s")) {
-            String a = s.substring(0, sl-1);
-            String b = s + "es";
-            if (isWord(a) && isWord(b)) {
-                return List.of(a, b);
-            }
-            else if (isWord(a)) {
-                return List.of(a);
-            }
-            else if (isWord(b)) {
-                return List.of(b);
-            }
-        }
-        if (s.endsWith("sm")) {
-            String a = s.substring(0, sl-1)+"t";
-            String b = s.substring(0, sl-1)+"ts";
-            if (isWord(a) && isWord(b)) {
-                return List.of(a, b);
-            }
-            else if (isWord(a)) {
-                return List.of(a);
-            }
-            else if (isWord(b)) {
-                return List.of(b);
-            }
-        }
-        if (s.endsWith("st")) {
-            String a = s.substring(0, sl-1)+"m";
-            String b = s + "s";
-            if (isWord(a) && isWord(b)) {
-                return List.of(a, b);
-            }
-            else if (isWord(a)) {
-                return List.of(a);
-            }
-            else if (isWord(b)) {
-                return List.of(b);
-            }
-        }
-        else if (ingPattern.matcher(s).matches() && sl > 4) { // humming, clapping
-            var a = s.substring(0, sl-4);
-            var b = s.substring(0, sl-3) + "ed";
-
-            if (isWord(a) && isWord(b)) {
-                return List.of(a, b);
-            }
-            else if (isWord(a)) {
-                return List.of(a);
-            }
-            else if (isWord(b)) {
-                return List.of(b);
-            }
-        }
-        else {
-            String a = s + "s";
-            String b = ingForm(s);
-            String c = s + "ed";
-
-            if (isWord(a) && isWord(b) && isWord(c)) {
-                return List.of(a, b, c);
-            }
-            else if (isWord(a) && isWord(b)) {
-                return List.of(a, b);
-            }
-            else if (isWord(b) && isWord(c)) {
-                return List.of(b, c);
-            }
-            else if (isWord(a) && isWord(c)) {
-                return List.of(a, c);
-            }
-            else if (isWord(a)) {
-                return List.of(a);
-            }
-            else if (isWord(b)) {
-                return List.of(b);
-            }
-            else if (isWord(c)) {
-                return List.of(c);
-            }
-        }
-
-        return Collections.emptyList();
-    }
-
-    public String ingForm(String s) {
-        if (s.endsWith("t") && !s.endsWith("tt")) {
-            return s + "ting";
-        }
-        if (s.endsWith("n") && !s.endsWith("nn")) {
-            return s + "ning";
-        }
-        if (s.endsWith("m") && !s.endsWith("mm")) {
-            return s + "ming";
-        }
-        if (s.endsWith("r") && !s.endsWith("rr")) {
-            return s + "ring";
-        }
-        return s + "ing";
-    }
-}
diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
index 0adb2f56..b94bf77d 100644
--- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
+++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java
@@ -12,6 +12,7 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
 import nu.marginalia.index.query.limit.SpecificationLimitType;
 import nu.marginalia.segmentation.NgramLexicon;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
+import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Test;
 
@@ -207,6 +208,17 @@ public class QueryFactoryTest {
         System.out.println(subquery);
     }
 
+    @Test
+    public void testExpansion9() {
+        var subquery = parseAndGetSpecs("pie recipe");
+        // 'recipe' is the last word here, so the category:food variant is expected in the compiled query
+        Assertions.assertTrue(subquery.query.compiledQuery.contains(" category:food "));
+
+        subquery = parseAndGetSpecs("recipe pie");
+        // 'recipe' is not the last word here, so no category:food variant should be added
+        Assertions.assertFalse(subquery.query.compiledQuery.contains(" category:food "));
+    }
+
     @Test
     public void testParsing() {
         var subquery = parseAndGetSpecs("strlen()");