(search) Reintroduce query rewriting for recipes, add rules for wikis and forums

This commit is contained in:
Viktor Lofgren 2024-12-31 16:05:00 +01:00
parent 5e2a8e9f27
commit baeb4a46cd
3 changed files with 31 additions and 165 deletions

View File

@ -25,6 +25,7 @@ public class QueryExpansion {
this::joinDashes,
this::splitWordNum,
this::joinTerms,
this::categoryKeywords,
this::ngramAll
);
@ -98,6 +99,24 @@ public class QueryExpansion {
}
}
// Category keyword substitution, e.g. guitar wiki -> guitar generator:wiki
//
// Walks every word in the graph and, for a trailing "recipe(s)", "forum" or "wiki",
// registers the corresponding special search term as a variant of that word.
public void categoryKeywords(QWordGraph graph) {
    for (var qw : graph) {
        // The substitution is restricted to the last word of the query, i.e. the
        // word whose first original successor is the end marker
        boolean isLastWord = graph.getNextOriginal(qw).getFirst().isEnd();

        if (isLastWord) {
            String replacement = switch (qw.word()) {
                case "recipe", "recipes" -> "category:food";
                case "forum" -> "generator:forum";
                case "wiki" -> "generator:wiki";
                default -> null; // not a category keyword, leave the word alone
            };

            if (replacement != null) {
                graph.addVariant(qw, replacement);
            }
        }
    }
}
// Turn 'lawn chair' into 'lawnchair'
public void joinTerms(QWordGraph graph) {
QWord prev = null;

View File

@ -1,165 +0,0 @@
package nu.marginalia.util.language;
import com.google.inject.Inject;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Set of English words read from the {@code dictionary/en-words} resource, together
 * with a handful of suffix heuristics for deriving related forms of a word
 * (plural, -ist/-ism, -ing, -ed).
 */
public class EnglishDictionary {
    /** Matches words ending in a doubled character followed by "ing", e.g. humming, clapping. */
    private static final Pattern ingPattern = Pattern.compile(".*(\\w)\\1ing$");

    private final Set<String> englishWords = new HashSet<>();
    private final TermFrequencyDict tfDict;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * Reads the word list (one word per line) through the system class loader and
     * stores every entry in lower case.
     *
     * @param tfDict term frequency dictionary consulted by {@link #getWordVariants(String)}
     * @throws RuntimeException if the resource is missing or cannot be read; the
     *                          underlying exception is preserved as the cause
     */
    @Inject
    public EnglishDictionary(TermFrequencyDict tfDict) {
        this.tfDict = tfDict;

        // Decode with an explicit charset instead of the platform default.
        // NOTE(review): assumes the word list is UTF-8 (or plain ASCII) - confirm.
        try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"),
                "Could not load English dictionary resource dictionary/en-words");
             var br = new BufferedReader(new InputStreamReader(resource, java.nio.charset.StandardCharsets.UTF_8))
        ) {
            String line;
            while ((line = br.readLine()) != null) {
                // Locale.ROOT keeps the stored form independent of the JVM's default locale
                englishWords.add(line.toLowerCase(Locale.ROOT));
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }

    /**
     * Tests whether the given string is present in the dictionary.
     * The lookup is exact; entries are stored in lower case and the argument is not normalized.
     *
     * @param word the word to look up
     * @return true if the dictionary contains {@code word}
     */
    public boolean isWord(String word) {
        return englishWords.contains(word);
    }

    /**
     * Returns the variants found by {@link #findWordVariants(String)} whose term frequency
     * exceeds 100. For "recipe" and "recipes" the special term {@code category:food} is appended.
     *
     * @param s the word to find variants for
     * @return a mutable collection of variants, possibly empty
     */
    public Collection<String> getWordVariants(String s) {
        // Collect into an explicit ArrayList: Collectors.toList() makes no promise that
        // the returned list is mutable, and an element may be added below.
        List<String> ret = findWordVariants(s).stream()
                .filter(variant -> tfDict.getTermFreq(variant) > 100)
                .collect(Collectors.toCollection(ArrayList::new));

        if (s.equals("recipe") || s.equals("recipes")) {
            ret.add("category:food");
        }

        return ret;
    }

    /**
     * Derives candidate forms of {@code s} from its suffix and returns those candidates
     * that are present in the dictionary, in the order they were generated.
     *
     * @param s the word to derive variants from
     * @return an unmodifiable collection of dictionary words; empty if {@code s} is shorter
     *         than two characters or no candidate is a known word
     */
    public Collection<String> findWordVariants(String s) {
        int sl = s.length();

        if (sl < 2) {
            return Collections.emptyList();
        }

        // NOTE(review): the "s" and "sm" rules are stand-alone checks, so a word that
        // matches one of them without producing any dictionary hit continues on to the
        // rules below (in practice the generic rule at the end) - confirm this is intended.
        if (s.endsWith("s")) {
            // drop the trailing "s", or append "es"
            List<String> found = knownWords(s.substring(0, sl - 1), s + "es");
            if (!found.isEmpty()) {
                return found;
            }
        }

        if (s.endsWith("sm")) {
            // swap the final "m" for "t" / "ts"
            String stem = s.substring(0, sl - 1);
            List<String> found = knownWords(stem + "t", stem + "ts");
            if (!found.isEmpty()) {
                return found;
            }
        }

        if (s.endsWith("st")) {
            // swap the final "t" for "m", or append "s"
            return knownWords(s.substring(0, sl - 1) + "m", s + "s");
        }
        else if (sl > 4 && ingPattern.matcher(s).matches()) { // humming, clapping
            // strip the doubled letter plus "ing", or replace "ing" with "ed"
            return knownWords(s.substring(0, sl - 4), s.substring(0, sl - 3) + "ed");
        }
        else {
            return knownWords(s + "s", ingForm(s), s + "ed");
        }
    }

    /**
     * Builds an "-ing" form of {@code s}. A final t, n, m or r is doubled first
     * unless the word already ends in that letter twice.
     *
     * @param s the base word
     * @return {@code s} with an "-ing" ending attached
     */
    public String ingForm(String s) {
        if (s.endsWith("t") && !s.endsWith("tt")) {
            return s + "ting";
        }
        if (s.endsWith("n") && !s.endsWith("nn")) {
            return s + "ning";
        }
        if (s.endsWith("m") && !s.endsWith("mm")) {
            return s + "ming";
        }
        if (s.endsWith("r") && !s.endsWith("rr")) {
            return s + "ring";
        }
        return s + "ing";
    }

    /** Returns, in the given order, the candidates that are dictionary words (unmodifiable). */
    private List<String> knownWords(String... candidates) {
        return Arrays.stream(candidates)
                .filter(this::isWord)
                .collect(Collectors.toUnmodifiableList());
    }
}

View File

@ -12,6 +12,7 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@ -207,6 +208,17 @@ public class QueryFactoryTest {
System.out.println(subquery);
}
@Test
public void testExpansion9() {
    // With "recipe" as the final term, the compiled query should contain the food category
    var keywordLast = parseAndGetSpecs("pie recipe");
    boolean expandedWhenLast = keywordLast.query.compiledQuery.contains(" category:food ");
    Assertions.assertTrue(expandedWhenLast);

    // With "recipe" anywhere but last, no category term should be added
    var keywordFirst = parseAndGetSpecs("recipe pie");
    boolean expandedWhenFirst = keywordFirst.query.compiledQuery.contains(" category:food ");
    Assertions.assertFalse(expandedWhenFirst);
}
@Test
public void testParsing() {
var subquery = parseAndGetSpecs("strlen()");