(query) Improve handling of stopwords in queries

This commit is contained in:
Viktor Lofgren 2024-05-23 20:50:55 +02:00
parent 0e8300979b
commit 6985ab762a
4 changed files with 15 additions and 27 deletions

View File

@ -23,7 +23,6 @@ public class QueryExpansion {
private final NgramLexicon lexicon; private final NgramLexicon lexicon;
private final List<ExpansionStrategy> expansionStrategies = List.of( private final List<ExpansionStrategy> expansionStrategies = List.of(
this::omitStopWords,
this::joinDashes, this::joinDashes,
this::splitWordNum, this::splitWordNum,
this::joinTerms, this::joinTerms,
@ -56,14 +55,6 @@ public class QueryExpansion {
private static final Pattern dashPattern = Pattern.compile("-"); private static final Pattern dashPattern = Pattern.compile("-");
private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
// For each word in the graph that WordPatterns reports as a stop word,
// ask the graph to add omit links around that word
public void omitStopWords(QWordGraph graph) {
    for (var candidate : graph) {
        boolean stopWord = WordPatterns.isStopWord(candidate.word());
        if (!stopWord) {
            continue;
        }
        graph.addOmitLink(candidate);
    }
}
// Turn 'lawn-chair' into 'lawnchair' // Turn 'lawn-chair' into 'lawnchair'
public void joinDashes(QWordGraph graph) { public void joinDashes(QWordGraph graph) {
for (var qw : graph) { for (var qw : graph) {

View File

@ -21,7 +21,6 @@ public class QWordGraph implements Iterable<QWord> {
private final List<QWordGraphLink> links = new ArrayList<>(); private final List<QWordGraphLink> links = new ArrayList<>();
private final Map<Integer, List<QWord>> fromTo = new HashMap<>(); private final Map<Integer, List<QWord>> fromTo = new HashMap<>();
private final Map<Integer, List<QWord>> toFrom = new HashMap<>(); private final Map<Integer, List<QWord>> toFrom = new HashMap<>();
private int wordId = 0; private int wordId = 0;
public QWordGraph(String... words) { public QWordGraph(String... words) {
@ -76,21 +75,6 @@ public class QWordGraph implements Iterable<QWord> {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
/** Links every predecessor of the given word directly to every successor of
 *  that word, via {@code addLink}.
 *  <p>
 *  The one pairing that is skipped is the one where the predecessor is the
 *  start token and the successor is the end token.
 *
 *  @param qw the word whose predecessors and successors are linked
 */
public void addOmitLink(QWord qw) {
    for (var predecessor : getPrev(qw)) {
        for (var successor : getNext(qw)) {
            boolean spansStartToEnd = predecessor.isBeg() && successor.isEnd();
            if (!spansStartToEnd) {
                addLink(predecessor, successor);
            }
        }
    }
}
public void addLink(QWord from, QWord to) { public void addLink(QWord from, QWord to) {
links.add(new QWordGraphLink(from, to)); links.add(new QWordGraphLink(from, to));
fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to); fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to);

View File

@ -1,5 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser.model; package nu.marginalia.functions.searchquery.query_parser.model;
import nu.marginalia.language.WordPatterns;
import java.util.HashSet; import java.util.HashSet;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Objects; import java.util.Objects;
@ -32,7 +34,9 @@ public class QWordGraphPathLister {
QWord start, QWord start,
QWord end) QWord end)
{ {
stack.addLast(start); boolean isStopword = WordPatterns.isStopWord(start.word());
if (!isStopword)
stack.addLast(start);
if (Objects.equals(start, end)) { if (Objects.equals(start, end)) {
var nodes = new HashSet<>(stack); var nodes = new HashSet<>(stack);
@ -52,6 +56,7 @@ public class QWordGraphPathLister {
} }
} }
stack.removeLast(); if (!isStopword)
stack.removeLast();
} }
} }

View File

@ -61,6 +61,7 @@ public class QueryFactoryTest {
lines.limit(1000).forEach(line -> { lines.limit(1000).forEach(line -> {
String[] parts = line.split("\t"); String[] parts = line.split("\t");
if (parts.length == 2) { if (parts.length == 2) {
System.out.println(parts[1]);
System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery); System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery);
} }
}); });
@ -211,5 +212,12 @@ public class QueryFactoryTest {
var subquery = parseAndGetSpecs("The"); var subquery = parseAndGetSpecs("The");
System.out.println("Time: " + (System.currentTimeMillis() - start)); System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery); System.out.println(subquery);
} @Test
public void testExpansion6() {
long start = System.currentTimeMillis();
var subquery = parseAndGetSpecs("burning the nerves in the neck");
System.out.println("Time: " + (System.currentTimeMillis() - start));
System.out.println(subquery);
} }
} }