mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(query) Improve handling of stopwords in queries
This commit is contained in:
parent
0e8300979b
commit
6985ab762a
@ -23,7 +23,6 @@ public class QueryExpansion {
|
||||
private final NgramLexicon lexicon;
|
||||
|
||||
private final List<ExpansionStrategy> expansionStrategies = List.of(
|
||||
this::omitStopWords,
|
||||
this::joinDashes,
|
||||
this::splitWordNum,
|
||||
this::joinTerms,
|
||||
@ -56,14 +55,6 @@ public class QueryExpansion {
|
||||
private static final Pattern dashPattern = Pattern.compile("-");
|
||||
private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
|
||||
|
||||
/**
 * Expansion strategy that makes stop words optional in the query graph.
 *
 * Walks every word in the graph and, for each one that
 * WordPatterns.isStopWord() reports as a stop word, asks the graph to
 * add omit links around it via QWordGraph.addOmitLink().
 *
 * @param graph the query word graph to add the omit links to
 */
public void omitStopWords(QWordGraph graph) {
|
||||
for (var qw : graph) {
|
||||
if (WordPatterns.isStopWord(qw.word())) {
|
||||
graph.addOmitLink(qw);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Turn 'lawn-chair' into 'lawnchair'
|
||||
public void joinDashes(QWordGraph graph) {
|
||||
for (var qw : graph) {
|
||||
|
@ -21,7 +21,6 @@ public class QWordGraph implements Iterable<QWord> {
|
||||
private final List<QWordGraphLink> links = new ArrayList<>();
|
||||
private final Map<Integer, List<QWord>> fromTo = new HashMap<>();
|
||||
private final Map<Integer, List<QWord>> toFrom = new HashMap<>();
|
||||
|
||||
private int wordId = 0;
|
||||
|
||||
public QWordGraph(String... words) {
|
||||
@ -76,21 +75,6 @@ public class QWordGraph implements Iterable<QWord> {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
||||
/** Add links that bypass the provided word: for every predecessor and every successor of the
|
||||
* word, a direct link is added from the predecessor to the successor; except for when the
|
||||
* predecessor is the start token and the successor is the end token. */
|
||||
public void addOmitLink(QWord qw) {
|
||||
for (var prev : getPrev(qw)) {
|
||||
for (var next : getNext(qw)) {
|
||||
// Skip the start -> end pair; presumably so that omitting the word can never
// produce a path containing no words at all (TODO confirm)
if (prev.isBeg() && next.isEnd())
|
||||
continue;
|
||||
|
||||
addLink(prev, next);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void addLink(QWord from, QWord to) {
|
||||
links.add(new QWordGraphLink(from, to));
|
||||
fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to);
|
||||
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.model;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedList;
|
||||
import java.util.Objects;
|
||||
@ -32,6 +34,8 @@ public class QWordGraphPathLister {
|
||||
QWord start,
|
||||
QWord end)
|
||||
{
|
||||
boolean isStopword = WordPatterns.isStopWord(start.word());
|
||||
if (!isStopword)
|
||||
stack.addLast(start);
|
||||
|
||||
if (Objects.equals(start, end)) {
|
||||
@ -52,6 +56,7 @@ public class QWordGraphPathLister {
|
||||
}
|
||||
}
|
||||
|
||||
if (!isStopword)
|
||||
stack.removeLast();
|
||||
}
|
||||
}
|
||||
|
@ -61,6 +61,7 @@ public class QueryFactoryTest {
|
||||
lines.limit(1000).forEach(line -> {
|
||||
String[] parts = line.split("\t");
|
||||
if (parts.length == 2) {
|
||||
System.out.println(parts[1]);
|
||||
System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery);
|
||||
}
|
||||
});
|
||||
@ -211,5 +212,12 @@ public class QueryFactoryTest {
|
||||
var subquery = parseAndGetSpecs("The");
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
} @Test
|
||||
|
||||
/**
 * Smoke test: parses a query containing several stop words
 * ("burning the nerves in the neck"), then prints the elapsed wall-clock time
 * in milliseconds and the resulting specs. It makes no assertions, so it can
 * only fail if parseAndGetSpecs() throws.
 */
public void testExpansion6() {
|
||||
long start = System.currentTimeMillis();
|
||||
var subquery = parseAndGetSpecs("burning the nerves in the neck");
|
||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||
System.out.println(subquery);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user