mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(query) Improve handling of stopwords in queries
This commit is contained in:
parent
0e8300979b
commit
6985ab762a
@ -23,7 +23,6 @@ public class QueryExpansion {
|
|||||||
private final NgramLexicon lexicon;
|
private final NgramLexicon lexicon;
|
||||||
|
|
||||||
private final List<ExpansionStrategy> expansionStrategies = List.of(
|
private final List<ExpansionStrategy> expansionStrategies = List.of(
|
||||||
this::omitStopWords,
|
|
||||||
this::joinDashes,
|
this::joinDashes,
|
||||||
this::splitWordNum,
|
this::splitWordNum,
|
||||||
this::joinTerms,
|
this::joinTerms,
|
||||||
@ -56,14 +55,6 @@ public class QueryExpansion {
|
|||||||
private static final Pattern dashPattern = Pattern.compile("-");
|
private static final Pattern dashPattern = Pattern.compile("-");
|
||||||
private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
|
private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]");
|
||||||
|
|
||||||
public void omitStopWords(QWordGraph graph) {
|
|
||||||
for (var qw : graph) {
|
|
||||||
if (WordPatterns.isStopWord(qw.word())) {
|
|
||||||
graph.addOmitLink(qw);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Turn 'lawn-chair' into 'lawnchair'
|
// Turn 'lawn-chair' into 'lawnchair'
|
||||||
public void joinDashes(QWordGraph graph) {
|
public void joinDashes(QWordGraph graph) {
|
||||||
for (var qw : graph) {
|
for (var qw : graph) {
|
||||||
|
@ -21,7 +21,6 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
private final List<QWordGraphLink> links = new ArrayList<>();
|
private final List<QWordGraphLink> links = new ArrayList<>();
|
||||||
private final Map<Integer, List<QWord>> fromTo = new HashMap<>();
|
private final Map<Integer, List<QWord>> fromTo = new HashMap<>();
|
||||||
private final Map<Integer, List<QWord>> toFrom = new HashMap<>();
|
private final Map<Integer, List<QWord>> toFrom = new HashMap<>();
|
||||||
|
|
||||||
private int wordId = 0;
|
private int wordId = 0;
|
||||||
|
|
||||||
public QWordGraph(String... words) {
|
public QWordGraph(String... words) {
|
||||||
@ -76,21 +75,6 @@ public class QWordGraph implements Iterable<QWord> {
|
|||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Add a link from the previous word to the next word for every adjacent word in the graph;
|
|
||||||
* except for when the provided word is preceded by the start token and succeeded by the
|
|
||||||
* end token. */
|
|
||||||
public void addOmitLink(QWord qw) {
|
|
||||||
for (var prev : getPrev(qw)) {
|
|
||||||
for (var next : getNext(qw)) {
|
|
||||||
if (prev.isBeg() && next.isEnd())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
addLink(prev, next);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addLink(QWord from, QWord to) {
|
public void addLink(QWord from, QWord to) {
|
||||||
links.add(new QWordGraphLink(from, to));
|
links.add(new QWordGraphLink(from, to));
|
||||||
fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to);
|
fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to);
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package nu.marginalia.functions.searchquery.query_parser.model;
|
package nu.marginalia.functions.searchquery.query_parser.model;
|
||||||
|
|
||||||
|
import nu.marginalia.language.WordPatterns;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
@ -32,7 +34,9 @@ public class QWordGraphPathLister {
|
|||||||
QWord start,
|
QWord start,
|
||||||
QWord end)
|
QWord end)
|
||||||
{
|
{
|
||||||
stack.addLast(start);
|
boolean isStopword = WordPatterns.isStopWord(start.word());
|
||||||
|
if (!isStopword)
|
||||||
|
stack.addLast(start);
|
||||||
|
|
||||||
if (Objects.equals(start, end)) {
|
if (Objects.equals(start, end)) {
|
||||||
var nodes = new HashSet<>(stack);
|
var nodes = new HashSet<>(stack);
|
||||||
@ -52,6 +56,7 @@ public class QWordGraphPathLister {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
stack.removeLast();
|
if (!isStopword)
|
||||||
|
stack.removeLast();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -61,6 +61,7 @@ public class QueryFactoryTest {
|
|||||||
lines.limit(1000).forEach(line -> {
|
lines.limit(1000).forEach(line -> {
|
||||||
String[] parts = line.split("\t");
|
String[] parts = line.split("\t");
|
||||||
if (parts.length == 2) {
|
if (parts.length == 2) {
|
||||||
|
System.out.println(parts[1]);
|
||||||
System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery);
|
System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@ -211,5 +212,12 @@ public class QueryFactoryTest {
|
|||||||
var subquery = parseAndGetSpecs("The");
|
var subquery = parseAndGetSpecs("The");
|
||||||
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||||
System.out.println(subquery);
|
System.out.println(subquery);
|
||||||
|
} @Test
|
||||||
|
|
||||||
|
public void testExpansion6() {
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
var subquery = parseAndGetSpecs("burning the nerves in the neck");
|
||||||
|
System.out.println("Time: " + (System.currentTimeMillis() - start));
|
||||||
|
System.out.println(subquery);
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user