From 6985ab762a3e0e0f15a9ff87c3621e4c647572cb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 23 May 2024 20:50:55 +0200 Subject: [PATCH] (query) Improve handling of stopwords in queries --- .../searchquery/query_parser/QueryExpansion.java | 9 --------- .../query_parser/model/QWordGraph.java | 16 ---------------- .../query_parser/model/QWordGraphPathLister.java | 9 +++++++-- .../marginalia/query/svc/QueryFactoryTest.java | 8 ++++++++ 4 files changed, 15 insertions(+), 27 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 5dcfc2a4..2af0b586 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -23,7 +23,6 @@ public class QueryExpansion { private final NgramLexicon lexicon; private final List expansionStrategies = List.of( - this::omitStopWords, this::joinDashes, this::splitWordNum, this::joinTerms, @@ -56,14 +55,6 @@ public class QueryExpansion { private static final Pattern dashPattern = Pattern.compile("-"); private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); - public void omitStopWords(QWordGraph graph) { - for (var qw : graph) { - if (WordPatterns.isStopWord(qw.word())) { - graph.addOmitLink(qw); - } - } - } - // Turn 'lawn-chair' into 'lawnchair' public void joinDashes(QWordGraph graph) { for (var qw : graph) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 955680ea..724ef6a1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -21,7 +21,6 @@ public class QWordGraph implements Iterable { private final List links = new ArrayList<>(); private final Map> fromTo = new HashMap<>(); private final Map> toFrom = new HashMap<>(); - private int wordId = 0; public QWordGraph(String... words) { @@ -76,21 +75,6 @@ public class QWordGraph implements Iterable { .collect(Collectors.toList()); } - - /** Add a link from the previous word to the next word for every adjacent word in the graph; - * except for when the provided word is preceeded by the start token and succeeded by the - * end token. */ - public void addOmitLink(QWord qw) { - for (var prev : getPrev(qw)) { - for (var next : getNext(qw)) { - if (prev.isBeg() && next.isEnd()) - continue; - - addLink(prev, next); - } - } - } - public void addLink(QWord from, QWord to) { links.add(new QWordGraphLink(from, to)); fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java index f26c01f7..49e42c2c 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java @@ -1,5 +1,7 @@ package nu.marginalia.functions.searchquery.query_parser.model; +import nu.marginalia.language.WordPatterns; + import java.util.HashSet; import java.util.LinkedList; import java.util.Objects; @@ -32,7 +34,9 @@ public class QWordGraphPathLister { QWord start, QWord end) { - stack.addLast(start); + boolean isStopword = WordPatterns.isStopWord(start.word()); + if (!isStopword) + stack.addLast(start); if (Objects.equals(start, end)) { var nodes = new HashSet<>(stack); @@ -52,6 +56,7 @@ public class QWordGraphPathLister { } } - stack.removeLast(); + if (!isStopword) + stack.removeLast(); } } diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 631c94b1..1131db90 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -61,6 +61,7 @@ public class QueryFactoryTest { lines.limit(1000).forEach(line -> { String[] parts = line.split("\t"); if (parts.length == 2) { + System.out.println(parts[1]); System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery); } }); @@ -211,5 +212,12 @@ public class QueryFactoryTest { var subquery = parseAndGetSpecs("The"); System.out.println("Time: " + (System.currentTimeMillis() - start)); System.out.println(subquery); + } @Test + + public void testExpansion6() { + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("burning the nerves in the neck"); + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); } } \ No newline at end of file