From 00ef4f98031efb6d0a4d725124e46ebaab37609e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:16:49 +0100 Subject: [PATCH] (WIP) Partial integration of new query expansion code into the query-service --- .../query_parser/QueryVariants.java | 187 +------------- .../variant/ExpansionStrategy.java | 7 + .../query_parser/variant/QueryExpansion.java | 111 ++++++++ .../query_parser/variant/QueryVariant.java | 17 ++ .../query_parser/variant/QueryVariantSet.java | 21 ++ .../query_parser/variant/QueryWord.java | 10 + .../query_parser/variant/VariantStrategy.java | 8 + .../query_parser/variant/model/QWord.java | 47 ++++ .../variant/model/QWordGraph.java | 236 ++++++++++++++++++ .../variant/strategy/CombineDashes.java | 40 +++ .../variant/strategy/JoinTerms.java | 58 +++++ .../variant/strategy/SplitWordNum.java | 65 +++++ .../searchquery/svc/QueryFactory.java | 11 +- .../variant/model/QWordGraphTest.java | 33 +++ .../query/svc/QueryFactoryTest.java | 3 +- 15 files changed, 666 insertions(+), 188 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java create mode 100644 
code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java create mode 100644 code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java index 9732e53f..10648486 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java @@ -1,17 +1,14 @@ package nu.marginalia.functions.searchquery.query_parser; -import ca.rmen.porterstemmer.PorterStemmer; -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; import nu.marginalia.util.language.EnglishDictionary; import nu.marginalia.LanguageModels; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.util.ngrams.NGramBloomFilter; import 
nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; @@ -22,17 +19,13 @@ import java.util.regex.Pattern; public class QueryVariants { private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; - private final PorterStemmer ps = new PorterStemmer(); - private final NGramBloomFilter nGramBloomFilter; private final EnglishDictionary englishDictionary; private final ThreadLocal sentenceExtractor; public QueryVariants(LanguageModels lm, TermFrequencyDict dict, - NGramBloomFilter nGramBloomFilter, EnglishDictionary englishDictionary) { - this.nGramBloomFilter = nGramBloomFilter; this.englishDictionary = englishDictionary; this.keywordExtractor = new KeywordExtractor(); this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); @@ -40,33 +33,6 @@ public class QueryVariants { } - final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); - final Pattern dashBoundary = Pattern.compile("-"); - - @AllArgsConstructor - private static class Word { - public final String stemmed; - public final String word; - public final String wordOriginal; - } - - @AllArgsConstructor @Getter @ToString @EqualsAndHashCode - public static class QueryVariant { - public final List terms; - public final double value; - } - - @Getter @ToString - public static class QueryVariantSet { - final List faithful = new ArrayList<>(); - final List alternative = new ArrayList<>(); - - final List nonLiterals = new ArrayList<>(); - - public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); - } - } public QueryVariantSet getQueryVariants(List query) { final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); @@ -108,19 +74,11 @@ public class QueryVariants { byStart.put(0, elongatedFirstWords); } - final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); + final List> 
goodSpans = getWordSpans(byStart, sentence, livingSpans); List> faithfulQueries = new ArrayList<>(); List> alternativeQueries = new ArrayList<>(); - for (var ls : goodSpans) { - faithfulQueries.addAll(createTokens(ls)); - } - - for (var span : goodSpans) { - alternativeQueries.addAll(joinTerms(span)); - } - for (var ls : goodSpans) { var last = ls.get(ls.size() - 1); @@ -174,105 +132,8 @@ public class QueryVariants { return ret; } - private Collection> createTokens(List ls) { - List asTokens = new ArrayList<>(); - List> ret = new ArrayList<>(); - - - boolean dash = false; - boolean num = false; - - for (var span : ls) { - dash |= dashBoundary.matcher(span.word).find(); - num |= numWordBoundary.matcher(span.word).find(); - if (ls.size() == 1 || !isOmittableWord(span.word)) { - asTokens.add(span.word); - } - } - ret.add(asTokens); - - if (dash) { - ret.addAll(combineDashWords(ls)); - } - - if (num) { - ret.addAll(splitWordNum(ls)); - } - - return ret; - } - - private boolean isOmittableWord(String word) { - return switch (word) { - case "vs", "or", "and", "versus", "is", "the", "why", "when", "if", "who", "are", "am" -> true; - default -> false; - }; - } - - private Collection> splitWordNum(List ls) { - List asTokens2 = new ArrayList<>(); - - boolean num = false; - - for (var span : ls) { - var wordMatcher = numWordBoundary.matcher(span.word); - var stemmedMatcher = numWordBoundary.matcher(span.stemmed); - - int ws = 0; - int ss = 0; - boolean didSplit = false; - while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { - ws = wordMatcher.start()+1; - ss = stemmedMatcher.start()+1; - if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) - || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) - { - String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); - asTokens2.add(combined); - didSplit = true; - num = true; - } - } - - if (!didSplit) { - asTokens2.add(span.word); - 
} - } - - if (num) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private Collection> combineDashWords(List ls) { - List asTokens2 = new ArrayList<>(); - boolean dash = false; - - for (var span : ls) { - var matcher = dashBoundary.matcher(span.word); - if (matcher.find() && nGramBloomFilter.isKnownNGram(ps.stemWord(dashBoundary.matcher(span.word).replaceAll("")))) { - dash = true; - String combined = dashBoundary.matcher(span.word).replaceAll(""); - asTokens2.add(combined); - } - else { - asTokens2.add(span.word); - } - } - - if (dash) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private String splitAtNumBoundary(String in, int splitPoint, String joiner) { - return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); - } - - private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { - List> goodSpans = new ArrayList<>(); + private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { + List> goodSpans = new ArrayList<>(); for (int i = 0; i < 1; i++) { var spans = byStart.get(i); @@ -298,9 +159,9 @@ public class QueryVariants { int end = span.get(span.size()-1).end; if (end == sentence.length()) { - var gs = new ArrayList(span.size()); + var gs = new ArrayList(span.size()); for (var s : span) { - gs.add(new Word(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), + gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), s.size() == 1 ? 
sentence.words[s.start] : "")); } goodSpans.add(gs); @@ -325,38 +186,6 @@ public class QueryVariants { return goodSpans; } - private List> joinTerms(List span) { - List> ret = new ArrayList<>(); - - for (int i = 0; i < span.size()-1; i++) { - var a = span.get(i); - var b = span.get(i+1); - - var stemmed = ps.stemWord(a.word + b.word); - - double scoreCombo = dict.getTermFreqStemmed(stemmed); - if (scoreCombo > 10000) { - List asTokens = new ArrayList<>(); - - for (int j = 0; j < i; j++) { - var word = span.get(j).word; - asTokens.add(word); - } - { - var word = a.word + b.word; - asTokens.add(word); - } - for (int j = i+2; j < span.size(); j++) { - var word = span.get(j).word; - asTokens.add(word); - } - - ret.add(asTokens); - } - } - - return ret; - } private JoinedQueryAndNonLiteralTokens joinQuery(List query) { StringJoiner s = new StringJoiner(" "); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java new file mode 100644 index 00000000..18987aea --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; + +public interface ExpansionStrategy { + void expand(QWordGraph graph); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java new file mode 100644 index 00000000..faac81d4 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -0,0 +1,111 @@ +package 
nu.marginalia.functions.searchquery.query_parser.variant; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; +import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; +import org.apache.commons.lang3.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class QueryExpansion { + private static final PorterStemmer ps = new PorterStemmer(); + private final TermFrequencyDict dict; + private final NgramLexicon lexicon; + List expansionStrategies = List.of( + this::joinDashes, + this::splitWordNum, + this::joinTerms, + this::createSegments + ); + + public QueryExpansion(TermFrequencyDict dict, + NgramLexicon lexicon + ) { + this.dict = dict; + this.lexicon = lexicon; + } + + public QWordGraph expandQuery(List words) { + + QWordGraph graph = new QWordGraph(words); + + for (var strategy : expansionStrategies) { + strategy.expand(graph); + } + + return null; + } + + private static final Pattern dashPattern = Pattern.compile("-"); + private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); + + // Turn 'lawn-chair' into 'lawnchair' + public void joinDashes(QWordGraph graph) { + for (var qw : graph) { + if (qw.word().contains("-")) { + var joined = StringUtils.join(dashPattern.split(qw.word())); + graph.addVariant(qw, joined); + } + } + } + + + // Turn 'MP3' into 'MP-3' + public void splitWordNum(QWordGraph graph) { + for (var qw : graph) { + var matcher = numWordBoundary.matcher(qw.word()); + if (matcher.matches()) { + var joined = StringUtils.join(dashPattern.split(qw.word()), '-'); + graph.addVariant(qw, joined); + } + } + } + + // Turn 'lawn chair' into 'lawnchair' + public void joinTerms(QWordGraph graph) { + QWord prev = null; + + for (var qw : graph) 
{ + if (prev != null) { + var joinedWord = prev.word() + qw.word(); + var joinedStemmed = ps.stemWord(joinedWord); + + var scoreA = dict.getTermFreqStemmed(prev.stemmed()); + var scoreB = dict.getTermFreqStemmed(qw.stemmed()); + + var scoreCombo = dict.getTermFreqStemmed(joinedStemmed); + + if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) { + graph.addVariantForSpan(prev, qw, joinedWord); + } + } + + prev = qw; + } + } + + public void createSegments(QWordGraph graph) { + List nodes = new ArrayList<>(); + + for (var qw : graph) { + nodes.add(qw); + } + + String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + + for (int length = 2; length < Math.min(10, words.length); length++) { + for (var segment : lexicon.findSegments(length, words)) { + int start = segment.start(); + int end = segment.start() + segment.length(); + var word = StringUtils.join(words, "_", start, end); + + graph.addVariantForSpan(nodes.get(start), nodes.get(end), word); + } + } + } + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java new file mode 100644 index 00000000..8d24387b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java @@ -0,0 +1,17 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; + +import java.util.List; + +@AllArgsConstructor +@Getter +@ToString +@EqualsAndHashCode +public class QueryVariant { + public final List terms; + public final double value; +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java new file mode 100644 index 00000000..b01fbd5e --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java @@ -0,0 +1,21 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.Getter; +import lombok.ToString; +import nu.marginalia.functions.searchquery.query_parser.token.Token; + +import java.util.ArrayList; +import java.util.List; + +@Getter +@ToString +public class QueryVariantSet { + public final List faithful = new ArrayList<>(); + public final List alternative = new ArrayList<>(); + + public final List nonLiterals = new ArrayList<>(); + + public boolean isEmpty() { + return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java new file mode 100644 index 00000000..9c158a43 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java @@ -0,0 +1,10 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.AllArgsConstructor; + +@AllArgsConstructor +public class QueryWord { + public final String stemmed; + public final String word; + public final String wordOriginal; +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java new file mode 100644 index 00000000..2c1a5bfb --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java @@ -0,0 +1,8 @@ +package 
nu.marginalia.functions.searchquery.query_parser.variant; + +import java.util.Collection; +import java.util.List; + +public interface VariantStrategy { + Collection> constructVariants(List ls); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java new file mode 100644 index 00000000..07f65c95 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java @@ -0,0 +1,47 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import ca.rmen.porterstemmer.PorterStemmer; + +public record QWord( + int ord, + boolean variant, + String stemmed, + String word, + String original) +{ + + // These are special words that are not in the input, but are added to the graph, + // note the space around the ^ and $, to avoid collisions with real words + private static final String BEG_MARKER = " ^ "; + private static final String END_MARKER = " $ "; + + private static final PorterStemmer ps = new PorterStemmer(); + + public boolean isBeg() { + return word.equals(BEG_MARKER); + } + + public boolean isEnd() { + return word.equals(END_MARKER); + } + + public static QWord beg() { + return new QWord(Integer.MIN_VALUE, false, BEG_MARKER, BEG_MARKER, BEG_MARKER); + } + + public static QWord end() { + return new QWord(Integer.MAX_VALUE, false, END_MARKER, END_MARKER, END_MARKER); + } + + public boolean isOriginal() { + return !variant; + } + + public QWord(int ord, String word) { + this(ord, false, ps.stemWord(word), word, word); + } + + public QWord(int ord, QWord original, String word) { + this(ord, true, ps.stemWord(word), word, original.original); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java new file mode 100644 index 00000000..f9902733 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java @@ -0,0 +1,236 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import org.jetbrains.annotations.NotNull; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** Graph structure for constructing query variants. The graph should be a directed acyclic graph, + * with a single start node and a single end node, denoted by QWord.beg() and QWord.end() respectively. + *

+ * Naively, every path from the start to the end node should represent a valid query variant, although in + * practice it is desirable to be clever about how to evaluate the paths, to avoid combinatorial explosion. + */ +public class QWordGraph implements Iterable { + + + public record QWordGraphLink(QWord from, QWord to) { + } + + private final List links = new ArrayList<>(); + private final Map> fromTo = new HashMap<>(); + private final Map> toFrom = new HashMap<>(); + + private int wordId = 0; + + public QWordGraph(String... words) { + this(List.of(words)); + } + + public QWordGraph(List words) { + QWord beg = QWord.beg(); + QWord end = QWord.end(); + + var prev = beg; + + for (String s : words) { + var word = new QWord(wordId++, s); + addLink(prev, word); + prev = word; + } + + addLink(prev, end); + } + + public void addVariant(QWord original, String word) { + var siblings = getVariants(original); + if (siblings.stream().anyMatch(w -> w.word().equals(word))) + return; + + var newWord = new QWord(wordId++, original, word); + + for (var prev : getPrev(original)) + addLink(prev, newWord); + for (var next : getNext(original)) + addLink(newWord, next); + } + + public void addVariantForSpan(QWord first, QWord last, String word) { + var newWord = new QWord(wordId++, first, word); + + for (var prev : getPrev(first)) + addLink(prev, newWord); + for (var next : getNext(last)) + addLink(newWord, next); + } + + public List getVariants(QWord original) { + var prevNext = getPrev(original).stream() + .flatMap(prev -> getNext(prev).stream()) + .collect(Collectors.toSet()); + + return getNext(original).stream() + .flatMap(next -> getPrev(next).stream()) + .filter(prevNext::contains) + .collect(Collectors.toList()); + } + + + public void addLink(QWord from, QWord to) { + links.add(new QWordGraphLink(from, to)); + fromTo.computeIfAbsent(from, k -> new ArrayList<>()).add(to); + toFrom.computeIfAbsent(to, k -> new ArrayList<>()).add(from); + } + + public List links() { + return 
Collections.unmodifiableList(links); + } + public List nodes() { + return links.stream() + .flatMap(l -> Stream.of(l.from(), l.to())) + .sorted(Comparator.comparing(QWord::ord)) + .distinct() + .collect(Collectors.toList()); + } + + + public List getNext(QWord word) { + return fromTo.getOrDefault(word, List.of()); + } + public List getNextOriginal(QWord word) { + return fromTo.getOrDefault(word, List.of()) + .stream() + .filter(QWord::isOriginal) + .toList(); + } + + public List getPrev(QWord word) { + return toFrom.getOrDefault(word, List.of()); + } + public List getPrevOriginal(QWord word) { + return toFrom.getOrDefault(word, List.of()) + .stream() + .filter(QWord::isOriginal) + .toList(); + } + + // Returns true if removing the word would disconnect the graph + // so that there is no path from 'begin' to 'end'. This is useful + // in breaking up the graph into smaller component subgraphs, and + // understanding which vertexes can be re-ordered without changing + // the semantics of the encoded query. + public boolean isBypassed(QWord word, QWord begin, QWord end) { + assert word.isOriginal() : "Can only bypass original words"; + + Set edge = new HashSet<>(); + Set visited = new HashSet<>(); + + edge.add(begin); + + while (!edge.isEmpty()) { + Set next = new HashSet<>(); + + for (var w : edge) { + // Skip the word we're trying find a bypassing route for + if (w.ord() == word.ord()) + continue; + + if (Objects.equals(w, end)) + return true; + + next.addAll(getNext(w)); + } + + next.removeAll(visited); + visited.addAll(next); + edge = next; + } + + return false; + } + + /** Returns a set of all nodes that are between 'begin' and 'end' in the graph, + * including the terminal nodes. This is useful for breaking up the graph into + * smaller components that can be evaluated in any order. + *

+ * It is assumed that there is a path from 'begin' to 'end' in the graph, and no + * other paths that bypass 'end'. + *

+ * The nodes are returned in the order they are encountered in a breadth-first search. + */ + public List nodesBetween(QWord begin, QWord end) { + List edge = new ArrayList<>(); + List visited = new ArrayList<>(); + + edge.add(begin); + + while (!edge.isEmpty()) { + List next = new ArrayList<>(); + + for (var w : edge) { + if (Objects.equals(w, end)) + continue; + + assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex"; + + next.addAll(getNext(w)); + } + + next.removeAll(visited); + visited.addAll(next); + edge = next; + } + + return visited; + } + + /** Returns a list of subgraphs that are connected on the path from + * 'begin' to 'end'. This is useful for breaking up the graph into + * smaller components that can be evaluated in any order. + *

+ * The subgraphs are specified by their predecessor and successor nodes, + * + */ + public List getSubgraphs(QWord begin, QWord end) { + // Short-circuit for the common and simple case + if (getNext(begin).equals(List.of(end))) + return List.of(new QWordGraphLink(begin, end)); + + List subgraphs = new ArrayList<>(); + + List points = nodesBetween(begin, end) + .stream() + .filter(w -> isBypassed(w, begin, end)) + .toList(); + + for (int i = 0; i < points.size() - 1; i++) { + var a = points.get(i); + var b = points.get(i+1); + + subgraphs.add(new QWordGraphLink(a, b)); + } + + return subgraphs; + } + + + @NotNull + @Override + public Iterator iterator() { + return new Iterator<>() { + QWord pos = QWord.beg(); + + @Override + public boolean hasNext() { + return !pos.isEnd(); + } + + @Override + public QWord next() { + pos = getNextOriginal(pos).get(0); + return pos; + } + }; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java new file mode 100644 index 00000000..c24defbe --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java @@ -0,0 +1,40 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** Variant strategy that combines word that have dashes, as sometimes lawn-chair + * gets spelled lawnchair */ +public class CombineDashes implements VariantStrategy { + final Pattern dashBoundary = Pattern.compile("-"); + + public CombineDashes() { + } + + 
@Override + public Collection> constructVariants(List words) { + List asTokens2 = new ArrayList<>(); + boolean dash = false; + + for (var span : words) { + var matcher = dashBoundary.matcher(span.word); + if (matcher.find()) { + String combined = dashBoundary.matcher(span.word).replaceAll(""); + asTokens2.add(combined); + } + + asTokens2.add(span.word); + } + + if (dash) { + return List.of(asTokens2); + } + return Collections.emptyList(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java new file mode 100644 index 00000000..d03a64d1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java @@ -0,0 +1,58 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** Variant strategy that merges tokens that are adjacent, where the combined token + * has a high term frequency. 
That way we match 'lawnchair' with 'lawn chair' */ +public class JoinTerms implements VariantStrategy { + private final TermFrequencyDict dict; + private final PorterStemmer ps; + + public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) { + this.dict = dict; + this.ps = ps; + } + + @Override + public Collection> constructVariants(List span) { + List> ret = new ArrayList<>(); + + for (int i = 0; i < span.size()-1; i++) { + var a = span.get(i); + var b = span.get(i+1); + + var stemmed = ps.stemWord(a.word + b.word); + + double scoreCombo = dict.getTermFreqStemmed(stemmed); + + if (scoreCombo > 10000) { + List asTokens = new ArrayList<>(); + + for (int j = 0; j < i; j++) { + var word = span.get(j).word; + asTokens.add(word); + } + { + var word = a.word + b.word; + asTokens.add(word); + } + for (int j = i+2; j < span.size(); j++) { + var word = span.get(j).word; + asTokens.add(word); + } + + ret.add(asTokens); + } + + } + + return ret; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java new file mode 100644 index 00000000..ac79476b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java @@ -0,0 +1,65 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; +import nu.marginalia.util.ngrams.NGramBloomFilter; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** Variant strategy that splits tokens at the boundary between a number and a word. 
+ */ +public class SplitWordNum implements VariantStrategy { + + + final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); + private final NGramBloomFilter nGramBloomFilter; + + public SplitWordNum(NGramBloomFilter nGramBloomFilter) { + this.nGramBloomFilter = nGramBloomFilter; + } + + @Override + public Collection> constructVariants(List ls) { + List asTokens2 = new ArrayList<>(); + + boolean num = false; + + for (var span : ls) { + var wordMatcher = numWordBoundary.matcher(span.word); + var stemmedMatcher = numWordBoundary.matcher(span.stemmed); + + int ws = 0; + int ss = 0; + boolean didSplit = false; + while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { + ws = wordMatcher.start()+1; + ss = stemmedMatcher.start()+1; + if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) + || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) + { + String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); + asTokens2.add(combined); + didSplit = true; + num = true; + } + } + + if (!didSplit) { + asTokens2.add(span.word); + } + } + + if (num) { + return List.of(asTokens2); + } + return Collections.emptyList(); + } + + private String splitAtNumBoundary(String in, int splitPoint, String joiner) { + return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index ac7ce2b2..9ac7c795 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -8,7 +8,6 @@ import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import 
nu.marginalia.util.language.EnglishDictionary; import nu.marginalia.language.WordPatterns; -import nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; @@ -37,9 +36,8 @@ public class QueryFactory { @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, - EnglishDictionary englishDictionary, - NGramBloomFilter nGramBloomFilter) { - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); + EnglishDictionary englishDictionary) { + this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary)); } @@ -79,7 +77,7 @@ public class QueryFactory { String domain = null; - var basicQuery = queryParser.parse(query); + List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { problems.add("Your search query is too long"); @@ -108,10 +106,9 @@ public class QueryFactory { for (var parts : queryPermutations) { QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts); - SearchSubquery subquery = termsAccumulator.createSubquery(); - domain = termsAccumulator.domain; + SearchSubquery subquery = termsAccumulator.createSubquery(); subqueries.add(subquery); } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java new file mode 100644 index 00000000..a88e4d63 --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import org.junit.jupiter.api.Test; + +class QWordGraphTest { + + @Test + 
public void testAddConstructor() { + QWordGraph graph = new QWordGraph("hello", "world"); + + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + System.out.println("--"); + graph.addVariant(graph.nodes().get(1), "sup"); + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println("--"); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + + graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println("--"); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + } +} \ No newline at end of file diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index fe93a1f6..4020d6e0 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -32,8 +32,7 @@ public class QueryFactoryTest { queryFactory = new QueryFactory(lm, tfd, - new EnglishDictionary(tfd), - new NGramBloomFilter(lm) + new EnglishDictionary(tfd) ); }