From a4b810f51102b9f20d0c7c85b5e5f82bc4fbe5c2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Mar 2024 12:00:52 +0100 Subject: [PATCH] WIP --- .../query_parser/ExpansionStrategy.java | 7 + .../{variant => }/QueryExpansion.java | 8 +- .../query_parser/QueryPermutation.java | 229 ------------------ .../query_parser/QueryVariants.java | 207 ---------------- .../{variant => }/model/QWord.java | 2 +- .../{variant => }/model/QWordGraph.java | 46 +++- .../variant/ExpansionStrategy.java | 7 - .../query_parser/variant/QueryVariant.java | 17 -- .../query_parser/variant/QueryVariantSet.java | 21 -- .../query_parser/variant/QueryWord.java | 10 - .../query_parser/variant/VariantStrategy.java | 8 - .../searchquery/svc/QueryFactory.java | 50 +--- .../{variant => }/model/QWordGraphTest.java | 6 +- 13 files changed, 68 insertions(+), 550 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/QueryExpansion.java (93%) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWord.java (94%) rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWordGraph.java (82%) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java rename code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWordGraphTest.java (83%) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java new file mode 100644 index 00000000..20ebffd1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.functions.searchquery.query_parser; + +import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; + +public interface ExpansionStrategy { + void expand(QWordGraph graph); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java similarity index 93% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 820a9022..c216918e 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -1,9 +1,9 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; +package nu.marginalia.functions.searchquery.query_parser; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; +import nu.marginalia.functions.searchquery.query_parser.model.QWord; +import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; @@ -40,7 +40,7 @@ public class QueryExpansion { strategy.expand(graph); } - return null; + return graph; } private static final Pattern dashPattern = Pattern.compile("-"); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java deleted file mode 100644 index 417ceda3..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java +++ /dev/null @@ -1,229 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.language.WordPatterns; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.function.Predicate; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static java.util.stream.Stream.concat; - -public class QueryPermutation { - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final QueryVariants queryVariants; - - public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); - public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?"); - - public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); - - public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); - public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); - - public QueryPermutation(QueryVariants queryVariants) { - this.queryVariants = queryVariants; - } - - public List> permuteQueries(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start > 1) { - List> permuteParts = combineSearchTerms(items.subList(start, end)); - int s = start; - int e = end; - return permuteParts.stream().map(part -> - concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream())) - .collect(Collectors.toList())) - .peek(lst -> lst.removeIf(this::isJunkWord)) - .limit(24) - .collect(Collectors.toList()); - } - else { - return List.of(items); - } - } - - - public List> permuteQueriesNew(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start >= 1) { - var result = queryVariants.getQueryVariants(items.subList(start, end)); - - logger.debug("{}", result); - - if (result.isEmpty()) { - logger.warn("Empty variants result, falling back on old code"); - return permuteQueries(items); - } - - List> queryVariants = new ArrayList<>(); - for (var query : result.faithful) { - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - for (var query : result.alternative) { - if (queryVariants.size() >= 6) - break; - - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - - List> returnValue = new ArrayList<>(queryVariants.size()); - for (var variant: queryVariants) { - List r = new ArrayList<>(start + variant.size() + (items.size() - end)); - r.addAll(items.subList(0, start)); - r.addAll(variant); - r.addAll(items.subList(end, items.size())); - returnValue.add(r); - } - - return returnValue; - - } - else { - return List.of(items); - } - } - - private boolean isJunkWord(Token token) { - if (WordPatterns.isStopWord(token.str) && - !token.str.matches("^(\\d+|([a-z]+:.*))$")) { - return true; - } - return switch (token.str) { - case "vs", "versus", "or", "and" -> true; - default -> false; - }; - } - - private List> combineSearchTerms(List subList) { - int size = subList.size(); - if (size < 1) { - return Collections.emptyList(); - } - else if (size == 1) { - if (WordPatterns.isStopWord(subList.get(0).str)) { - return Collections.emptyList(); - } - return List.of(subList); - } - - List> results = new ArrayList<>(size*(size+1)/2); - - if (subList.size() <= 4 && subList.get(0).str.length() >= 2 && !isPrefixWord(subList.get(subList.size()-1).str)) { - results.add(List.of(joinTokens(subList))); - } - outer: for (int i = size - 1; i >= 1; i--) { - - var left = combineSearchTerms(subList.subList(0, i)); - var right = combineSearchTerms(subList.subList(i, size)); - - for (var l : left) { - if (results.size() > 48) { - break outer; - } - - for (var r : right) { - if (results.size() > 48) { - break outer; - } - - List combined = new ArrayList<>(l.size() + r.size()); - combined.addAll(l); - combined.addAll(r); - if (!results.contains(combined)) { - results.add(combined); - } - } - } - } - if (!results.contains(subList)) { - results.add(subList); - } - Comparator> tc = (o1, o2) -> { - int dJoininess = o2.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum(); - if (dJoininess == 0) { - return (o2.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum()); - } - return (int) Math.signum(dJoininess); - }; - results.sort(tc); - return results; - } - - private boolean isPrefixWord(String str) { - return switch (str) { - case "the", "of", "when" -> true; - default -> false; - }; - } - - int joininess(String s) { - return (int) s.chars().filter(c -> c == '_').count(); - } - int rightiness(String s) { - int rightiness = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '_') { - rightiness+=i; - } - } - return rightiness; - } - - private Token joinTokens(List subList) { - return new Token(TokenType.LITERAL_TERM, - subList.stream().map(t -> t.str).collect(Collectors.joining("_")), - subList.stream().map(t -> t.str).collect(Collectors.joining(" "))); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java deleted file mode 100644 index 10648486..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java +++ /dev/null @@ -1,207 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.util.language.EnglishDictionary; -import nu.marginalia.LanguageModels; -import nu.marginalia.keyword.KeywordExtractor; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import nu.marginalia.language.model.DocumentSentence; -import nu.marginalia.language.model.WordSpan; - -import java.util.*; -import java.util.regex.Pattern; - -public class QueryVariants { - private final KeywordExtractor keywordExtractor; - private final TermFrequencyDict dict; - - private final EnglishDictionary englishDictionary; - private final ThreadLocal sentenceExtractor; - - public QueryVariants(LanguageModels lm, - TermFrequencyDict dict, - EnglishDictionary englishDictionary) { - this.englishDictionary = englishDictionary; - this.keywordExtractor = new KeywordExtractor(); - this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); - this.dict = dict; - } - - - - public QueryVariantSet getQueryVariants(List query) { - final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); - - final TreeMap> byStart = new TreeMap<>(); - - var se = sentenceExtractor.get(); - var sentence = se.extractSentence(joinedQuery.joinedQuery); - - for (int i = 0; i < sentence.posTags.length; i++) { - if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) { - sentence.posTags[i] = "NNP"; - } - else if ("JJ".equals(sentence.posTags[i]) || "CD".equals(sentence.posTags[i]) || sentence.posTags[i].startsWith("P")) { - sentence.posTags[i] = "NNP"; - sentence.setIsStopWord(i, false); - } - } - - for (var kw : keywordExtractor.getKeywordsFromSentence(sentence)) { - byStart.computeIfAbsent(kw.start, k -> new ArrayList<>()).add(kw); - } - - final List> livingSpans = new ArrayList<>(); - - var first = byStart.firstEntry(); - if (first == null) { - var span = new WordSpan(0, sentence.length()); - byStart.put(0, List.of(span)); - } - else if (first.getKey() > 0) { - List elongatedFirstWords = new ArrayList<>(first.getValue().size()); - - first.getValue().forEach(span -> { - elongatedFirstWords.add(new WordSpan(0, span.start)); - elongatedFirstWords.add(new WordSpan(0, span.end)); - }); - - byStart.put(0, elongatedFirstWords); - } - - final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); - - List> faithfulQueries = new ArrayList<>(); - List> alternativeQueries = new ArrayList<>(); - - for (var ls : goodSpans) { - var last = ls.get(ls.size() - 1); - - if (!last.wordOriginal.isBlank() && !Character.isUpperCase(last.wordOriginal.charAt(0))) { - var altLast = englishDictionary.getWordVariants(last.word); - for (String s : altLast) { - List newList = new ArrayList<>(ls.size()); - for (int i = 0; i < ls.size() - 1; i++) { - newList.add(ls.get(i).word); - } - newList.add(s); - alternativeQueries.add(newList); - } - } - - } - - QueryVariantSet returnValue = new QueryVariantSet(); - - returnValue.faithful.addAll(evaluateQueries(faithfulQueries)); - returnValue.alternative.addAll(evaluateQueries(alternativeQueries)); - - returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); - returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue)); - - returnValue.nonLiterals.addAll(joinedQuery.nonLiterals); - - return returnValue; - } - - final Pattern underscore = Pattern.compile("_"); - - private List evaluateQueries(List> queryStrings) { - Set variantsSet = new HashSet<>(); - List ret = new ArrayList<>(); - for (var lst : queryStrings) { - double q = 0; - for (var word : lst) { - String[] parts = underscore.split(word); - double qp = 0; - for (String part : parts) { - qp += 1./(1+ dict.getTermFreq(part)); - } - q += 1.0 / qp; - } - var qv = new QueryVariant(lst, q); - if (variantsSet.add(qv)) { - ret.add(qv); - } - } - return ret; - } - - private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { - List> goodSpans = new ArrayList<>(); - for (int i = 0; i < 1; i++) { - var spans = byStart.get(i); - - - if (spans == null ) - continue; - - for (var span : spans) { - ArrayList fragment = new ArrayList<>(); - fragment.add(span); - livingSpans.add(fragment); - } - - if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) break; - } - - - while (!livingSpans.isEmpty()) { - - final List> newLivingSpans = new ArrayList<>(livingSpans.size()); - - for (var span : livingSpans) { - int end = span.get(span.size()-1).end; - - if (end == sentence.length()) { - var gs = new ArrayList(span.size()); - for (var s : span) { - gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), - s.size() == 1 ? sentence.words[s.start] : "")); - } - goodSpans.add(gs); - } - var nextWordsKey = byStart.ceilingKey(end); - - if (null == nextWordsKey) - continue; - - for (var next : byStart.get(nextWordsKey)) { - var newSpan = new ArrayList(span.size() + 1); - newSpan.addAll(span); - newSpan.add(next); - newLivingSpans.add(newSpan); - } - } - - livingSpans.clear(); - livingSpans.addAll(newLivingSpans); - } - - return goodSpans; - } - - - private JoinedQueryAndNonLiteralTokens joinQuery(List query) { - StringJoiner s = new StringJoiner(" "); - List leftovers = new ArrayList<>(5); - - for (var t : query) { - if (t.type == TokenType.LITERAL_TERM) { - s.add(t.displayStr); - } - else { - leftovers.add(t); - } - } - - return new JoinedQueryAndNonLiteralTokens(s.toString(), leftovers); - } - - record JoinedQueryAndNonLiteralTokens(String joinedQuery, List nonLiterals) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java similarity index 94% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java index 07f65c95..b7c4e594 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import ca.rmen.porterstemmer.PorterStemmer; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java similarity index 82% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index f9902733..474c4788 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import org.jetbrains.annotations.NotNull; @@ -15,8 +15,7 @@ import java.util.stream.Stream; public class QWordGraph implements Iterable { - public record QWordGraphLink(QWord from, QWord to) { - } + public record QWordGraphLink(QWord from, QWord to) {} private final List links = new ArrayList<>(); private final Map> fromTo = new HashMap<>(); @@ -121,8 +120,6 @@ public class QWordGraph implements Iterable { // understanding which vertexes can be re-ordered without changing // the semantics of the encoded query. public boolean isBypassed(QWord word, QWord begin, QWord end) { - assert word.isOriginal() : "Can only bypass original words"; - Set edge = new HashSet<>(); Set visited = new HashSet<>(); @@ -163,6 +160,7 @@ public class QWordGraph implements Iterable { List edge = new ArrayList<>(); List visited = new ArrayList<>(); + visited.add(begin); edge.add(begin); while (!edge.isEmpty()) { @@ -172,7 +170,9 @@ public class QWordGraph implements Iterable { if (Objects.equals(w, end)) continue; - assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex"; + if (w.isEnd()) { + assert end.isEnd() : "Graph has a path beyond the specified end vertex " + end; + } next.addAll(getNext(w)); } @@ -182,7 +182,7 @@ public class QWordGraph implements Iterable { edge = next; } - return visited; + return visited.stream().distinct().toList(); } /** Returns a list of subgraphs that are connected on the path from @@ -201,7 +201,7 @@ public class QWordGraph implements Iterable { List points = nodesBetween(begin, end) .stream() - .filter(w -> isBypassed(w, begin, end)) + .filter(w -> !isBypassed(w, begin, end)) .toList(); for (int i = 0; i < points.size() - 1; i++) { @@ -214,6 +214,36 @@ public class QWordGraph implements Iterable { return subgraphs; } + public String compileToQuery() { + return compileToQuery(QWord.beg(), QWord.end()); + } + + public String compileToQuery(QWord begin, QWord end) { + StringJoiner sj = new StringJoiner(" "); + + for (var subgraph : getSubgraphs(begin, end)) { + if (getNext(subgraph.from).equals(List.of(subgraph.to))) { + if (subgraph.from.isBeg()) + continue; + + sj.add(subgraph.from.word()); + } + else { + StringJoiner branchJoiner = new StringJoiner(" | ", "( ", " )"); + if (Objects.equals(subgraph.from, begin)) { + for (QWord path : getNext(subgraph.from)) { + branchJoiner.add(compileToQuery(path, subgraph.to)); + } + } + else { + branchJoiner.add(compileToQuery(subgraph.from, subgraph.to)); + } + sj.add(branchJoiner.toString()); + } + } + + return sj.toString(); + } @NotNull @Override diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java deleted file mode 100644 index 18987aea..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; - -public interface ExpansionStrategy { - void expand(QWordGraph graph); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java deleted file mode 100644 index 8d24387b..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; - -import java.util.List; - -@AllArgsConstructor -@Getter -@ToString -@EqualsAndHashCode -public class QueryVariant { - public final List terms; - public final double value; -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java deleted file mode 100644 index b01fbd5e..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.Getter; -import lombok.ToString; -import nu.marginalia.functions.searchquery.query_parser.token.Token; - -import java.util.ArrayList; -import java.util.List; - -@Getter -@ToString -public class QueryVariantSet { - public final List faithful = new ArrayList<>(); - public final List alternative = new ArrayList<>(); - - public final List nonLiterals = new ArrayList<>(); - - public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java deleted file mode 100644 index 9c158a43..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.AllArgsConstructor; - -@AllArgsConstructor -public class QueryWord { - public final String stemmed; - public final String word; - public final String wordOriginal; -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java deleted file mode 100644 index 2c1a5bfb..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import java.util.Collection; -import java.util.List; - -public interface VariantStrategy { - Collection> constructVariants(List ls); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 9ac7c795..3c0e5219 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -11,8 +11,6 @@ import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; -import nu.marginalia.functions.searchquery.query_parser.QueryPermutation; -import nu.marginalia.functions.searchquery.query_parser.QueryVariants; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; import nu.marginalia.term_frequency_dict.TermFrequencyDict; @@ -29,43 +27,19 @@ public class QueryFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); private static final int RETAIN_QUERY_VARIANT_COUNT = 5; - private final ThreadLocal queryVariants; private final QueryParser queryParser = new QueryParser(); @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, - EnglishDictionary englishDictionary) { - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary)); + EnglishDictionary englishDictionary) + { } - public QueryPermutation getQueryPermutation() { - return new QueryPermutation(queryVariants.get()); - } public ProcessedQuery createQuery(QueryParams params) { - final var processedQuery = createQuery(getQueryPermutation(), params); - final List subqueries = processedQuery.specs.subqueries; - - // There used to be a piece of logic here that would try to figure out which one of these subqueries were the "best", - // it's gone for the moment, but it would be neat if it resurrected somehow - - trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT); - - return processedQuery; - } - - private void trimArray(List arr, int maxSize) { - if (arr.size() > maxSize) { - arr.subList(0, arr.size() - maxSize).clear(); - } - } - - public ProcessedQuery createQuery(QueryPermutation queryPermutation, - QueryParams params) - { final var query = params.humanQuery(); if (query.length() > 1000) { @@ -100,17 +74,19 @@ public class QueryFactory { t.visit(qualityLimits); } - var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); +// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); List subqueries = new ArrayList<>(); + QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); + domain = termsAccumulator.domain; - for (var parts : queryPermutations) { - QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts); - - domain = termsAccumulator.domain; - - SearchSubquery subquery = termsAccumulator.createSubquery(); - subqueries.add(subquery); - } +// for (var parts : queryPermutations) { +// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); +// +// domain = termsAccumulator.domain; +// +// SearchSubquery subquery = termsAccumulator.createSubquery(); +// subqueries.add(subquery); +// } List domainIds = params.domainIds(); diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java similarity index 83% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java rename to code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index a88e4d63..bd16b3cb 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import org.junit.jupiter.api.Test; @@ -10,11 +10,13 @@ class QWordGraphTest { System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println(graph.compileToQuery()); graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); System.out.println("--"); graph.addVariant(graph.nodes().get(1), "sup"); + System.out.println(graph.compileToQuery()); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); @@ -23,6 +25,8 @@ class QWordGraphTest { graph.nodes().forEach(System.out::println); graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); + graph.addVariant(graph.nodes().get(2), "globe"); + System.out.println(graph.compileToQuery()); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--");