From 8bf7d090fd2aac4edbe41b697d37390916b726b6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 17:20:13 +0200 Subject: [PATCH] (qs) Clean up parsing code using new record matching --- .../searchquery/model/query/SearchQuery.java | 2 +- .../query_parser/ExpansionStrategy.java | 7 - .../query_parser/QueryExpansion.java | 10 + .../searchquery/query_parser/QueryParser.java | 158 +++++++++------ .../query_parser/QueryTokenizer.java | 31 +-- .../query_parser/token/QueryToken.java | 86 ++++++++ .../searchquery/query_parser/token/Token.java | 49 ----- .../query_parser/token/TokenType.java | 34 ---- .../query_parser/token/TokenVisitor.java | 14 -- .../searchquery/svc/QueryFactory.java | 183 +++++++++++++----- .../svc/QueryLimitsAccumulator.java | 93 --------- .../svc/QuerySearchTermsAccumulator.java | 105 ---------- .../util/transform_list/TransformList.java | 9 + 13 files changed, 349 insertions(+), 432 deletions(-) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index 9dd10396..ffe02868 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -72,7 +72,7 @@ public class SearchQuery { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery); + if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", "); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java deleted file mode 100644 index 20ebffd1..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; - -public interface ExpansionStrategy { - void expand(QWordGraph graph); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 6415751b..052516d8 100644 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -15,6 +15,9 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; +/** Responsible for expanding a query, that is creating alternative branches of query execution + * to increase the number of results + */ public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); private final TermFrequencyDict dict; @@ -94,6 +97,10 @@ public class QueryExpansion { } } + /** Create an alternative interpretation of the query that replaces a sequence of words + * with a word n-gram. This makes it so that when possible, the order of words in the document + * matches the order of the words in the query. + */ public void createSegments(QWordGraph graph) { List nodes = new ArrayList<>(); @@ -115,4 +122,7 @@ public class QueryExpansion { } } + public interface ExpansionStrategy { + void expand(QWordGraph graph); + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index bbaf5c87..3f92a594 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -1,8 +1,7 @@ package nu.marginalia.functions.searchquery.query_parser; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.WordPatterns; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; import nu.marginalia.util.transform_list.TransformList; import java.util.List; @@ -11,95 +10,126 @@ 
public class QueryParser { private final QueryTokenizer tokenizer = new QueryTokenizer(); - public List parse(String query) { - List basicTokens = tokenizer.tokenizeQuery(query); + public List parse(String query) { + List basicTokens = tokenizer.tokenizeQuery(query); - TransformList list = new TransformList<>(basicTokens); + TransformList list = new TransformList<>(basicTokens); list.transformEach(QueryParser::handleQuoteTokens); list.transformEach(QueryParser::trimLiterals); list.transformEachPair(QueryParser::createNegatedTerms); list.transformEachPair(QueryParser::createPriorityTerms); list.transformEach(QueryParser::handleSpecialOperations); - list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms); + list.scanAndTransform(QueryToken.LParen.class::isInstance, QueryToken.RParen.class::isInstance, QueryParser::handleAdvisoryTerms); + list.transformEach(QueryParser::normalizeDomainName); return list.getBackingList(); } - private static void handleQuoteTokens(TransformList.Entity entity) { - var t = entity.value(); - if (t.type == TokenType.QUOT) { - entity.replace(new Token(TokenType.QUOT_TERM, - t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), - t.displayStr)); - } - } - - private static void trimLiterals(TransformList.Entity entity) { + private static void normalizeDomainName(TransformList.Entity entity) { var t = entity.value(); - if (t.type == TokenType.LITERAL_TERM - && (t.str.endsWith(":") || t.str.endsWith(".")) - && t.str.length() > 1) { - entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr)); + if (!(t instanceof QueryToken.LiteralTerm)) + return; + + if (t.str().startsWith("site:")) { + entity.replace(new QueryToken.LiteralTerm(t.str().toLowerCase(), t.displayStr())); } } - private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value(); - var tn = second.value(); - - if (t.type == TokenType.MINUS && 
tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str)); - } - } - - private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value(); - var tn = second.value(); - - if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" + tn.str)); - } - } - - private static void handleSpecialOperations(TransformList.Entity entity) { + private static void handleQuoteTokens(TransformList.Entity entity) { var t = entity.value(); - if (t.type != TokenType.LITERAL_TERM) { + + if (!(t instanceof QueryToken.Quot)) { return; } - if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) { - entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr)); - } else if (t.str.startsWith("near:")) { - entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); - } else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) { - entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { - entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { - entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("qs=")) { - entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); - } else if (t.str.contains(":")) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); - } + entity.replace(new QueryToken.QuotTerm( + t.str().replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), + t.displayStr())); } - private static void handleAdvisoryTerms(TransformList.Entity entity) { + private static void trimLiterals(TransformList.Entity entity) { var t = 
entity.value(); - if (t.type == TokenType.LPAREN) { - entity.remove(); - } else if (t.type == TokenType.RPAREN) { - entity.remove(); - } else if (t.type == TokenType.LITERAL_TERM) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")")); + + if (!(t instanceof QueryToken.LiteralTerm lt)) + return; + + String str = lt.str(); + if (str.length() <= 1) // a lone ":" or "." must not be trimmed down to an empty term + return; + + if (str.endsWith(":") || str.endsWith(".")) { + entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr())); + } + + } + + private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (!(t instanceof QueryToken.Minus)) + return; + if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm)) + return; + + first.remove(); + + second.replace(new QueryToken.ExcludeTerm(tn.str(), "-" + tn.displayStr())); + } + + private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (!(t instanceof QueryToken.QMark)) + return; + if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm)) + return; + + var replacement = new QueryToken.PriorityTerm(tn.str(), "?"
+ tn.displayStr()); + + first.remove(); + second.replace(replacement); + } + + private static void handleSpecialOperations(TransformList.Entity entity) { + var t = entity.value(); + if (!(t instanceof QueryToken.LiteralTerm)) { + return; + } + + String str = t.str(); + + if (str.startsWith("q") && str.matches("q[=><]\\d+")) { + entity.replace(new QueryToken.QualityTerm(str.substring(1))); + } else if (str.startsWith("near:")) { + entity.replace(new QueryToken.NearTerm(str.substring(5))); + } else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) { + entity.replace(new QueryToken.YearTerm(str.substring(4))); + } else if (str.startsWith("size") && str.matches("size[=><]\\d+")) { + entity.replace(new QueryToken.SizeTerm(str.substring(4))); + } else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) { + entity.replace(new QueryToken.RankTerm(str.substring(4))); + } else if (str.startsWith("qs=")) { + entity.replace(new QueryToken.QsTerm(str.substring(3))); + } else if (str.contains(":")) { + entity.replace(new QueryToken.AdviceTerm(str, t.displayStr())); } } + private static void handleAdvisoryTerms(TransformList.Entity entity) { + var t = entity.value(); + if (t instanceof QueryToken.LParen) { + entity.remove(); + } else if (t instanceof QueryToken.RParen) { + entity.remove(); + } else if (t instanceof QueryToken.LiteralTerm) { + entity.replace(new QueryToken.AdviceTerm(t.str(), "(" + t.displayStr() + ")")); + } + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java index b7b0a2b7..b12d68a9 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java @@ -1,7 +1,6 @@ package 
nu.marginalia.functions.searchquery.query_parser; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.encoding.AsciiFlattener; import java.util.ArrayList; @@ -11,8 +10,8 @@ import java.util.regex.Pattern; public class QueryTokenizer { private static final Pattern noisePattern = Pattern.compile("[,\\s]"); - public List tokenizeQuery(String rawQuery) { - List tokens = new ArrayList<>(); + public List tokenizeQuery(String rawQuery) { + List tokens = new ArrayList<>(); String query = AsciiFlattener.flattenUnicode(rawQuery); query = noisePattern.matcher(query).replaceAll(" "); @@ -21,26 +20,27 @@ public class QueryTokenizer { int chr = query.charAt(i); if ('(' == chr) { - tokens.add(new Token(TokenType.LPAREN, "(", "(")); + tokens.add(new QueryToken.LParen()); } else if (')' == chr) { - tokens.add(new Token(TokenType.RPAREN, ")", ")")); + tokens.add(new QueryToken.RParen()); } else if ('"' == chr) { int end = query.indexOf('"', i+1); + if (end == -1) { end = query.length(); } - tokens.add(new Token(TokenType.QUOT, - query.substring(i+1, end).toLowerCase(), - query.substring(i, Math.min(query.length(), end+1)))); + + tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase())); + i = end; } else if ('-' == chr) { - tokens.add(new Token(TokenType.MINUS, "-")); + tokens.add(new QueryToken.Minus()); } else if ('?' 
== chr) { - tokens.add(new Token(TokenType.QMARK, "?")); + tokens.add(new QueryToken.QMark()); } else if (Character.isSpaceChar(chr)) { // @@ -52,9 +52,12 @@ public class QueryTokenizer { if (query.charAt(end) == ' ' || query.charAt(end) == ')') break; } - tokens.add(new Token(TokenType.LITERAL_TERM, - query.substring(i, end).toLowerCase(), - query.substring(i, end))); + + String displayStr = query.substring(i, end); + String str = displayStr.toLowerCase(); + + tokens.add(new QueryToken.LiteralTerm(str, displayStr)); + i = end-1; } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java new file mode 100644 index 00000000..b11fe370 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java @@ -0,0 +1,86 @@ +package nu.marginalia.functions.searchquery.query_parser.token; + + +public sealed interface QueryToken { + String str(); + String displayStr(); + + record LiteralTerm(String str, String displayStr) implements QueryToken {} + record QuotTerm(String str, String displayStr) implements QueryToken {} + record ExcludeTerm(String str, String displayStr) implements QueryToken {} + record AdviceTerm(String str, String displayStr) implements QueryToken {} + record PriorityTerm(String str, String displayStr) implements QueryToken {} + + record QualityTerm(String str) implements QueryToken { + public String displayStr() { + return "q" + str; + } + } + record YearTerm(String str) implements QueryToken { + public String displayStr() { + return "year" + str; + } + } + record SizeTerm(String str) implements QueryToken { + public String displayStr() { + return "size" + str; + } + } + record RankTerm(String str) implements QueryToken { + public String displayStr() { + return "rank" + str; + } + } + record NearTerm(String str) implements QueryToken { 
+ public String displayStr() { + return "near:" + str; + } + } + + record QsTerm(String str) implements QueryToken { + public String displayStr() { + return "qs=" + str; // str holds only the strategy name; the parser strips the "qs=" prefix + } + } + + record Quot(String str) implements QueryToken { + public String displayStr() { + return "\"" + str + "\""; + } + } + record Minus() implements QueryToken { + public String str() { + return "-"; + } + public String displayStr() { + return "-"; + } + } + record QMark() implements QueryToken { + public String str() { + return "?"; + } + public String displayStr() { + return "?"; + } + } + record LParen() implements QueryToken { + public String str() { + return "("; + } + public String displayStr() { + return "("; + } + } + record RParen() implements QueryToken { + public String str() { + return ")"; + } + public String displayStr() { + return ")"; + } + } + + record Ignore(String str, String displayStr) implements QueryToken {} + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java deleted file mode 100644 index 06c28972..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -import lombok.EqualsAndHashCode; -import lombok.ToString; -import lombok.With; - -@ToString -@EqualsAndHashCode -@With -public class Token { - public TokenType type; - public String str; - public final String displayStr; - - public Token(TokenType type, String str, String displayStr) { - this.type = type; - this.str = str; - this.displayStr = safeString(displayStr); - } - - - public Token(TokenType type, String str) { - this.type = type; - this.str = str; - this.displayStr = safeString(str); - } - - private static String safeString(String s) { - return s.replaceAll("<", "&lt;") - .replaceAll(">", "&gt;"); - }
- - public void visit(TokenVisitor visitor) { - switch (type) { - case QUOT_TERM: visitor.onQuotTerm(this); break; - case EXCLUDE_TERM: visitor.onExcludeTerm(this); break; - case PRIORTY_TERM: visitor.onPriorityTerm(this); break; - case ADVICE_TERM: visitor.onAdviceTerm(this); break; - case LITERAL_TERM: visitor.onLiteralTerm(this); break; - - case YEAR_TERM: visitor.onYearTerm(this); break; - case RANK_TERM: visitor.onRankTerm(this); break; - case SIZE_TERM: visitor.onSizeTerm(this); break; - case QS_TERM: visitor.onQsTerm(this); break; - - case QUALITY_TERM: visitor.onQualityTerm(this); break; - } - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java deleted file mode 100644 index 85d55c35..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java +++ /dev/null @@ -1,34 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -import java.util.function.Predicate; - -public enum TokenType implements Predicate { - TERM, - - - LITERAL_TERM, - QUOT_TERM, - EXCLUDE_TERM, - ADVICE_TERM, - PRIORTY_TERM, - - QUALITY_TERM, - YEAR_TERM, - SIZE_TERM, - RANK_TERM, - NEAR_TERM, - - QS_TERM, - - QUOT, - MINUS, - QMARK, - LPAREN, - RPAREN, - - IGNORE; - - public boolean test(Token t) { - return t.type == this; - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java deleted file mode 100644 index 2e14f837..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -public interface TokenVisitor { - 
void onLiteralTerm(Token token); - void onQuotTerm(Token token); - void onExcludeTerm(Token token); - void onPriorityTerm(Token token); - void onAdviceTerm(Token token); - void onYearTerm(Token token); - void onSizeTerm(Token token); - void onRankTerm(Token token); - void onQualityTerm(Token token); - void onQsTerm(Token token); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 55467b4f..26af1bf4 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -6,18 +6,19 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; @Singleton @@ -46,31 +47,89 @@ public class QueryFactory { List searchTermsHuman = new ArrayList<>(); List problems = new ArrayList<>(); - List basicQuery = 
queryParser.parse(query); + List<QueryToken> basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { problems.add("Your search query is too long"); basicQuery.clear(); } + List<String> searchTermsExclude = new ArrayList<>(); + List<String> searchTermsInclude = new ArrayList<>(); + List<String> searchTermsAdvice = new ArrayList<>(); + List<String> searchTermsPriority = new ArrayList<>(); + List<List<String>> searchTermCoherences = new ArrayList<>(); - QueryLimitsAccumulator qualityLimits = new QueryLimitsAccumulator(params); + SpecificationLimit qualityLimit = params.quality(); + SpecificationLimit year = params.year(); + SpecificationLimit size = params.size(); + SpecificationLimit rank = params.rank(); + QueryStrategy queryStrategy = QueryStrategy.AUTO; - for (Token t : basicQuery) { - if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { - if (t.str.startsWith("site:")) { - t.str = normalizeDomainName(t.str); + String domain = null; + + // Limits start from the request parameters; q/year/size/rank tokens in the query override them below. + + for (QueryToken t : basicQuery) { + switch (t) { + case QueryToken.QuotTerm(String str, String displayStr) -> { + analyzeSearchTerm(problems, str, displayStr); + searchTermsHuman.addAll(Arrays.asList(displayStr.replace("\"", "").split("\\s+"))); + + String[] parts = StringUtils.split(str, '_'); + + // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being + // required in the query (which is a problem because they are not indexed). How to do this + // in a clean way is a bit of an open problem that may not get resolved until query-parsing is + // improved.
+ + if (parts.length > 1 && !anyPartIsStopWord(parts)) { + // Prefer that the actual n-gram is present + searchTermsAdvice.add(str); + + // Require that the terms appear in the same sentence + searchTermCoherences.add(Arrays.asList(parts)); + + // Require that each term exists in the document + // (needed for ranking) + searchTermsInclude.addAll(Arrays.asList(parts)); + } + else { + searchTermsInclude.add(str); + } + } + case QueryToken.LiteralTerm(String str, String displayStr) -> { + analyzeSearchTerm(problems, str, displayStr); + searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+"))); + + searchTermsInclude.add(str); } - searchTermsHuman.addAll(toHumanSearchTerms(t)); - analyzeSearchTerm(problems, t); - } - t.visit(qualityLimits); + case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str); + case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str); + case QueryToken.AdviceTerm(String str, String displayStr) -> { + searchTermsAdvice.add(str); + + if (str.toLowerCase().startsWith("site:")) { + domain = str.substring("site:".length()); + } + } + + case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str); + case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str); + case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str); + case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str); + case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str); + + default -> {} + } } - QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); - String domain = termsAccumulator.domain; + if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { + searchTermsInclude.addAll(searchTermsAdvice); + searchTermsAdvice.clear(); + } List domainIds = params.domainIds(); @@ -80,29 +139,29 @@ public class QueryFactory { limits = limits.forSingleDomain(); } + var 
searchQuery = new SearchQuery( + queryExpansion.expandQuery( + searchTermsInclude + ), + searchTermsInclude, + searchTermsExclude, + searchTermsAdvice, + searchTermsPriority, + searchTermCoherences + ); + var specsBuilder = SearchSpecification.builder() - .query( - new SearchQuery( - queryExpansion.expandQuery( - termsAccumulator.searchTermsInclude - ), - termsAccumulator.searchTermsInclude, - termsAccumulator.searchTermsExclude, - termsAccumulator.searchTermsAdvice, - termsAccumulator.searchTermsPriority, - termsAccumulator.searchTermCoherences - ) - ) + .query(searchQuery) .humanQuery(query) - .quality(qualityLimits.qualityLimit) - .year(qualityLimits.year) - .size(qualityLimits.size) - .rank(qualityLimits.rank) + .quality(qualityLimit) + .year(year) + .size(size) + .rank(rank) .domains(domainIds) .queryLimits(limits) .searchSetIdentifier(params.identifier()) .rankingParams(ResultRankingParameters.sensibleDefaults()) - .queryStrategy(qualityLimits.queryStrategy); + .queryStrategy(queryStrategy); SearchSpecification specs = specsBuilder.build(); @@ -113,30 +172,52 @@ public class QueryFactory { return new ProcessedQuery(specs, searchTermsHuman, domain); } - private String normalizeDomainName(String str) { - return str.toLowerCase(); - } - - private List toHumanSearchTerms(Token t) { - if (t.type == TokenType.LITERAL_TERM) { - return Arrays.asList(t.displayStr.split("\\s+")); - } - else if (t.type == TokenType.QUOT_TERM) { - return Arrays.asList(t.displayStr.replace("\"", "").split("\\s+")); - - } - return Collections.emptyList(); - } - - private void analyzeSearchTerm(List problems, Token term) { - final String word = term.str; + private void analyzeSearchTerm(List problems, String str, String displayStr) { + final String word = str; if (word.length() < WordPatterns.MIN_WORD_LENGTH) { - problems.add("Search term \"" + term.displayStr + "\" too short"); + problems.add("Search term \"" + displayStr + "\" too short"); } if (!word.contains("_") && word.length() >= 
WordPatterns.MAX_WORD_LENGTH) { - problems.add("Search term \"" + term.displayStr + "\" too long"); + problems.add("Search term \"" + displayStr + "\" too long"); + } + } + private SpecificationLimit parseSpecificationLimit(String str) { + int startChar = str.charAt(0); + + int val = Integer.parseInt(str.substring(1)); + if (startChar == '=') { + return SpecificationLimit.equals(val); + } else if (startChar == '<') { + return SpecificationLimit.lessThan(val); + } else if (startChar == '>') { + return SpecificationLimit.greaterThan(val); + } else { + return SpecificationLimit.none(); } } + private QueryStrategy parseQueryStrategy(String str) { + return switch (str.toUpperCase()) { + case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; + case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; + case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; + case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; + case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; + case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; + case "SENTENCE" -> QueryStrategy.SENTENCE; + case "TOPIC" -> QueryStrategy.TOPIC; + default -> QueryStrategy.AUTO; + }; + } + + + private boolean anyPartIsStopWord(String[] parts) { + for (String part : parts) { + if (WordPatterns.isStopWord(part)) { + return true; + } + } + return false; + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java deleted file mode 100644 index 1b49bab3..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.functions.searchquery.svc; - -import nu.marginalia.api.searchquery.model.query.QueryParams; -import nu.marginalia.index.query.limit.QueryStrategy; -import nu.marginalia.index.query.limit.SpecificationLimit; -import 
nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; - -public class QueryLimitsAccumulator implements TokenVisitor { - public SpecificationLimit qualityLimit; - public SpecificationLimit year; - public SpecificationLimit size; - public SpecificationLimit rank; - - public QueryStrategy queryStrategy = QueryStrategy.AUTO; - - public QueryLimitsAccumulator(QueryParams params) { - qualityLimit = params.quality(); - year = params.year(); - size = params.size(); - rank = params.rank(); - } - - private SpecificationLimit parseSpecificationLimit(String str) { - int startChar = str.charAt(0); - - int val = Integer.parseInt(str.substring(1)); - if (startChar == '=') { - return SpecificationLimit.equals(val); - } else if (startChar == '<') { - return SpecificationLimit.lessThan(val); - } else if (startChar == '>') { - return SpecificationLimit.greaterThan(val); - } else { - return SpecificationLimit.none(); - } - } - - private QueryStrategy parseQueryStrategy(String str) { - return switch (str.toUpperCase()) { - case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; - case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; - case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; - case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; - case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; - case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; - case "SENTENCE" -> QueryStrategy.SENTENCE; - case "TOPIC" -> QueryStrategy.TOPIC; - default -> QueryStrategy.AUTO; - }; - } - - @Override - public void onYearTerm(Token token) { - year = parseSpecificationLimit(token.str); - } - - @Override - public void onSizeTerm(Token token) { - size = parseSpecificationLimit(token.str); - } - - @Override - public void onRankTerm(Token token) { - rank = parseSpecificationLimit(token.str); - } - - @Override - public void onQualityTerm(Token token) { - qualityLimit = parseSpecificationLimit(token.str); - } - - @Override 
- public void onQsTerm(Token token) { - queryStrategy = parseQueryStrategy(token.str); - } - - - @Override - public void onLiteralTerm(Token token) {} - - @Override - public void onQuotTerm(Token token) {} - - @Override - public void onExcludeTerm(Token token) {} - - @Override - public void onPriorityTerm(Token token) {} - - @Override - public void onAdviceTerm(Token token) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java deleted file mode 100644 index cc3a7e56..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.functions.searchquery.svc; - -import nu.marginalia.api.searchquery.model.query.SearchQuery; -import nu.marginalia.language.WordPatterns; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** @see SearchQuery */ -public class QuerySearchTermsAccumulator implements TokenVisitor { - public List searchTermsExclude = new ArrayList<>(); - public List searchTermsInclude = new ArrayList<>(); - public List searchTermsAdvice = new ArrayList<>(); - public List searchTermsPriority = new ArrayList<>(); - public List> searchTermCoherences = new ArrayList<>(); - - public String domain; - - public QuerySearchTermsAccumulator(List parts) { - for (Token t : parts) { - t.visit(this); - } - - if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { - searchTermsInclude.addAll(searchTermsAdvice); - searchTermsAdvice.clear(); - } - - } - - @Override - public void onLiteralTerm(Token token) { - searchTermsInclude.add(token.str); - } - - @Override - public void onQuotTerm(Token token) { - 
String[] parts = token.str.split("_"); - - // HACK (2023-05-02 vlofgren) - // - // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being - // required in the query (which is a problem because they are not indexed). How to do this - // in a clean way is a bit of an open problem that may not get resolved until query-parsing is - // improved. - - if (parts.length > 1 && !anyPartIsStopWord(parts)) { - // Prefer that the actual n-gram is present - searchTermsAdvice.add(token.str); - - // Require that the terms appear in the same sentence - searchTermCoherences.add(Arrays.asList(parts)); - - // Require that each term exists in the document - // (needed for ranking) - searchTermsInclude.addAll(Arrays.asList(parts)); - } - else { - searchTermsInclude.add(token.str); - - } - } - - private boolean anyPartIsStopWord(String[] parts) { - for (String part : parts) { - if (WordPatterns.isStopWord(part)) { - return true; - } - } - return false; - } - - @Override - public void onExcludeTerm(Token token) { - searchTermsExclude.add(token.str); - } - - @Override - public void onPriorityTerm(Token token) { - searchTermsPriority.add(token.str); - } - - @Override - public void onAdviceTerm(Token token) { - searchTermsAdvice.add(token.str); - - if (token.str.toLowerCase().startsWith("site:")) { - domain = token.str.substring("site:".length()); - } - } - - @Override - public void onYearTerm(Token token) {} - @Override - public void onSizeTerm(Token token) {} - @Override - public void onRankTerm(Token token) {} - @Override - public void onQualityTerm(Token token) {} - @Override - public void onQsTerm(Token token) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java index 08bc428e..62dd2e0a 100644 --- a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java +++ 
b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java @@ -80,6 +80,15 @@ public class TransformList { iter.remove(); } } + else if (firstEntity.action == Action.NO_OP) { + if (secondEntry.action == Action.REPLACE) { + backingList.set(iter.nextIndex(), secondEntry.value); + } + else if (secondEntry.action == Action.REMOVE) { + iter.next(); + iter.remove(); + } + } } }