mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(qs) Clean up parsing code using new record matching
This commit is contained in:
parent
c538c25008
commit
ed73d79ec1
@ -72,7 +72,7 @@ public class SearchQuery {
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery);
|
||||
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", ");
|
||||
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
|
@ -1,7 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;
|
||||
|
||||
public interface ExpansionStrategy {
|
||||
void expand(QWordGraph graph);
|
||||
}
|
@ -15,6 +15,9 @@ import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
/** Responsible for expanding a query, that is creating alternative branches of query execution
|
||||
* to increase the number of results
|
||||
*/
|
||||
public class QueryExpansion {
|
||||
private static final PorterStemmer ps = new PorterStemmer();
|
||||
private final TermFrequencyDict dict;
|
||||
@ -94,6 +97,10 @@ public class QueryExpansion {
|
||||
}
|
||||
}
|
||||
|
||||
/** Create an alternative interpretation of the query that replaces a sequence of words
|
||||
* with a word n-gram. This makes it so that when possible, the order of words in the document
|
||||
* matches the order of the words in the query.
|
||||
*/
|
||||
public void createSegments(QWordGraph graph) {
|
||||
List<QWord> nodes = new ArrayList<>();
|
||||
|
||||
@ -115,4 +122,7 @@ public class QueryExpansion {
|
||||
}
|
||||
}
|
||||
|
||||
public interface ExpansionStrategy {
|
||||
void expand(QWordGraph graph);
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,7 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
||||
import nu.marginalia.util.transform_list.TransformList;
|
||||
|
||||
import java.util.List;
|
||||
@ -11,95 +10,126 @@ public class QueryParser {
|
||||
|
||||
private final QueryTokenizer tokenizer = new QueryTokenizer();
|
||||
|
||||
public List<Token> parse(String query) {
|
||||
List<Token> basicTokens = tokenizer.tokenizeQuery(query);
|
||||
public List<QueryToken> parse(String query) {
|
||||
List<QueryToken> basicTokens = tokenizer.tokenizeQuery(query);
|
||||
|
||||
TransformList<Token> list = new TransformList<>(basicTokens);
|
||||
TransformList<QueryToken> list = new TransformList<>(basicTokens);
|
||||
|
||||
list.transformEach(QueryParser::handleQuoteTokens);
|
||||
list.transformEach(QueryParser::trimLiterals);
|
||||
list.transformEachPair(QueryParser::createNegatedTerms);
|
||||
list.transformEachPair(QueryParser::createPriorityTerms);
|
||||
list.transformEach(QueryParser::handleSpecialOperations);
|
||||
list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms);
|
||||
list.scanAndTransform(QueryToken.LParen.class::isInstance, QueryToken.RParen.class::isInstance, QueryParser::handleAdvisoryTerms);
|
||||
list.transformEach(QueryParser::normalizeDomainName);
|
||||
|
||||
return list.getBackingList();
|
||||
}
|
||||
|
||||
private static void handleQuoteTokens(TransformList<Token>.Entity entity) {
|
||||
var t = entity.value();
|
||||
if (t.type == TokenType.QUOT) {
|
||||
entity.replace(new Token(TokenType.QUOT_TERM,
|
||||
t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
|
||||
t.displayStr));
|
||||
}
|
||||
}
|
||||
|
||||
private static void trimLiterals(TransformList<Token>.Entity entity) {
|
||||
private static void normalizeDomainName(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
|
||||
if (t.type == TokenType.LITERAL_TERM
|
||||
&& (t.str.endsWith(":") || t.str.endsWith("."))
|
||||
&& t.str.length() > 1) {
|
||||
entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr));
|
||||
if (!(t instanceof QueryToken.LiteralTerm))
|
||||
return;
|
||||
|
||||
if (t.str().startsWith("site:")) {
|
||||
entity.replace(new QueryToken.LiteralTerm(t.str().toLowerCase(), t.displayStr()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void createNegatedTerms(TransformList<Token>.Entity first, TransformList<Token>.Entity second) {
|
||||
var t = first.value();
|
||||
var tn = second.value();
|
||||
|
||||
if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) {
|
||||
first.remove();
|
||||
second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str));
|
||||
}
|
||||
}
|
||||
|
||||
private static void createPriorityTerms(TransformList<Token>.Entity first, TransformList<Token>.Entity second) {
|
||||
var t = first.value();
|
||||
var tn = second.value();
|
||||
|
||||
if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) {
|
||||
first.remove();
|
||||
second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" + tn.str));
|
||||
}
|
||||
}
|
||||
|
||||
private static void handleSpecialOperations(TransformList<Token>.Entity entity) {
|
||||
private static void handleQuoteTokens(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
if (t.type != TokenType.LITERAL_TERM) {
|
||||
|
||||
if (!(t instanceof QueryToken.Quot)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) {
|
||||
entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr));
|
||||
} else if (t.str.startsWith("near:")) {
|
||||
entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr));
|
||||
} else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) {
|
||||
entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
|
||||
} else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
|
||||
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
|
||||
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
|
||||
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
|
||||
} else if (t.str.startsWith("qs=")) {
|
||||
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
|
||||
} else if (t.str.contains(":")) {
|
||||
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
|
||||
}
|
||||
entity.replace(new QueryToken.QuotTerm(
|
||||
t.str().replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
|
||||
t.displayStr()));
|
||||
}
|
||||
|
||||
private static void handleAdvisoryTerms(TransformList<Token>.Entity entity) {
|
||||
private static void trimLiterals(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
if (t.type == TokenType.LPAREN) {
|
||||
entity.remove();
|
||||
} else if (t.type == TokenType.RPAREN) {
|
||||
entity.remove();
|
||||
} else if (t.type == TokenType.LITERAL_TERM) {
|
||||
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")"));
|
||||
|
||||
if (!(t instanceof QueryToken.LiteralTerm lt))
|
||||
return;
|
||||
|
||||
String str = lt.str();
|
||||
if (str.isBlank())
|
||||
return;
|
||||
|
||||
if (str.endsWith(":") || str.endsWith(".")) {
|
||||
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void createNegatedTerms(TransformList<QueryToken>.Entity first, TransformList<QueryToken>.Entity second) {
|
||||
var t = first.value();
|
||||
var tn = second.value();
|
||||
|
||||
if (!(t instanceof QueryToken.Minus))
|
||||
return;
|
||||
if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm))
|
||||
return;
|
||||
|
||||
first.remove();
|
||||
|
||||
second.replace(new QueryToken.ExcludeTerm(tn.str(), "-" + tn.displayStr()));
|
||||
}
|
||||
|
||||
private static void createPriorityTerms(TransformList<QueryToken>.Entity first, TransformList<QueryToken>.Entity second) {
|
||||
var t = first.value();
|
||||
var tn = second.value();
|
||||
|
||||
if (!(t instanceof QueryToken.QMark))
|
||||
return;
|
||||
if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm))
|
||||
return;
|
||||
|
||||
var replacement = new QueryToken.PriorityTerm(tn.str(), "?" + tn.displayStr());
|
||||
|
||||
first.remove();
|
||||
second.replace(replacement);
|
||||
}
|
||||
|
||||
private static void handleSpecialOperations(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
if (!(t instanceof QueryToken.LiteralTerm)) {
|
||||
return;
|
||||
}
|
||||
|
||||
String str = t.str();
|
||||
|
||||
if (str.startsWith("q") && str.matches("q[=><]\\d+")) {
|
||||
entity.replace(new QueryToken.QualityTerm(str.substring(1)));
|
||||
} else if (str.startsWith("near:")) {
|
||||
entity.replace(new QueryToken.NearTerm(str.substring(5)));
|
||||
} else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) {
|
||||
entity.replace(new QueryToken.YearTerm(str.substring(4)));
|
||||
} else if (str.startsWith("size") && str.matches("size[=><]\\d+")) {
|
||||
entity.replace(new QueryToken.SizeTerm(str.substring(4)));
|
||||
} else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) {
|
||||
entity.replace(new QueryToken.RankTerm(str.substring(4)));
|
||||
} else if (str.startsWith("qs=")) {
|
||||
entity.replace(new QueryToken.QsTerm(str.substring(3)));
|
||||
} else if (str.contains(":")) {
|
||||
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
|
||||
}
|
||||
}
|
||||
|
||||
private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) {
|
||||
var t = entity.value();
|
||||
if (t instanceof QueryToken.LParen) {
|
||||
entity.remove();
|
||||
} else if (t instanceof QueryToken.RParen) {
|
||||
entity.remove();
|
||||
} else if (t instanceof QueryToken.LiteralTerm) {
|
||||
entity.replace(new QueryToken.AdviceTerm(t.str(), "(" + t.displayStr() + ")"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser;
|
||||
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import nu.marginalia.language.encoding.AsciiFlattener;
|
||||
|
||||
import java.util.ArrayList;
|
||||
@ -11,8 +10,8 @@ import java.util.regex.Pattern;
|
||||
public class QueryTokenizer {
|
||||
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
|
||||
|
||||
public List<Token> tokenizeQuery(String rawQuery) {
|
||||
List<Token> tokens = new ArrayList<>();
|
||||
public List<QueryToken> tokenizeQuery(String rawQuery) {
|
||||
List<QueryToken> tokens = new ArrayList<>();
|
||||
|
||||
String query = AsciiFlattener.flattenUnicode(rawQuery);
|
||||
query = noisePattern.matcher(query).replaceAll(" ");
|
||||
@ -21,26 +20,27 @@ public class QueryTokenizer {
|
||||
int chr = query.charAt(i);
|
||||
|
||||
if ('(' == chr) {
|
||||
tokens.add(new Token(TokenType.LPAREN, "(", "("));
|
||||
tokens.add(new QueryToken.LParen());
|
||||
}
|
||||
else if (')' == chr) {
|
||||
tokens.add(new Token(TokenType.RPAREN, ")", ")"));
|
||||
tokens.add(new QueryToken.RParen());
|
||||
}
|
||||
else if ('"' == chr) {
|
||||
int end = query.indexOf('"', i+1);
|
||||
|
||||
if (end == -1) {
|
||||
end = query.length();
|
||||
}
|
||||
tokens.add(new Token(TokenType.QUOT,
|
||||
query.substring(i+1, end).toLowerCase(),
|
||||
query.substring(i, Math.min(query.length(), end+1))));
|
||||
|
||||
tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
|
||||
|
||||
i = end;
|
||||
}
|
||||
else if ('-' == chr) {
|
||||
tokens.add(new Token(TokenType.MINUS, "-"));
|
||||
tokens.add(new QueryToken.Minus());
|
||||
}
|
||||
else if ('?' == chr) {
|
||||
tokens.add(new Token(TokenType.QMARK, "?"));
|
||||
tokens.add(new QueryToken.QMark());
|
||||
}
|
||||
else if (Character.isSpaceChar(chr)) {
|
||||
//
|
||||
@ -52,9 +52,12 @@ public class QueryTokenizer {
|
||||
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
|
||||
break;
|
||||
}
|
||||
tokens.add(new Token(TokenType.LITERAL_TERM,
|
||||
query.substring(i, end).toLowerCase(),
|
||||
query.substring(i, end)));
|
||||
|
||||
String displayStr = query.substring(i, end);
|
||||
String str = displayStr.toLowerCase();
|
||||
|
||||
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
|
||||
|
||||
i = end-1;
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,86 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.token;
|
||||
|
||||
|
||||
/** A token of a search query, either a raw lexical token (quote, minus, question mark,
 * parentheses) or a classified term.
 * <p>
 * The hierarchy is sealed, so all token kinds are the records nested in this interface
 * and can be matched with pattern matching.
 * <p>
 * NOTE(review): display strings are stored and returned verbatim; nothing in this type
 * escapes markup characters -- confirm that consumers escape before rendering.
 */
public sealed interface QueryToken {
    /** The value of the token as used by the query logic. */
    String str();

    /** A human-readable rendering of the token. */
    String displayStr();

    /** A plain search term. */
    record LiteralTerm(String str, String displayStr) implements QueryToken {}

    /** A quoted phrase after it has been processed into a single term. */
    record QuotTerm(String str, String displayStr) implements QueryToken {}

    /** A term created from a minus sign followed by a term. */
    record ExcludeTerm(String str, String displayStr) implements QueryToken {}

    /** A term created from a parenthesized literal or a literal containing a colon. */
    record AdviceTerm(String str, String displayStr) implements QueryToken {}

    /** A term created from a question mark followed by a term. */
    record PriorityTerm(String str, String displayStr) implements QueryToken {}

    /** Quality limit; {@code str} is the part after the {@code q} keyword, e.g. {@code ">5"}. */
    record QualityTerm(String str) implements QueryToken {
        @Override
        public String displayStr() {
            return "q" + str;
        }
    }

    /** Year limit; {@code str} is the part after the {@code year} keyword, e.g. {@code "=2020"}. */
    record YearTerm(String str) implements QueryToken {
        @Override
        public String displayStr() {
            return "year" + str;
        }
    }

    /** Size limit; {@code str} is the part after the {@code size} keyword. */
    record SizeTerm(String str) implements QueryToken {
        @Override
        public String displayStr() {
            return "size" + str;
        }
    }

    /** Rank limit; {@code str} is the part after the {@code rank} keyword. */
    record RankTerm(String str) implements QueryToken {
        @Override
        public String displayStr() {
            return "rank" + str;
        }
    }

    /** Near-term; {@code str} is the part after the {@code near:} prefix. */
    record NearTerm(String str) implements QueryToken {
        @Override
        public String displayStr() {
            return "near:" + str;
        }
    }

    /** Query strategy selector; {@code str} is the part after the {@code qs=} prefix. */
    record QsTerm(String str) implements QueryToken {
        @Override
        public String displayStr() {
            // The parser strips the whole "qs=" prefix (including the '='), so it
            // has to be restored in full to reproduce what the user typed.
            return "qs=" + str;
        }
    }

    /** A raw quoted section, before it is turned into a {@link QuotTerm}. */
    record Quot(String str) implements QueryToken {
        @Override
        public String displayStr() {
            return "\"" + str + "\"";
        }
    }

    /** A minus sign. */
    record Minus() implements QueryToken {
        @Override
        public String str() {
            return "-";
        }

        @Override
        public String displayStr() {
            return "-";
        }
    }

    /** A question mark. */
    record QMark() implements QueryToken {
        @Override
        public String str() {
            return "?";
        }

        @Override
        public String displayStr() {
            return "?";
        }
    }

    /** An opening parenthesis. */
    record LParen() implements QueryToken {
        @Override
        public String str() {
            return "(";
        }

        @Override
        public String displayStr() {
            return "(";
        }
    }

    /** A closing parenthesis. */
    record RParen() implements QueryToken {
        @Override
        public String str() {
            return ")";
        }

        @Override
        public String displayStr() {
            return ")";
        }
    }

    /** NOTE(review): presumably a placeholder for tokens to be skipped; no use is visible here. */
    record Ignore(String str, String displayStr) implements QueryToken {}

}
|
@ -1,49 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.token;
|
||||
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.ToString;
|
||||
import lombok.With;
|
||||
|
||||
@ToString
|
||||
@EqualsAndHashCode
|
||||
@With
|
||||
public class Token {
|
||||
public TokenType type;
|
||||
public String str;
|
||||
public final String displayStr;
|
||||
|
||||
public Token(TokenType type, String str, String displayStr) {
|
||||
this.type = type;
|
||||
this.str = str;
|
||||
this.displayStr = safeString(displayStr);
|
||||
}
|
||||
|
||||
|
||||
public Token(TokenType type, String str) {
|
||||
this.type = type;
|
||||
this.str = str;
|
||||
this.displayStr = safeString(str);
|
||||
}
|
||||
|
||||
private static String safeString(String s) {
|
||||
return s.replaceAll("<", "<")
|
||||
.replaceAll(">", ">");
|
||||
}
|
||||
|
||||
public void visit(TokenVisitor visitor) {
|
||||
switch (type) {
|
||||
case QUOT_TERM: visitor.onQuotTerm(this); break;
|
||||
case EXCLUDE_TERM: visitor.onExcludeTerm(this); break;
|
||||
case PRIORTY_TERM: visitor.onPriorityTerm(this); break;
|
||||
case ADVICE_TERM: visitor.onAdviceTerm(this); break;
|
||||
case LITERAL_TERM: visitor.onLiteralTerm(this); break;
|
||||
|
||||
case YEAR_TERM: visitor.onYearTerm(this); break;
|
||||
case RANK_TERM: visitor.onRankTerm(this); break;
|
||||
case SIZE_TERM: visitor.onSizeTerm(this); break;
|
||||
case QS_TERM: visitor.onQsTerm(this); break;
|
||||
|
||||
case QUALITY_TERM: visitor.onQualityTerm(this); break;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,34 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.token;
|
||||
|
||||
import java.util.function.Predicate;
|
||||
|
||||
public enum TokenType implements Predicate<Token> {
|
||||
TERM,
|
||||
|
||||
|
||||
LITERAL_TERM,
|
||||
QUOT_TERM,
|
||||
EXCLUDE_TERM,
|
||||
ADVICE_TERM,
|
||||
PRIORTY_TERM,
|
||||
|
||||
QUALITY_TERM,
|
||||
YEAR_TERM,
|
||||
SIZE_TERM,
|
||||
RANK_TERM,
|
||||
NEAR_TERM,
|
||||
|
||||
QS_TERM,
|
||||
|
||||
QUOT,
|
||||
MINUS,
|
||||
QMARK,
|
||||
LPAREN,
|
||||
RPAREN,
|
||||
|
||||
IGNORE;
|
||||
|
||||
public boolean test(Token t) {
|
||||
return t.type == this;
|
||||
}
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.query_parser.token;
|
||||
|
||||
public interface TokenVisitor {
|
||||
void onLiteralTerm(Token token);
|
||||
void onQuotTerm(Token token);
|
||||
void onExcludeTerm(Token token);
|
||||
void onPriorityTerm(Token token);
|
||||
void onAdviceTerm(Token token);
|
||||
void onYearTerm(Token token);
|
||||
void onSizeTerm(Token token);
|
||||
void onRankTerm(Token token);
|
||||
void onQualityTerm(Token token);
|
||||
void onQsTerm(Token token);
|
||||
}
|
@ -6,18 +6,19 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
@ -46,31 +47,89 @@ public class QueryFactory {
|
||||
List<String> searchTermsHuman = new ArrayList<>();
|
||||
List<String> problems = new ArrayList<>();
|
||||
|
||||
List<Token> basicQuery = queryParser.parse(query);
|
||||
List<QueryToken> basicQuery = queryParser.parse(query);
|
||||
|
||||
if (basicQuery.size() >= 12) {
|
||||
problems.add("Your search query is too long");
|
||||
basicQuery.clear();
|
||||
}
|
||||
|
||||
List<String> searchTermsExclude = new ArrayList<>();
|
||||
List<String> searchTermsInclude = new ArrayList<>();
|
||||
List<String> searchTermsAdvice = new ArrayList<>();
|
||||
List<String> searchTermsPriority = new ArrayList<>();
|
||||
List<List<String>> searchTermCoherences = new ArrayList<>();
|
||||
|
||||
QueryLimitsAccumulator qualityLimits = new QueryLimitsAccumulator(params);
|
||||
SpecificationLimit qualityLimit = SpecificationLimit.none();
|
||||
SpecificationLimit year = SpecificationLimit.none();
|
||||
SpecificationLimit size = SpecificationLimit.none();
|
||||
SpecificationLimit rank = SpecificationLimit.none();
|
||||
QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
||||
|
||||
for (Token t : basicQuery) {
|
||||
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
|
||||
if (t.str.startsWith("site:")) {
|
||||
t.str = normalizeDomainName(t.str);
|
||||
String domain = null;
|
||||
|
||||
System.out.println(basicQuery);
|
||||
|
||||
for (QueryToken t : basicQuery) {
|
||||
switch (t) {
|
||||
case QueryToken.QuotTerm(String str, String displayStr) -> {
|
||||
analyzeSearchTerm(problems, str, displayStr);
|
||||
searchTermsHuman.addAll(Arrays.asList(displayStr.replace("\"", "").split("\\s+")));
|
||||
|
||||
String[] parts = StringUtils.split(str, '_');
|
||||
|
||||
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
|
||||
// required in the query (which is a problem because they are not indexed). How to do this
|
||||
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
|
||||
// improved.
|
||||
|
||||
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
|
||||
// Prefer that the actual n-gram is present
|
||||
searchTermsAdvice.add(str);
|
||||
|
||||
// Require that the terms appear in the same sentence
|
||||
searchTermCoherences.add(Arrays.asList(parts));
|
||||
|
||||
// Require that each term exists in the document
|
||||
// (needed for ranking)
|
||||
searchTermsInclude.addAll(Arrays.asList(parts));
|
||||
}
|
||||
else {
|
||||
searchTermsInclude.add(str);
|
||||
}
|
||||
}
|
||||
case QueryToken.LiteralTerm(String str, String displayStr) -> {
|
||||
analyzeSearchTerm(problems, str, displayStr);
|
||||
searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+")));
|
||||
|
||||
searchTermsInclude.add(str);
|
||||
}
|
||||
|
||||
searchTermsHuman.addAll(toHumanSearchTerms(t));
|
||||
analyzeSearchTerm(problems, t);
|
||||
}
|
||||
|
||||
t.visit(qualityLimits);
|
||||
case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str);
|
||||
case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str);
|
||||
case QueryToken.AdviceTerm(String str, String displayStr) -> {
|
||||
searchTermsAdvice.add(str);
|
||||
|
||||
if (str.toLowerCase().startsWith("site:")) {
|
||||
domain = str.substring("site:".length());
|
||||
}
|
||||
}
|
||||
|
||||
case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str);
|
||||
case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str);
|
||||
case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str);
|
||||
case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str);
|
||||
case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str);
|
||||
|
||||
default -> {}
|
||||
}
|
||||
}
|
||||
|
||||
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
|
||||
String domain = termsAccumulator.domain;
|
||||
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
|
||||
searchTermsInclude.addAll(searchTermsAdvice);
|
||||
searchTermsAdvice.clear();
|
||||
}
|
||||
|
||||
List<Integer> domainIds = params.domainIds();
|
||||
|
||||
@ -80,29 +139,29 @@ public class QueryFactory {
|
||||
limits = limits.forSingleDomain();
|
||||
}
|
||||
|
||||
var searchQuery = new SearchQuery(
|
||||
queryExpansion.expandQuery(
|
||||
searchTermsInclude
|
||||
),
|
||||
searchTermsInclude,
|
||||
searchTermsExclude,
|
||||
searchTermsAdvice,
|
||||
searchTermsPriority,
|
||||
searchTermCoherences
|
||||
);
|
||||
|
||||
var specsBuilder = SearchSpecification.builder()
|
||||
.query(
|
||||
new SearchQuery(
|
||||
queryExpansion.expandQuery(
|
||||
termsAccumulator.searchTermsInclude
|
||||
),
|
||||
termsAccumulator.searchTermsInclude,
|
||||
termsAccumulator.searchTermsExclude,
|
||||
termsAccumulator.searchTermsAdvice,
|
||||
termsAccumulator.searchTermsPriority,
|
||||
termsAccumulator.searchTermCoherences
|
||||
)
|
||||
)
|
||||
.query(searchQuery)
|
||||
.humanQuery(query)
|
||||
.quality(qualityLimits.qualityLimit)
|
||||
.year(qualityLimits.year)
|
||||
.size(qualityLimits.size)
|
||||
.rank(qualityLimits.rank)
|
||||
.quality(qualityLimit)
|
||||
.year(year)
|
||||
.size(size)
|
||||
.rank(rank)
|
||||
.domains(domainIds)
|
||||
.queryLimits(limits)
|
||||
.searchSetIdentifier(params.identifier())
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.queryStrategy(qualityLimits.queryStrategy);
|
||||
.queryStrategy(queryStrategy);
|
||||
|
||||
SearchSpecification specs = specsBuilder.build();
|
||||
|
||||
@ -113,30 +172,52 @@ public class QueryFactory {
|
||||
return new ProcessedQuery(specs, searchTermsHuman, domain);
|
||||
}
|
||||
|
||||
private String normalizeDomainName(String str) {
|
||||
return str.toLowerCase();
|
||||
}
|
||||
|
||||
private List<String> toHumanSearchTerms(Token t) {
|
||||
if (t.type == TokenType.LITERAL_TERM) {
|
||||
return Arrays.asList(t.displayStr.split("\\s+"));
|
||||
}
|
||||
else if (t.type == TokenType.QUOT_TERM) {
|
||||
return Arrays.asList(t.displayStr.replace("\"", "").split("\\s+"));
|
||||
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
private void analyzeSearchTerm(List<String> problems, Token term) {
|
||||
final String word = term.str;
|
||||
private void analyzeSearchTerm(List<String> problems, String str, String displayStr) {
|
||||
final String word = str;
|
||||
|
||||
if (word.length() < WordPatterns.MIN_WORD_LENGTH) {
|
||||
problems.add("Search term \"" + term.displayStr + "\" too short");
|
||||
problems.add("Search term \"" + displayStr + "\" too short");
|
||||
}
|
||||
if (!word.contains("_") && word.length() >= WordPatterns.MAX_WORD_LENGTH) {
|
||||
problems.add("Search term \"" + term.displayStr + "\" too long");
|
||||
problems.add("Search term \"" + displayStr + "\" too long");
|
||||
}
|
||||
}
|
||||
private SpecificationLimit parseSpecificationLimit(String str) {
|
||||
int startChar = str.charAt(0);
|
||||
|
||||
int val = Integer.parseInt(str.substring(1));
|
||||
if (startChar == '=') {
|
||||
return SpecificationLimit.equals(val);
|
||||
} else if (startChar == '<') {
|
||||
return SpecificationLimit.lessThan(val);
|
||||
} else if (startChar == '>') {
|
||||
return SpecificationLimit.greaterThan(val);
|
||||
} else {
|
||||
return SpecificationLimit.none();
|
||||
}
|
||||
}
|
||||
|
||||
private QueryStrategy parseQueryStrategy(String str) {
|
||||
return switch (str.toUpperCase()) {
|
||||
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
|
||||
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
|
||||
case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
|
||||
case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
|
||||
case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK;
|
||||
case "SENTENCE" -> QueryStrategy.SENTENCE;
|
||||
case "TOPIC" -> QueryStrategy.TOPIC;
|
||||
default -> QueryStrategy.AUTO;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
private boolean anyPartIsStopWord(String[] parts) {
|
||||
for (String part : parts) {
|
||||
if (WordPatterns.isStopWord(part)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -1,93 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.svc;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
|
||||
|
||||
public class QueryLimitsAccumulator implements TokenVisitor {
|
||||
public SpecificationLimit qualityLimit;
|
||||
public SpecificationLimit year;
|
||||
public SpecificationLimit size;
|
||||
public SpecificationLimit rank;
|
||||
|
||||
public QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
||||
|
||||
public QueryLimitsAccumulator(QueryParams params) {
|
||||
qualityLimit = params.quality();
|
||||
year = params.year();
|
||||
size = params.size();
|
||||
rank = params.rank();
|
||||
}
|
||||
|
||||
private SpecificationLimit parseSpecificationLimit(String str) {
|
||||
int startChar = str.charAt(0);
|
||||
|
||||
int val = Integer.parseInt(str.substring(1));
|
||||
if (startChar == '=') {
|
||||
return SpecificationLimit.equals(val);
|
||||
} else if (startChar == '<') {
|
||||
return SpecificationLimit.lessThan(val);
|
||||
} else if (startChar == '>') {
|
||||
return SpecificationLimit.greaterThan(val);
|
||||
} else {
|
||||
return SpecificationLimit.none();
|
||||
}
|
||||
}
|
||||
|
||||
private QueryStrategy parseQueryStrategy(String str) {
|
||||
return switch (str.toUpperCase()) {
|
||||
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
|
||||
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
|
||||
case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
|
||||
case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
|
||||
case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK;
|
||||
case "SENTENCE" -> QueryStrategy.SENTENCE;
|
||||
case "TOPIC" -> QueryStrategy.TOPIC;
|
||||
default -> QueryStrategy.AUTO;
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onYearTerm(Token token) {
|
||||
year = parseSpecificationLimit(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onSizeTerm(Token token) {
|
||||
size = parseSpecificationLimit(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onRankTerm(Token token) {
|
||||
rank = parseSpecificationLimit(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onQualityTerm(Token token) {
|
||||
qualityLimit = parseSpecificationLimit(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onQsTerm(Token token) {
|
||||
queryStrategy = parseQueryStrategy(token.str);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void onLiteralTerm(Token token) {}
|
||||
|
||||
@Override
|
||||
public void onQuotTerm(Token token) {}
|
||||
|
||||
@Override
|
||||
public void onExcludeTerm(Token token) {}
|
||||
|
||||
@Override
|
||||
public void onPriorityTerm(Token token) {}
|
||||
|
||||
@Override
|
||||
public void onAdviceTerm(Token token) {}
|
||||
}
|
@ -1,105 +0,0 @@
|
||||
package nu.marginalia.functions.searchquery.svc;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.Token;
|
||||
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/** @see SearchQuery */
|
||||
public class QuerySearchTermsAccumulator implements TokenVisitor {
|
||||
public List<String> searchTermsExclude = new ArrayList<>();
|
||||
public List<String> searchTermsInclude = new ArrayList<>();
|
||||
public List<String> searchTermsAdvice = new ArrayList<>();
|
||||
public List<String> searchTermsPriority = new ArrayList<>();
|
||||
public List<List<String>> searchTermCoherences = new ArrayList<>();
|
||||
|
||||
public String domain;
|
||||
|
||||
public QuerySearchTermsAccumulator(List<Token> parts) {
|
||||
for (Token t : parts) {
|
||||
t.visit(this);
|
||||
}
|
||||
|
||||
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
|
||||
searchTermsInclude.addAll(searchTermsAdvice);
|
||||
searchTermsAdvice.clear();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onLiteralTerm(Token token) {
|
||||
searchTermsInclude.add(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onQuotTerm(Token token) {
|
||||
String[] parts = token.str.split("_");
|
||||
|
||||
// HACK (2023-05-02 vlofgren)
|
||||
//
|
||||
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
|
||||
// required in the query (which is a problem because they are not indexed). How to do this
|
||||
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
|
||||
// improved.
|
||||
|
||||
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
|
||||
// Prefer that the actual n-gram is present
|
||||
searchTermsAdvice.add(token.str);
|
||||
|
||||
// Require that the terms appear in the same sentence
|
||||
searchTermCoherences.add(Arrays.asList(parts));
|
||||
|
||||
// Require that each term exists in the document
|
||||
// (needed for ranking)
|
||||
searchTermsInclude.addAll(Arrays.asList(parts));
|
||||
}
|
||||
else {
|
||||
searchTermsInclude.add(token.str);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private boolean anyPartIsStopWord(String[] parts) {
|
||||
for (String part : parts) {
|
||||
if (WordPatterns.isStopWord(part)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onExcludeTerm(Token token) {
|
||||
searchTermsExclude.add(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onPriorityTerm(Token token) {
|
||||
searchTermsPriority.add(token.str);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onAdviceTerm(Token token) {
|
||||
searchTermsAdvice.add(token.str);
|
||||
|
||||
if (token.str.toLowerCase().startsWith("site:")) {
|
||||
domain = token.str.substring("site:".length());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onYearTerm(Token token) {}
|
||||
@Override
|
||||
public void onSizeTerm(Token token) {}
|
||||
@Override
|
||||
public void onRankTerm(Token token) {}
|
||||
@Override
|
||||
public void onQualityTerm(Token token) {}
|
||||
@Override
|
||||
public void onQsTerm(Token token) {}
|
||||
}
|
@ -80,6 +80,15 @@ public class TransformList<T> {
|
||||
iter.remove();
|
||||
}
|
||||
}
|
||||
else if (firstEntity.action == Action.NO_OP) {
|
||||
if (secondEntry.action == Action.REPLACE) {
|
||||
backingList.set(iter.nextIndex(), secondEntry.value);
|
||||
}
|
||||
else if (secondEntry.action == Action.REMOVE) {
|
||||
iter.next();
|
||||
iter.remove();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user