(qs) Clean up parsing code using new record matching

Viktor Lofgren 2024-04-11 17:20:13 +02:00
parent 6bfe04b609
commit 8bf7d090fd
13 changed files with 349 additions and 432 deletions

View File

@@ -72,7 +72,7 @@ public class SearchQuery {
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery);
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", ");
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));

View File

@@ -1,7 +0,0 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;
public interface ExpansionStrategy {
void expand(QWordGraph graph);
}

View File

@@ -15,6 +15,9 @@ import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/** Responsible for expanding a query, that is, creating alternative branches of query execution
 * to increase the number of results.
 */
public class QueryExpansion {
private static final PorterStemmer ps = new PorterStemmer();
private final TermFrequencyDict dict;
@@ -94,6 +97,10 @@ public class QueryExpansion {
}
}
/** Create an alternative interpretation of the query that replaces a sequence of words
 * with a word n-gram, so that, where possible, the order of the words in the document
 * matches the order of the words in the query.
 */
public void createSegments(QWordGraph graph) {
List<QWord> nodes = new ArrayList<>();
@@ -115,4 +122,7 @@
}
}
public interface ExpansionStrategy {
void expand(QWordGraph graph);
}
}
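
The nested ExpansionStrategy has a single abstract method, so concrete strategies can be supplied as lambdas or method references. A minimal sketch, assuming the interface is referenced as QueryExpansion.ExpansionStrategy from outside the class (the no-op strategy itself is hypothetical):

import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph;

class ExpansionStrategyDemo {
    // Hypothetical strategy: a real one would add alternative word paths
    // (synonyms, split/joined variants, n-grams) to the graph in place.
    static final QueryExpansion.ExpansionStrategy NO_OP = (QWordGraph graph) -> {
    };
}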

View File

@@ -1,8 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
import nu.marginalia.util.transform_list.TransformList;
import java.util.List;
@@ -11,95 +10,126 @@ public class QueryParser {
private final QueryTokenizer tokenizer = new QueryTokenizer();
public List<Token> parse(String query) {
List<Token> basicTokens = tokenizer.tokenizeQuery(query);
public List<QueryToken> parse(String query) {
List<QueryToken> basicTokens = tokenizer.tokenizeQuery(query);
TransformList<Token> list = new TransformList<>(basicTokens);
TransformList<QueryToken> list = new TransformList<>(basicTokens);
list.transformEach(QueryParser::handleQuoteTokens);
list.transformEach(QueryParser::trimLiterals);
list.transformEachPair(QueryParser::createNegatedTerms);
list.transformEachPair(QueryParser::createPriorityTerms);
list.transformEach(QueryParser::handleSpecialOperations);
list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms);
list.scanAndTransform(QueryToken.LParen.class::isInstance, QueryToken.RParen.class::isInstance, QueryParser::handleAdvisoryTerms);
list.transformEach(QueryParser::normalizeDomainName);
return list.getBackingList();
}
private static void handleQuoteTokens(TransformList<Token>.Entity entity) {
var t = entity.value();
if (t.type == TokenType.QUOT) {
entity.replace(new Token(TokenType.QUOT_TERM,
t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
t.displayStr));
}
}
private static void trimLiterals(TransformList<Token>.Entity entity) {
private static void normalizeDomainName(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (t.type == TokenType.LITERAL_TERM
&& (t.str.endsWith(":") || t.str.endsWith("."))
&& t.str.length() > 1) {
entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr));
if (!(t instanceof QueryToken.LiteralTerm))
return;
if (t.str().startsWith("site:")) {
entity.replace(new QueryToken.LiteralTerm(t.str().toLowerCase(), t.displayStr()));
}
}
private static void createNegatedTerms(TransformList<Token>.Entity first, TransformList<Token>.Entity second) {
var t = first.value();
var tn = second.value();
if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) {
first.remove();
second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str));
}
}
private static void createPriorityTerms(TransformList<Token>.Entity first, TransformList<Token>.Entity second) {
var t = first.value();
var tn = second.value();
if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) {
first.remove();
second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" + tn.str));
}
}
private static void handleSpecialOperations(TransformList<Token>.Entity entity) {
private static void handleQuoteTokens(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (t.type != TokenType.LITERAL_TERM) {
if (!(t instanceof QueryToken.Quot)) {
return;
}
if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) {
entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr));
} else if (t.str.startsWith("near:")) {
entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr));
} else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) {
entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("qs=")) {
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
} else if (t.str.contains(":")) {
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
}
entity.replace(new QueryToken.QuotTerm(
t.str().replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER),
t.displayStr()));
}
private static void handleAdvisoryTerms(TransformList<Token>.Entity entity) {
private static void trimLiterals(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (t.type == TokenType.LPAREN) {
entity.remove();
} else if (t.type == TokenType.RPAREN) {
entity.remove();
} else if (t.type == TokenType.LITERAL_TERM) {
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")"));
if (!(t instanceof QueryToken.LiteralTerm lt))
return;
String str = lt.str();
if (str.isBlank())
return;
if (str.endsWith(":") || str.endsWith(".")) {
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr()));
}
}
private static void createNegatedTerms(TransformList<QueryToken>.Entity first, TransformList<QueryToken>.Entity second) {
var t = first.value();
var tn = second.value();
if (!(t instanceof QueryToken.Minus))
return;
if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm))
return;
first.remove();
second.replace(new QueryToken.ExcludeTerm(tn.str(), "-" + tn.displayStr()));
}
private static void createPriorityTerms(TransformList<QueryToken>.Entity first, TransformList<QueryToken>.Entity second) {
var t = first.value();
var tn = second.value();
if (!(t instanceof QueryToken.QMark))
return;
if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm))
return;
var replacement = new QueryToken.PriorityTerm(tn.str(), "?" + tn.displayStr());
first.remove();
second.replace(replacement);
}
private static void handleSpecialOperations(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (!(t instanceof QueryToken.LiteralTerm)) {
return;
}
String str = t.str();
if (str.startsWith("q") && str.matches("q[=><]\\d+")) {
entity.replace(new QueryToken.QualityTerm(str.substring(1)));
} else if (str.startsWith("near:")) {
entity.replace(new QueryToken.NearTerm(str.substring(5)));
} else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) {
entity.replace(new QueryToken.YearTerm(str.substring(4)));
} else if (str.startsWith("size") && str.matches("size[=><]\\d+")) {
entity.replace(new QueryToken.SizeTerm(str.substring(4)));
} else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) {
entity.replace(new QueryToken.RankTerm(str.substring(4)));
} else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) {
entity.replace(new QueryToken.AdviceTerm(str, t.displayStr()));
}
}
private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (t instanceof QueryToken.LParen) {
entity.remove();
} else if (t instanceof QueryToken.RParen) {
entity.remove();
} else if (t instanceof QueryToken.LiteralTerm) {
entity.replace(new QueryToken.AdviceTerm(t.str(), "(" + t.displayStr() + ")"));
}
}
}
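
The parser above swaps TokenType comparisons for Java type patterns: instanceof now tests and binds in one step, as in trimLiterals. A self-contained sketch of the idiom against the QueryToken records introduced further down (the trim helper itself is hypothetical):

import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;

class TypePatternDemo {
    // Strip one trailing ':' or '.' from a literal term; other tokens pass through.
    static QueryToken trim(QueryToken t) {
        // The pattern variable lt is in scope only where the test has succeeded.
        if (t instanceof QueryToken.LiteralTerm lt
                && (lt.str().endsWith(":") || lt.str().endsWith("."))) {
            String s = lt.str();
            return new QueryToken.LiteralTerm(s.substring(0, s.length() - 1), lt.displayStr());
        }
        return t;
    }
}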

View File

@@ -1,7 +1,6 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.language.encoding.AsciiFlattener;
import java.util.ArrayList;
@@ -11,8 +10,8 @@ import java.util.regex.Pattern;
public class QueryTokenizer {
private static final Pattern noisePattern = Pattern.compile("[,\\s]");
public List<Token> tokenizeQuery(String rawQuery) {
List<Token> tokens = new ArrayList<>();
public List<QueryToken> tokenizeQuery(String rawQuery) {
List<QueryToken> tokens = new ArrayList<>();
String query = AsciiFlattener.flattenUnicode(rawQuery);
query = noisePattern.matcher(query).replaceAll(" ");
@@ -21,26 +20,27 @@ public class QueryTokenizer {
int chr = query.charAt(i);
if ('(' == chr) {
tokens.add(new Token(TokenType.LPAREN, "(", "("));
tokens.add(new QueryToken.LParen());
}
else if (')' == chr) {
tokens.add(new Token(TokenType.RPAREN, ")", ")"));
tokens.add(new QueryToken.RParen());
}
else if ('"' == chr) {
int end = query.indexOf('"', i+1);
if (end == -1) {
end = query.length();
}
tokens.add(new Token(TokenType.QUOT,
query.substring(i+1, end).toLowerCase(),
query.substring(i, Math.min(query.length(), end+1))));
tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase()));
i = end;
}
else if ('-' == chr) {
tokens.add(new Token(TokenType.MINUS, "-"));
tokens.add(new QueryToken.Minus());
}
else if ('?' == chr) {
tokens.add(new Token(TokenType.QMARK, "?"));
tokens.add(new QueryToken.QMark());
}
else if (Character.isSpaceChar(chr)) {
// skip whitespace
@@ -52,9 +52,12 @@ public class QueryTokenizer {
if (query.charAt(end) == ' ' || query.charAt(end) == ')')
break;
}
tokens.add(new Token(TokenType.LITERAL_TERM,
query.substring(i, end).toLowerCase(),
query.substring(i, end)));
String displayStr = query.substring(i, end);
String str = displayStr.toLowerCase();
tokens.add(new QueryToken.LiteralTerm(str, displayStr));
i = end-1;
}
}
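
A rough usage sketch of the tokenizer; the token shapes in the comment are an assumption based on the record definitions in the next file:

package nu.marginalia.functions.searchquery.query_parser;

import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import java.util.List;

class TokenizerDemo {
    public static void main(String[] args) {
        List<QueryToken> tokens = new QueryTokenizer().tokenizeQuery("Hello -world \"Exact Phrase\"");
        // Expected roughly: LiteralTerm[str=hello, displayStr=Hello], Minus[],
        // LiteralTerm[str=world, displayStr=world], Quot[str=exact phrase]
        tokens.forEach(System.out::println);
    }
}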

View File

@@ -0,0 +1,86 @@
package nu.marginalia.functions.searchquery.query_parser.token;
public sealed interface QueryToken {
String str();
String displayStr();
record LiteralTerm(String str, String displayStr) implements QueryToken {}
record QuotTerm(String str, String displayStr) implements QueryToken {}
record ExcludeTerm(String str, String displayStr) implements QueryToken {}
record AdviceTerm(String str, String displayStr) implements QueryToken {}
record PriorityTerm(String str, String displayStr) implements QueryToken {}
record QualityTerm(String str) implements QueryToken {
public String displayStr() {
return "q" + str;
}
}
record YearTerm(String str) implements QueryToken {
public String displayStr() {
return "year" + str;
}
}
record SizeTerm(String str) implements QueryToken {
public String displayStr() {
return "size" + str;
}
}
record RankTerm(String str) implements QueryToken {
public String displayStr() {
return "rank" + str;
}
}
record NearTerm(String str) implements QueryToken {
public String displayStr() {
return "near:" + str;
}
}
record QsTerm(String str) implements QueryToken {
public String displayStr() {
return "qs" + str;
}
}
record Quot(String str) implements QueryToken {
public String displayStr() {
return "\"" + str + "\"";
}
}
record Minus() implements QueryToken {
public String str() {
return "-";
}
public String displayStr() {
return "-";
}
}
record QMark() implements QueryToken {
public String str() {
return "?";
}
public String displayStr() {
return "?";
}
}
record LParen() implements QueryToken {
public String str() {
return "(";
}
public String displayStr() {
return "(";
}
}
record RParen() implements QueryToken {
public String str() {
return ")";
}
public String displayStr() {
return ")";
}
}
record Ignore(String str, String displayStr) implements QueryToken {}
}
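
Because QueryToken is sealed, a switch over it can be exhaustiveness-checked by the compiler, and record deconstruction patterns (Java 21) bind the record components right in the case label; this is what QueryFactory below relies on. A minimal hypothetical helper:

import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;

class SwitchDemo {
    static String describe(QueryToken t) {
        return switch (t) {
            case QueryToken.LiteralTerm(String str, String displayStr) -> "literal: " + str;
            case QueryToken.ExcludeTerm(String str, String displayStr) -> "exclude: " + str;
            case QueryToken.Quot q -> "phrase: " + q.str();
            // Dropping this default would make the compiler demand a case
            // for every remaining permitted subtype.
            default -> t.getClass().getSimpleName();
        };
    }
}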

View File

@@ -1,49 +0,0 @@
package nu.marginalia.functions.searchquery.query_parser.token;
import lombok.EqualsAndHashCode;
import lombok.ToString;
import lombok.With;
@ToString
@EqualsAndHashCode
@With
public class Token {
public TokenType type;
public String str;
public final String displayStr;
public Token(TokenType type, String str, String displayStr) {
this.type = type;
this.str = str;
this.displayStr = safeString(displayStr);
}
public Token(TokenType type, String str) {
this.type = type;
this.str = str;
this.displayStr = safeString(str);
}
private static String safeString(String s) {
return s.replaceAll("<", "&lt;")
.replaceAll(">", "&gt;");
}
public void visit(TokenVisitor visitor) {
switch (type) {
case QUOT_TERM: visitor.onQuotTerm(this); break;
case EXCLUDE_TERM: visitor.onExcludeTerm(this); break;
case PRIORTY_TERM: visitor.onPriorityTerm(this); break;
case ADVICE_TERM: visitor.onAdviceTerm(this); break;
case LITERAL_TERM: visitor.onLiteralTerm(this); break;
case YEAR_TERM: visitor.onYearTerm(this); break;
case RANK_TERM: visitor.onRankTerm(this); break;
case SIZE_TERM: visitor.onSizeTerm(this); break;
case QS_TERM: visitor.onQsTerm(this); break;
case QUALITY_TERM: visitor.onQualityTerm(this); break;
}
}
}

View File

@@ -1,34 +0,0 @@
package nu.marginalia.functions.searchquery.query_parser.token;
import java.util.function.Predicate;
public enum TokenType implements Predicate<Token> {
TERM,
LITERAL_TERM,
QUOT_TERM,
EXCLUDE_TERM,
ADVICE_TERM,
PRIORTY_TERM,
QUALITY_TERM,
YEAR_TERM,
SIZE_TERM,
RANK_TERM,
NEAR_TERM,
QS_TERM,
QUOT,
MINUS,
QMARK,
LPAREN,
RPAREN,
IGNORE;
public boolean test(Token t) {
return t.type == this;
}
}

View File

@@ -1,14 +0,0 @@
package nu.marginalia.functions.searchquery.query_parser.token;
public interface TokenVisitor {
void onLiteralTerm(Token token);
void onQuotTerm(Token token);
void onExcludeTerm(Token token);
void onPriorityTerm(Token token);
void onAdviceTerm(Token token);
void onYearTerm(Token token);
void onSizeTerm(Token token);
void onRankTerm(Token token);
void onQualityTerm(Token token);
void onQsTerm(Token token);
}
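
Taken together, the three deletions (Token, TokenType, TokenVisitor) retire the enum-plus-visitor dispatch machinery: with QueryToken as a sealed record hierarchy, the same per-type dispatch is written directly as instanceof patterns in QueryParser and as a pattern-matching switch in QueryFactory, with exhaustiveness checked by the compiler rather than by hand-maintained visitor methods.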

View File

@@ -6,18 +6,19 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenType;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
@Singleton
@@ -46,31 +47,89 @@ public class QueryFactory {
List<String> searchTermsHuman = new ArrayList<>();
List<String> problems = new ArrayList<>();
List<Token> basicQuery = queryParser.parse(query);
List<QueryToken> basicQuery = queryParser.parse(query);
if (basicQuery.size() >= 12) {
problems.add("Your search query is too long");
basicQuery.clear();
}
List<String> searchTermsExclude = new ArrayList<>();
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
List<String> searchTermsPriority = new ArrayList<>();
List<List<String>> searchTermCoherences = new ArrayList<>();
QueryLimitsAccumulator qualityLimits = new QueryLimitsAccumulator(params);
SpecificationLimit qualityLimit = SpecificationLimit.none();
SpecificationLimit year = SpecificationLimit.none();
SpecificationLimit size = SpecificationLimit.none();
SpecificationLimit rank = SpecificationLimit.none();
QueryStrategy queryStrategy = QueryStrategy.AUTO;
for (Token t : basicQuery) {
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
if (t.str.startsWith("site:")) {
t.str = normalizeDomainName(t.str);
String domain = null;
for (QueryToken t : basicQuery) {
switch (t) {
case QueryToken.QuotTerm(String str, String displayStr) -> {
analyzeSearchTerm(problems, str, displayStr);
searchTermsHuman.addAll(Arrays.asList(displayStr.replace("\"", "").split("\\s+")));
String[] parts = StringUtils.split(str, '_');
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
// required in the query (which is a problem because they are not indexed). How to do this
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
// improved.
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(str);
// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));
// Require that each term exists in the document
// (needed for ranking)
searchTermsInclude.addAll(Arrays.asList(parts));
}
else {
searchTermsInclude.add(str);
}
}
case QueryToken.LiteralTerm(String str, String displayStr) -> {
analyzeSearchTerm(problems, str, displayStr);
searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+")));
searchTermsInclude.add(str);
}
searchTermsHuman.addAll(toHumanSearchTerms(t));
analyzeSearchTerm(problems, t);
case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str);
case QueryToken.AdviceTerm(String str, String displayStr) -> {
searchTermsAdvice.add(str);
if (str.toLowerCase().startsWith("site:")) {
domain = str.substring("site:".length());
}
}
t.visit(qualityLimits);
case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str);
case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str);
case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str);
case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str);
case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str);
default -> {}
}
}
QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery);
String domain = termsAccumulator.domain;
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
List<Integer> domainIds = params.domainIds();
@@ -80,29 +139,29 @@ public class QueryFactory {
limits = limits.forSingleDomain();
}
var specsBuilder = SearchSpecification.builder()
.query(
new SearchQuery(
var searchQuery = new SearchQuery(
queryExpansion.expandQuery(
termsAccumulator.searchTermsInclude
searchTermsInclude
),
termsAccumulator.searchTermsInclude,
termsAccumulator.searchTermsExclude,
termsAccumulator.searchTermsAdvice,
termsAccumulator.searchTermsPriority,
termsAccumulator.searchTermCoherences
)
)
searchTermsInclude,
searchTermsExclude,
searchTermsAdvice,
searchTermsPriority,
searchTermCoherences
);
var specsBuilder = SearchSpecification.builder()
.query(searchQuery)
.humanQuery(query)
.quality(qualityLimits.qualityLimit)
.year(qualityLimits.year)
.size(qualityLimits.size)
.rank(qualityLimits.rank)
.quality(qualityLimit)
.year(year)
.size(size)
.rank(rank)
.domains(domainIds)
.queryLimits(limits)
.searchSetIdentifier(params.identifier())
.rankingParams(ResultRankingParameters.sensibleDefaults())
.queryStrategy(qualityLimits.queryStrategy);
.queryStrategy(queryStrategy);
SearchSpecification specs = specsBuilder.build();
@@ -113,30 +172,52 @@ public class QueryFactory {
return new ProcessedQuery(specs, searchTermsHuman, domain);
}
private String normalizeDomainName(String str) {
return str.toLowerCase();
}
private List<String> toHumanSearchTerms(Token t) {
if (t.type == TokenType.LITERAL_TERM) {
return Arrays.asList(t.displayStr.split("\\s+"));
}
else if (t.type == TokenType.QUOT_TERM) {
return Arrays.asList(t.displayStr.replace("\"", "").split("\\s+"));
}
return Collections.emptyList();
}
private void analyzeSearchTerm(List<String> problems, Token term) {
final String word = term.str;
private void analyzeSearchTerm(List<String> problems, String str, String displayStr) {
final String word = str;
if (word.length() < WordPatterns.MIN_WORD_LENGTH) {
problems.add("Search term \"" + term.displayStr + "\" too short");
problems.add("Search term \"" + displayStr + "\" too short");
}
if (!word.contains("_") && word.length() >= WordPatterns.MAX_WORD_LENGTH) {
problems.add("Search term \"" + term.displayStr + "\" too long");
problems.add("Search term \"" + displayStr + "\" too long");
}
}
private SpecificationLimit parseSpecificationLimit(String str) {
int startChar = str.charAt(0);
int val = Integer.parseInt(str.substring(1));
if (startChar == '=') {
return SpecificationLimit.equals(val);
} else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
} else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
} else {
return SpecificationLimit.none();
}
}
private QueryStrategy parseQueryStrategy(String str) {
return switch (str.toUpperCase()) {
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK;
case "SENTENCE" -> QueryStrategy.SENTENCE;
case "TOPIC" -> QueryStrategy.TOPIC;
default -> QueryStrategy.AUTO;
};
}
private boolean anyPartIsStopWord(String[] parts) {
for (String part : parts) {
if (WordPatterns.isStopWord(part)) {
return true;
}
}
return false;
}
}
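
To make the filter syntax concrete: a query term like year>2010 becomes YearTerm(">2010") in handleSpecialOperations, and parseSpecificationLimit above maps the leading comparator to SpecificationLimit.greaterThan(2010) (likewise "=" to equals and "<" to lessThan); qs=TOPIC travels the same route through parseQueryStrategy to QueryStrategy.TOPIC.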

View File

@@ -1,93 +0,0 @@
package nu.marginalia.functions.searchquery.svc;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
public class QueryLimitsAccumulator implements TokenVisitor {
public SpecificationLimit qualityLimit;
public SpecificationLimit year;
public SpecificationLimit size;
public SpecificationLimit rank;
public QueryStrategy queryStrategy = QueryStrategy.AUTO;
public QueryLimitsAccumulator(QueryParams params) {
qualityLimit = params.quality();
year = params.year();
size = params.size();
rank = params.rank();
}
private SpecificationLimit parseSpecificationLimit(String str) {
int startChar = str.charAt(0);
int val = Integer.parseInt(str.substring(1));
if (startChar == '=') {
return SpecificationLimit.equals(val);
} else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
} else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
} else {
return SpecificationLimit.none();
}
}
private QueryStrategy parseQueryStrategy(String str) {
return switch (str.toUpperCase()) {
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL;
case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN;
case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK;
case "SENTENCE" -> QueryStrategy.SENTENCE;
case "TOPIC" -> QueryStrategy.TOPIC;
default -> QueryStrategy.AUTO;
};
}
@Override
public void onYearTerm(Token token) {
year = parseSpecificationLimit(token.str);
}
@Override
public void onSizeTerm(Token token) {
size = parseSpecificationLimit(token.str);
}
@Override
public void onRankTerm(Token token) {
rank = parseSpecificationLimit(token.str);
}
@Override
public void onQualityTerm(Token token) {
qualityLimit = parseSpecificationLimit(token.str);
}
@Override
public void onQsTerm(Token token) {
queryStrategy = parseQueryStrategy(token.str);
}
@Override
public void onLiteralTerm(Token token) {}
@Override
public void onQuotTerm(Token token) {}
@Override
public void onExcludeTerm(Token token) {}
@Override
public void onPriorityTerm(Token token) {}
@Override
public void onAdviceTerm(Token token) {}
}

View File

@@ -1,105 +0,0 @@
package nu.marginalia.functions.searchquery.svc;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.functions.searchquery.query_parser.token.Token;
import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/** @see SearchQuery */
public class QuerySearchTermsAccumulator implements TokenVisitor {
public List<String> searchTermsExclude = new ArrayList<>();
public List<String> searchTermsInclude = new ArrayList<>();
public List<String> searchTermsAdvice = new ArrayList<>();
public List<String> searchTermsPriority = new ArrayList<>();
public List<List<String>> searchTermCoherences = new ArrayList<>();
public String domain;
public QuerySearchTermsAccumulator(List<Token> parts) {
for (Token t : parts) {
t.visit(this);
}
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
}
@Override
public void onLiteralTerm(Token token) {
searchTermsInclude.add(token.str);
}
@Override
public void onQuotTerm(Token token) {
String[] parts = token.str.split("_");
// HACK (2023-05-02 vlofgren)
//
// Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being
// required in the query (which is a problem because they are not indexed). How to do this
// in a clean way is a bit of an open problem that may not get resolved until query-parsing is
// improved.
if (parts.length > 1 && !anyPartIsStopWord(parts)) {
// Prefer that the actual n-gram is present
searchTermsAdvice.add(token.str);
// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));
// Require that each term exists in the document
// (needed for ranking)
searchTermsInclude.addAll(Arrays.asList(parts));
}
else {
searchTermsInclude.add(token.str);
}
}
private boolean anyPartIsStopWord(String[] parts) {
for (String part : parts) {
if (WordPatterns.isStopWord(part)) {
return true;
}
}
return false;
}
@Override
public void onExcludeTerm(Token token) {
searchTermsExclude.add(token.str);
}
@Override
public void onPriorityTerm(Token token) {
searchTermsPriority.add(token.str);
}
@Override
public void onAdviceTerm(Token token) {
searchTermsAdvice.add(token.str);
if (token.str.toLowerCase().startsWith("site:")) {
domain = token.str.substring("site:".length());
}
}
@Override
public void onYearTerm(Token token) {}
@Override
public void onSizeTerm(Token token) {}
@Override
public void onRankTerm(Token token) {}
@Override
public void onQualityTerm(Token token) {}
@Override
public void onQsTerm(Token token) {}
}

View File

@@ -80,6 +80,15 @@ public class TransformList<T> {
iter.remove();
}
}
else if (firstEntity.action == Action.NO_OP) {
if (secondEntry.action == Action.REPLACE) {
backingList.set(iter.nextIndex(), secondEntry.value);
}
else if (secondEntry.action == Action.REMOVE) {
iter.next();
iter.remove();
}
}
}
}
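
The added NO_OP branch covers the pairwise case where a transform leaves the first entity untouched but replaces or removes the second. A rough usage sketch of the pairwise transform, assuming transformEachPair takes a two-entity consumer as the QueryParser call sites suggest (string tokens are used for brevity):

import nu.marginalia.util.transform_list.TransformList;
import java.util.ArrayList;
import java.util.List;

class TransformListDemo {
    public static void main(String[] args) {
        List<String> tokens = new ArrayList<>(List.of("-", "cats", "dogs"));
        TransformList<String> list = new TransformList<>(tokens);
        // Collapse ("-", term) pairs into a single negated term,
        // mirroring createNegatedTerms in QueryParser.
        list.transformEachPair((first, second) -> {
            if ("-".equals(first.value())) {
                first.remove();
                second.replace("NOT:" + second.value());
            }
        });
        System.out.println(list.getBackingList()); // roughly: [NOT:cats, dogs]
    }
}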