(query) Tidy up code

This commit is contained in:
Viktor Lofgren 2024-06-26 13:40:06 +02:00
parent 02df421c94
commit 6973712480
6 changed files with 89 additions and 92 deletions

View File

@ -36,8 +36,8 @@ public class SearchQuery {
@Deprecated // why does this exist? @Deprecated // why does this exist?
private double value = 0; private double value = 0;
public static SearchQueryBuilder builder(String compiledQuery) { public static SearchQueryBuilder builder() {
return new SearchQueryBuilder(compiledQuery); return new SearchQueryBuilder();
} }
public SearchQuery() { public SearchQuery() {
@ -86,15 +86,19 @@ public class SearchQuery {
} }
public static class SearchQueryBuilder { public static class SearchQueryBuilder {
private final String compiledQuery; private String compiledQuery;
private List<String> searchTermsInclude = new ArrayList<>(); public final List<String> searchTermsInclude = new ArrayList<>();
private List<String> searchTermsExclude = new ArrayList<>(); public final List<String> searchTermsExclude = new ArrayList<>();
private List<String> searchTermsAdvice = new ArrayList<>(); public final List<String> searchTermsAdvice = new ArrayList<>();
private List<String> searchTermsPriority = new ArrayList<>(); public final List<String> searchTermsPriority = new ArrayList<>();
private List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>(); public final List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
private SearchQueryBuilder(String compiledQuery) { private SearchQueryBuilder() {
this.compiledQuery = compiledQuery; }
public SearchQueryBuilder compiledQuery(String query) {
this.compiledQuery = query;
return this;
} }
public SearchQueryBuilder include(String... terms) { public SearchQueryBuilder include(String... terms) {
@ -117,7 +121,7 @@ public class SearchQuery {
return this; return this;
} }
public SearchQueryBuilder coherences(SearchCoherenceConstraint constraint) { public SearchQueryBuilder coherenceConstraint(SearchCoherenceConstraint constraint) {
searchTermCoherences.add(constraint); searchTermCoherences.add(constraint);
return this; return this;
} }
@ -125,5 +129,13 @@ public class SearchQuery {
public SearchQuery build() { public SearchQuery build() {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
} }
/**
 * If there are no ranking terms, promote the advice terms to ranking terms.
 *
 * <p>Acts only when {@code searchTermsInclude} is empty and
 * {@code searchTermsAdvice} is not: every advice term is appended to the
 * include list and the advice list is then cleared. In all other cases
 * both lists are left untouched.
 */
public void promoteNonRankingTerms() {
    final boolean hasIncludeTerms = !searchTermsInclude.isEmpty();

    // Nothing to promote, or nothing that needs promoting
    if (hasIncludeTerms || searchTermsAdvice.isEmpty()) {
        return;
    }

    searchTermsInclude.addAll(searchTermsAdvice);
    searchTermsAdvice.clear();
}
} }
} }

View File

@ -1,6 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser; package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.WordPatterns; import nu.marginalia.language.WordPatterns;
import nu.marginalia.util.transform_list.TransformList; import nu.marginalia.util.transform_list.TransformList;
@ -104,15 +105,19 @@ public class QueryParser {
String str = t.str(); String str = t.str();
if (str.startsWith("q") && str.matches("q[=><]\\d+")) { if (str.startsWith("q") && str.matches("q[=><]\\d+")) {
entity.replace(new QueryToken.QualityTerm(str.substring(1))); var limit = parseSpecificationLimit(str.substring(1));
entity.replace(new QueryToken.QualityTerm(limit, str));
} else if (str.startsWith("near:")) { } else if (str.startsWith("near:")) {
entity.replace(new QueryToken.NearTerm(str.substring(5))); entity.replace(new QueryToken.NearTerm(str.substring(5)));
} else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) { } else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) {
entity.replace(new QueryToken.YearTerm(str.substring(4))); var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.YearTerm(limit, str));
} else if (str.startsWith("size") && str.matches("size[=><]\\d+")) { } else if (str.startsWith("size") && str.matches("size[=><]\\d+")) {
entity.replace(new QueryToken.SizeTerm(str.substring(4))); var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.SizeTerm(limit, str));
} else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) { } else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) {
entity.replace(new QueryToken.RankTerm(str.substring(4))); var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.RankTerm(limit, str));
} else if (str.startsWith("qs=")) { } else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3))); entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) { } else if (str.contains(":")) {
@ -120,6 +125,21 @@ public class QueryParser {
} }
} }
/**
 * Parses the operator-and-number tail of a limit term, e.g. the {@code ">2005"}
 * part of {@code year>2005}, into a {@link SpecificationLimit}.
 *
 * <p>The first character selects the comparison ({@code '='}, {@code '<'} or
 * {@code '>'}); the remainder is read as a decimal integer.
 *
 * @param str operator character followed by the digits of the limit value
 * @return the corresponding limit, or {@link SpecificationLimit#none()} when
 *         the string is empty, the operator is not recognized, or the number
 *         cannot be represented as an {@code int}
 */
private static SpecificationLimit parseSpecificationLimit(String str) {
    if (str.isEmpty()) {
        return SpecificationLimit.none();
    }

    final int val;
    try {
        val = Integer.parseInt(str.substring(1));
    } catch (NumberFormatException ex) {
        // The term patterns accept digit runs of any length (\d+), so user input
        // such as "size>99999999999" reaches this point and overflows an int.
        // Treat it as "no limit" rather than failing the whole query.
        return SpecificationLimit.none();
    }

    return switch (str.charAt(0)) {
        case '=' -> SpecificationLimit.equals(val);
        case '<' -> SpecificationLimit.lessThan(val);
        case '>' -> SpecificationLimit.greaterThan(val);
        default -> SpecificationLimit.none();
    };
}
private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) { private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) {
var t = entity.value(); var t = entity.value();
if (t instanceof QueryToken.LParen) { if (t instanceof QueryToken.LParen) {

View File

@ -1,6 +1,8 @@
package nu.marginalia.functions.searchquery.query_parser.token; package nu.marginalia.functions.searchquery.query_parser.token;
import nu.marginalia.index.query.limit.SpecificationLimit;
public sealed interface QueryToken { public sealed interface QueryToken {
String str(); String str();
String displayStr(); String displayStr();
@ -11,25 +13,18 @@ public sealed interface QueryToken {
record AdviceTerm(String str, String displayStr) implements QueryToken {} record AdviceTerm(String str, String displayStr) implements QueryToken {}
record PriorityTerm(String str, String displayStr) implements QueryToken {} record PriorityTerm(String str, String displayStr) implements QueryToken {}
record QualityTerm(String str) implements QueryToken { record QualityTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String displayStr() { public String str() { return displayStr; }
return "q" + str;
} }
record YearTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
} }
record YearTerm(String str) implements QueryToken { record SizeTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String displayStr() { public String str() { return displayStr; }
return "year" + str;
}
}
record SizeTerm(String str) implements QueryToken {
public String displayStr() {
return "size" + str;
}
}
record RankTerm(String str) implements QueryToken {
public String displayStr() {
return "rank" + str;
} }
record RankTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
} }
record NearTerm(String str) implements QueryToken { record NearTerm(String str) implements QueryToken {
public String displayStr() { public String displayStr() {

View File

@ -53,11 +53,7 @@ public class QueryFactory {
basicQuery.clear(); basicQuery.clear();
} }
List<String> searchTermsExclude = new ArrayList<>(); SearchQuery.SearchQueryBuilder queryBuilder = SearchQuery.builder();
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
List<String> searchTermsPriority = new ArrayList<>();
List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
SpecificationLimit qualityLimit = SpecificationLimit.none(); SpecificationLimit qualityLimit = SpecificationLimit.none();
SpecificationLimit year = SpecificationLimit.none(); SpecificationLimit year = SpecificationLimit.none();
@ -77,51 +73,48 @@ public class QueryFactory {
if (parts.length > 1) { if (parts.length > 1) {
// Require that the terms appear in sequence // Require that the terms appear in sequence
searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts)); queryBuilder.coherenceConstraint(SearchCoherenceConstraint.mandatory(parts));
// Construct a regular query from the parts in the quoted string // Construct a regular query from the parts in the quoted string
searchTermsInclude.addAll(Arrays.asList(parts)); queryBuilder.include(parts);
// Prefer that the actual n-gram is present // Prefer that the actual n-gram is present
searchTermsPriority.add(str); queryBuilder.priority(str);
} }
else { else {
// If the quoted word is a single word, we don't need to do more than include it in the search // If the quoted word is a single word, we don't need to do more than include it in the search
searchTermsInclude.add(str); queryBuilder.include(str);
} }
} }
case QueryToken.LiteralTerm(String str, String displayStr) -> { case QueryToken.LiteralTerm(String str, String displayStr) -> {
analyzeSearchTerm(problems, str, displayStr); analyzeSearchTerm(problems, str, displayStr);
searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+"))); searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+")));
searchTermsInclude.add(str); queryBuilder.include(str);
} }
case QueryToken.ExcludeTerm(String str, String displayStr) -> queryBuilder.exclude(str);
case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str); case QueryToken.PriorityTerm(String str, String displayStr) -> queryBuilder.priority(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str);
case QueryToken.AdviceTerm(String str, String displayStr) -> { case QueryToken.AdviceTerm(String str, String displayStr) -> {
searchTermsAdvice.add(str); queryBuilder.advice(str);
if (str.toLowerCase().startsWith("site:")) { if (str.toLowerCase().startsWith("site:")) {
domain = str.substring("site:".length()); domain = str.substring("site:".length());
} }
} }
case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str); case QueryToken.YearTerm(SpecificationLimit limit, String displayStr) -> year = limit;
case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str); case QueryToken.SizeTerm(SpecificationLimit limit, String displayStr) -> size = limit;
case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str); case QueryToken.RankTerm(SpecificationLimit limit, String displayStr) -> rank = limit;
case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str); case QueryToken.QualityTerm(SpecificationLimit limit, String displayStr) -> qualityLimit = limit;
case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str); case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str);
default -> {} default -> {}
} }
} }
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { queryBuilder.promoteNonRankingTerms();
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
List<Integer> domainIds = params.domainIds(); List<Integer> domainIds = params.domainIds();
@ -131,25 +124,18 @@ public class QueryFactory {
limits = limits.forSingleDomain(); limits = limits.forSingleDomain();
} }
var expansion = queryExpansion.expandQuery(searchTermsInclude); var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
// Query expansion may produce suggestions for coherence constraints, // Query expansion may produce suggestions for coherence constraints,
// add these to the query // add these to the query
for (var coh : expansion.extraCoherences()) { for (var coh : expansion.extraCoherences()) {
searchTermCoherences.add(SearchCoherenceConstraint.optional(coh)); queryBuilder.coherenceConstraint(SearchCoherenceConstraint.optional(coh));
} }
var searchQuery = new SearchQuery( queryBuilder.compiledQuery(expansion.compiledQuery());
expansion.compiledQuery(),
searchTermsInclude,
searchTermsExclude,
searchTermsAdvice,
searchTermsPriority,
searchTermCoherences
);
var specsBuilder = SearchSpecification.builder() var specsBuilder = SearchSpecification.builder()
.query(searchQuery) .query(queryBuilder.build())
.humanQuery(query) .humanQuery(query)
.quality(qualityLimit) .quality(qualityLimit)
.year(year) .year(year)
@ -180,20 +166,7 @@ public class QueryFactory {
problems.add("Search term \"" + displayStr + "\" too long"); problems.add("Search term \"" + displayStr + "\" too long");
} }
} }
/**
 * Converts a string consisting of a comparison operator followed by a number
 * (for example {@code "=5"}, {@code "<100"} or {@code ">2005"}) into a
 * {@link SpecificationLimit}.
 *
 * @param str operator character ({@code '='}, {@code '<'} or {@code '>'})
 *            followed by the digits of the limit value
 * @return the corresponding limit, or {@link SpecificationLimit#none()} when
 *         the string is empty, the operator is not recognized, or the number
 *         cannot be represented as an {@code int}
 */
private SpecificationLimit parseSpecificationLimit(String str) {
    if (str.isEmpty()) {
        return SpecificationLimit.none();
    }

    final int val;
    try {
        val = Integer.parseInt(str.substring(1));
    } catch (NumberFormatException ex) {
        // A digit run too long for an int makes parseInt throw; fall back to
        // "no limit" instead of letting the exception abort query construction.
        return SpecificationLimit.none();
    }

    return switch (str.charAt(0)) {
        case '=' -> SpecificationLimit.equals(val);
        case '<' -> SpecificationLimit.lessThan(val);
        case '>' -> SpecificationLimit.greaterThan(val);
        default -> SpecificationLimit.none();
    };
}
private QueryStrategy parseQueryStrategy(String str) { private QueryStrategy parseQueryStrategy(String str) {
return switch (str.toUpperCase()) { return switch (str.toUpperCase()) {
@ -208,14 +181,4 @@ public class QueryFactory {
default -> QueryStrategy.AUTO; default -> QueryStrategy.AUTO;
}; };
} }
/**
 * Reports whether at least one of the given parts is a stop word according to
 * {@link WordPatterns#isStopWord}.
 *
 * @param parts the words to examine
 * @return {@code true} if any part is a stop word; {@code false} if none is,
 *         including when the array is empty
 */
private boolean anyPartIsStopWord(String[] parts) {
    // anyMatch short-circuits on the first hit, like the early return it replaces
    return Arrays.stream(parts).anyMatch(part -> WordPatterns.isStopWord(part));
}
} }

View File

@ -1,6 +1,7 @@
package nu.marginalia.query.svc; package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
@ -57,7 +58,12 @@ public class QueryFactoryTest {
@Test @Test
void qsec10() { void qsec10() {
try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) { Path webis = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt");
if (!Files.exists(webis))
return;
try (var lines = Files.lines(webis)) {
lines.limit(1000).forEach(line -> { lines.limit(1000).forEach(line -> {
String[] parts = line.split("\t"); String[] parts = line.split("\t");
if (parts.length == 2) { if (parts.length == 2) {
@ -129,15 +135,15 @@ public class QueryFactoryTest {
{ {
// the is a stopword, so it should generate an ngram search term // the is a stopword, so it should generate an ngram search term
var specs = parseAndGetSpecs("\"the shining\""); var specs = parseAndGetSpecs("\"the shining\"");
assertEquals("the_shining", specs.query.compiledQuery); assertEquals("( shining | the_shining )", specs.query.compiledQuery);
} }
{ {
// tde isn't a stopword, so we should get the normal behavior // tde isn't a stopword, so we should get the normal behavior
var specs = parseAndGetSpecs("\"tde shining\""); var specs = parseAndGetSpecs("\"tde shining\"");
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery); assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice); assertEquals(List.of("tde_shining"), specs.query.searchTermsPriority);
assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences); assertEquals(List.of(new SearchCoherenceConstraint(true, List.of("tde", "shining"))), specs.query.searchTermCoherences);
} }
} }

View File

@ -169,7 +169,8 @@ public class IndexQueryServiceIntegrationSmokeTest {
.domains(new ArrayList<>()) .domains(new ArrayList<>())
.searchSetIdentifier("NONE") .searchSetIdentifier("NONE")
.query( .query(
SearchQuery.builder("2") SearchQuery.builder()
.compiledQuery("2")
.include("2") .include("2")
.build() .build()
).build() ).build()