(query) Tidy up code

This commit is contained in:
Viktor Lofgren 2024-06-26 13:40:06 +02:00
parent 02df421c94
commit 6973712480
6 changed files with 89 additions and 92 deletions

View File

@ -36,8 +36,8 @@ public class SearchQuery {
@Deprecated // why does this exist?
private double value = 0;
public static SearchQueryBuilder builder(String compiledQuery) {
return new SearchQueryBuilder(compiledQuery);
public static SearchQueryBuilder builder() {
return new SearchQueryBuilder();
}
public SearchQuery() {
@ -86,15 +86,19 @@ public class SearchQuery {
}
public static class SearchQueryBuilder {
private final String compiledQuery;
private List<String> searchTermsInclude = new ArrayList<>();
private List<String> searchTermsExclude = new ArrayList<>();
private List<String> searchTermsAdvice = new ArrayList<>();
private List<String> searchTermsPriority = new ArrayList<>();
private List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
private String compiledQuery;
public final List<String> searchTermsInclude = new ArrayList<>();
public final List<String> searchTermsExclude = new ArrayList<>();
public final List<String> searchTermsAdvice = new ArrayList<>();
public final List<String> searchTermsPriority = new ArrayList<>();
public final List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
private SearchQueryBuilder(String compiledQuery) {
this.compiledQuery = compiledQuery;
private SearchQueryBuilder() {
}
/**
 * Sets the compiled query string that {@link #build()} will pass on to the {@link SearchQuery}.
 *
 * @param query the compiled query expression; stored as given, replacing any previous value
 * @return this builder, to allow call chaining
 */
public SearchQueryBuilder compiledQuery(String query) {
    compiledQuery = query;
    return this;
}
public SearchQueryBuilder include(String... terms) {
@ -117,7 +121,7 @@ public class SearchQuery {
return this;
}
public SearchQueryBuilder coherences(SearchCoherenceConstraint constraint) {
public SearchQueryBuilder coherenceConstraint(SearchCoherenceConstraint constraint) {
searchTermCoherences.add(constraint);
return this;
}
@ -125,5 +129,13 @@ public class SearchQuery {
/**
 * Assembles a {@link SearchQuery} from the compiled query string and the term lists
 * collected by this builder.
 *
 * <p>The builder's own list instances are handed to the constructor as-is; no copies
 * are made here.
 *
 * @return a new search query reflecting the builder's current state
 */
public SearchQuery build() {
    return new SearchQuery(
            compiledQuery,
            searchTermsInclude,
            searchTermsExclude,
            searchTermsAdvice,
            searchTermsPriority,
            searchTermCoherences
    );
}
/**
 * If there are no ranking (include) terms, promote the advice terms to ranking terms.
 *
 * <p>When the include list is empty and the advice list is not, every advice term is
 * moved into the include list and the advice list is emptied. In all other cases the
 * builder is left untouched.
 */
public void promoteNonRankingTerms() {
    boolean hasRankingTerms = !searchTermsInclude.isEmpty();
    if (hasRankingTerms || searchTermsAdvice.isEmpty()) {
        // Either there is already something to rank on, or nothing to promote
        return;
    }

    searchTermsInclude.addAll(searchTermsAdvice);
    searchTermsAdvice.clear();
}
}
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.functions.searchquery.query_parser;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.util.transform_list.TransformList;
@ -104,15 +105,19 @@ public class QueryParser {
String str = t.str();
if (str.startsWith("q") && str.matches("q[=><]\\d+")) {
entity.replace(new QueryToken.QualityTerm(str.substring(1)));
var limit = parseSpecificationLimit(str.substring(1));
entity.replace(new QueryToken.QualityTerm(limit, str));
} else if (str.startsWith("near:")) {
entity.replace(new QueryToken.NearTerm(str.substring(5)));
} else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) {
entity.replace(new QueryToken.YearTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.YearTerm(limit, str));
} else if (str.startsWith("size") && str.matches("size[=><]\\d+")) {
entity.replace(new QueryToken.SizeTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.SizeTerm(limit, str));
} else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) {
entity.replace(new QueryToken.RankTerm(str.substring(4)));
var limit = parseSpecificationLimit(str.substring(4));
entity.replace(new QueryToken.RankTerm(limit, str));
} else if (str.startsWith("qs=")) {
entity.replace(new QueryToken.QsTerm(str.substring(3)));
} else if (str.contains(":")) {
@ -120,6 +125,21 @@ public class QueryParser {
}
}
/**
 * Parses the operator-and-number tail of a limit term (for example {@code ">2005"})
 * into a {@link SpecificationLimit}.
 *
 * <p>The first character selects the comparison ({@code =}, {@code <} or {@code >});
 * the remainder is read as a decimal {@code int}.
 *
 * @param str the comparison operator immediately followed by the numeric value
 * @return the matching limit, or {@link SpecificationLimit#none()} when the string is
 *         empty, the operator is not recognized, or the number does not fit in an
 *         {@code int}
 */
private static SpecificationLimit parseSpecificationLimit(String str) {
    if (str.isEmpty()) {
        return SpecificationLimit.none();
    }

    final int val;
    try {
        val = Integer.parseInt(str.substring(1));
    } catch (NumberFormatException ignored) {
        // The term patterns accept digit runs of any length (\d+), so user input such as
        // "size>99999999999" reaches this point; treat an unrepresentable value as
        // "no limit" rather than letting the exception abort query parsing.
        return SpecificationLimit.none();
    }

    return switch (str.charAt(0)) {
        case '=' -> SpecificationLimit.equals(val);
        case '<' -> SpecificationLimit.lessThan(val);
        case '>' -> SpecificationLimit.greaterThan(val);
        default -> SpecificationLimit.none();
    };
}
private static void handleAdvisoryTerms(TransformList<QueryToken>.Entity entity) {
var t = entity.value();
if (t instanceof QueryToken.LParen) {

View File

@ -1,6 +1,8 @@
package nu.marginalia.functions.searchquery.query_parser.token;
import nu.marginalia.index.query.limit.SpecificationLimit;
public sealed interface QueryToken {
String str();
String displayStr();
@ -11,25 +13,18 @@ public sealed interface QueryToken {
record AdviceTerm(String str, String displayStr) implements QueryToken {}
record PriorityTerm(String str, String displayStr) implements QueryToken {}
record QualityTerm(String str) implements QueryToken {
public String displayStr() {
return "q" + str;
}
record QualityTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record YearTerm(String str) implements QueryToken {
public String displayStr() {
return "year" + str;
}
record YearTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record SizeTerm(String str) implements QueryToken {
public String displayStr() {
return "size" + str;
}
record SizeTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record RankTerm(String str) implements QueryToken {
public String displayStr() {
return "rank" + str;
}
record RankTerm(SpecificationLimit limit, String displayStr) implements QueryToken {
public String str() { return displayStr; }
}
record NearTerm(String str) implements QueryToken {
public String displayStr() {

View File

@ -53,11 +53,7 @@ public class QueryFactory {
basicQuery.clear();
}
List<String> searchTermsExclude = new ArrayList<>();
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
List<String> searchTermsPriority = new ArrayList<>();
List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
SearchQuery.SearchQueryBuilder queryBuilder = SearchQuery.builder();
SpecificationLimit qualityLimit = SpecificationLimit.none();
SpecificationLimit year = SpecificationLimit.none();
@ -77,51 +73,48 @@ public class QueryFactory {
if (parts.length > 1) {
// Require that the terms appear in sequence
searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.mandatory(parts));
// Construct a regular query from the parts in the quoted string
searchTermsInclude.addAll(Arrays.asList(parts));
queryBuilder.include(parts);
// Prefer that the actual n-gram is present
searchTermsPriority.add(str);
queryBuilder.priority(str);
}
else {
// If the quoted word is a single word, we don't need to do more than include it in the search
searchTermsInclude.add(str);
queryBuilder.include(str);
}
}
case QueryToken.LiteralTerm(String str, String displayStr) -> {
analyzeSearchTerm(problems, str, displayStr);
searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+")));
searchTermsInclude.add(str);
queryBuilder.include(str);
}
case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str);
case QueryToken.ExcludeTerm(String str, String displayStr) -> queryBuilder.exclude(str);
case QueryToken.PriorityTerm(String str, String displayStr) -> queryBuilder.priority(str);
case QueryToken.AdviceTerm(String str, String displayStr) -> {
searchTermsAdvice.add(str);
queryBuilder.advice(str);
if (str.toLowerCase().startsWith("site:")) {
domain = str.substring("site:".length());
}
}
case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str);
case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str);
case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str);
case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str);
case QueryToken.YearTerm(SpecificationLimit limit, String displayStr) -> year = limit;
case QueryToken.SizeTerm(SpecificationLimit limit, String displayStr) -> size = limit;
case QueryToken.RankTerm(SpecificationLimit limit, String displayStr) -> rank = limit;
case QueryToken.QualityTerm(SpecificationLimit limit, String displayStr) -> qualityLimit = limit;
case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str);
default -> {}
}
}
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice);
searchTermsAdvice.clear();
}
queryBuilder.promoteNonRankingTerms();
List<Integer> domainIds = params.domainIds();
@ -131,25 +124,18 @@ public class QueryFactory {
limits = limits.forSingleDomain();
}
var expansion = queryExpansion.expandQuery(searchTermsInclude);
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
// Query expansion may produce suggestions for coherence constraints,
// add these to the query
for (var coh : expansion.extraCoherences()) {
searchTermCoherences.add(SearchCoherenceConstraint.optional(coh));
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.optional(coh));
}
var searchQuery = new SearchQuery(
expansion.compiledQuery(),
searchTermsInclude,
searchTermsExclude,
searchTermsAdvice,
searchTermsPriority,
searchTermCoherences
);
queryBuilder.compiledQuery(expansion.compiledQuery());
var specsBuilder = SearchSpecification.builder()
.query(searchQuery)
.query(queryBuilder.build())
.humanQuery(query)
.quality(qualityLimit)
.year(year)
@ -180,20 +166,7 @@ public class QueryFactory {
problems.add("Search term \"" + displayStr + "\" too long");
}
}
/**
 * Converts a string consisting of a comparison operator followed by a number
 * (for example {@code "<10"}) into a {@link SpecificationLimit}.
 *
 * @param str operator character ({@code =}, {@code <} or {@code >}) followed by a decimal integer
 * @return the corresponding limit; {@link SpecificationLimit#none()} if the string is empty,
 *         the operator is unknown, or the numeric part cannot be represented as an {@code int}
 */
private SpecificationLimit parseSpecificationLimit(String str) {
    if (str.isEmpty()) {
        return SpecificationLimit.none();
    }

    final int val;
    try {
        val = Integer.parseInt(str.substring(1));
    } catch (NumberFormatException ignored) {
        // A numeric part that is missing or too large for an int is treated as
        // "no limit" instead of propagating an unchecked exception to the caller.
        return SpecificationLimit.none();
    }

    return switch (str.charAt(0)) {
        case '=' -> SpecificationLimit.equals(val);
        case '<' -> SpecificationLimit.lessThan(val);
        case '>' -> SpecificationLimit.greaterThan(val);
        default -> SpecificationLimit.none();
    };
}
private QueryStrategy parseQueryStrategy(String str) {
return switch (str.toUpperCase()) {
@ -208,14 +181,4 @@ public class QueryFactory {
default -> QueryStrategy.AUTO;
};
}
/**
 * Reports whether at least one element of {@code parts} is classified as a stop word
 * by {@code WordPatterns.isStopWord}.
 *
 * @param parts the individual words to check
 * @return {@code true} as soon as one stop word is found, {@code false} if none is
 */
private boolean anyPartIsStopWord(String[] parts) {
    return Arrays.stream(parts).anyMatch(word -> WordPatterns.isStopWord(word));
}
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
@ -57,7 +58,12 @@ public class QueryFactoryTest {
@Test
void qsec10() {
try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) {
Path webis = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt");
if (!Files.exists(webis))
return;
try (var lines = Files.lines(webis)) {
lines.limit(1000).forEach(line -> {
String[] parts = line.split("\t");
if (parts.length == 2) {
@ -129,15 +135,15 @@ public class QueryFactoryTest {
{
// "the" is a stopword, so it should generate an ngram search term
var specs = parseAndGetSpecs("\"the shining\"");
assertEquals("the_shining", specs.query.compiledQuery);
assertEquals("( shining | the_shining )", specs.query.compiledQuery);
}
{
// "tde" isn't a stopword, so we should get the normal behavior
var specs = parseAndGetSpecs("\"tde shining\"");
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice);
assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences);
assertEquals(List.of("tde_shining"), specs.query.searchTermsPriority);
assertEquals(List.of(new SearchCoherenceConstraint(true, List.of("tde", "shining"))), specs.query.searchTermCoherences);
}
}

View File

@ -169,7 +169,8 @@ public class IndexQueryServiceIntegrationSmokeTest {
.domains(new ArrayList<>())
.searchSetIdentifier("NONE")
.query(
SearchQuery.builder("2")
SearchQuery.builder()
.compiledQuery("2")
.include("2")
.build()
).build()