Debug query strategy options

This commit is contained in:
Viktor Lofgren 2023-02-02 10:35:55 +01:00
parent b18cd0bc36
commit 4a07eda61c
10 changed files with 86 additions and 49 deletions

View File

@ -0,0 +1,4 @@
package nu.marginalia.wmsa.edge.index.model;
/**
 * Bundles the numeric limits that accompany a search request.
 *
 * @param resultsByDomain cap on results per domain; the visible callers hand it
 *                        to the URL deduplicator and to the query's per-domain limit
 * @param resultsTotal    cap on the total number of results returned
 * @param timeoutMs       time budget in milliseconds; the visible caller passes it
 *                        to {@code IndexSearchBudget}
 * @param fetchSize       fetch size for the query; the visible caller also uses it
 *                        to size the set of considered URL ids
 */
public record QueryLimits(
        int resultsByDomain,
        int resultsTotal,
        int timeoutMs,
        int fetchSize
) {
}

View File

@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model;
/**
 * Selects how the index looks up the search terms and, for the
 * {@code REQUIRE_FIELD_*} values, which word flag result valuation checks.
 */
public enum QueryStrategy {

    /** Terms are looked up via {@code findWordAsSentence}. */
    SENTENCE,

    /** Terms are looked up via {@code findWordAsTopic}. */
    TOPIC,

    /** Looked up like {@link #TOPIC}; valuation checks the {@code Site} word flag. */
    REQUIRE_FIELD_SITE,

    /** Looked up like {@link #TOPIC}; valuation checks the {@code Title} word flag. */
    REQUIRE_FIELD_TITLE,

    /** Looked up like {@link #TOPIC}; valuation checks the {@code Subjects} word flag. */
    REQUIRE_FIELD_SUBJECT,

    /**
     * Terms are looked up via {@code findWordTopicDynamicMode}. This is the value
     * the query factory starts from and falls back to for unknown {@code qs=} values.
     */
    AUTO
}

View File

@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.index.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@ -17,6 +19,7 @@ import java.util.Objects;
public class IndexResultValuator {
private final IndexMetadataService metadataService;
private final List<List<String>> searchTermVariants;
private final IndexQueryParams queryParams;
private final int[] termIdsAll;
private final TLongHashSet resultsWithPriorityTerms;
@ -24,9 +27,10 @@ public class IndexResultValuator {
private final TObjectIntHashMap<String> termToId = new TObjectIntHashMap<>(10, 0.75f, -1);
private final TermMetadata termMetadata;
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries) {
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries, IndexQueryParams queryParams) {
this.metadataService = new IndexMetadataService(indexes);
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
this.queryParams = queryParams;
var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader());
IntArrayList termIdsList = new IntArrayList();
@ -114,10 +118,15 @@ public class IndexResultValuator {
docMetadata,
resultsWithPriorityTerms.contains(searchResult.combinedId)
);
searchResult.scores.add(score);
setScore += score.termValue();
if (!filterRequired(metadata, queryParams.queryStrategy())) {
setScore += 1000;
}
if (termIdx == 0) {
setScore += score.documentValue();
}
@ -130,6 +139,19 @@ public class IndexResultValuator {
return setScore/setSize;
}
/**
 * Tells whether a term's word metadata satisfies the field requirement implied
 * by the query strategy.
 *
 * <p>Note the polarity: despite the name, {@code true} means the requirement is
 * met (or there is none). The visible caller adds a fixed penalty to the score
 * when this returns {@code false}.
 *
 * @param metadata      encoded word metadata to test against the flag
 * @param queryStrategy strategy of the current query; {@code null} and any
 *                      strategy other than the {@code REQUIRE_FIELD_*} ones
 *                      impose no requirement
 * @return {@code true} if no field is required or the required flag is present
 */
private boolean filterRequired(long metadata, QueryStrategy queryStrategy) {
    // A switch would throw on null; the reference comparisons this replaces did not.
    if (queryStrategy == null) {
        return true;
    }

    return switch (queryStrategy) {
        case REQUIRE_FIELD_SITE -> EdgePageWordFlags.Site.isPresent(metadata);
        case REQUIRE_FIELD_SUBJECT -> EdgePageWordFlags.Subjects.isPresent(metadata);
        case REQUIRE_FIELD_TITLE -> EdgePageWordFlags.Title.isPresent(metadata);
        default -> true;
    };
}
private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap<String> termToId, List<String> termList) {
long maskDirectGenerous = ~0;
long maskDirectRaw = ~0;

View File

@ -92,7 +92,8 @@ public class SearchIndex {
SearchIndexReader.IndexQueryBuilder query =
switch(params.queryStrategy()) {
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
case TOPIC -> indexReader.findWordAsTopic(orderedIncludes);
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
-> indexReader.findWordAsTopic(orderedIncludes);
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
};

View File

@ -115,11 +115,13 @@ public class EdgeIndexQueryService {
TLongHashSet consideredUrlIds;
public SearchQuery(EdgeSearchSpecification specsSet) {
this.fetchSize = specsSet.fetchSize;
this.budget = new IndexSearchBudget(specsSet.timeoutMs);
var limits = specsSet.queryLimits;
this.fetchSize = limits.fetchSize();
this.budget = new IndexSearchBudget(limits.timeoutMs());
this.subqueries = specsSet.subqueries;
this.limitByDomain = specsSet.limitByDomain;
this.limitTotal = specsSet.limitTotal;
this.limitByDomain = limits.resultsByDomain();
this.limitTotal = limits.resultsTotal();
this.consideredUrlIds = new TLongHashSet(fetchSize * 4);
@ -151,7 +153,7 @@ public class EdgeIndexQueryService {
}
}
final var evaluator = new IndexResultValuator(indexes, results, subqueries);
final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams);
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
ArrayList<EdgeSearchResultItem> refusedItems = new ArrayList<>(results.size());

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.model.search;
import lombok.*;
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
@ -9,23 +10,17 @@ import java.util.List;
/**
 * Describes a single search request: the sub-queries to run plus the limits
 * and filters that apply to it. Instances are assembled through the
 * Lombok-generated builder (see the {@code @Builder} annotation).
 */
@ToString @Getter @Builder @With @AllArgsConstructor
public class EdgeSearchSpecification {
// Sub-queries of the request; result valuation reads each one's searchTermsInclude.
public List<EdgeSearchSubquery> subqueries;
// Domain ids associated with the request — presumably a restriction filter; TODO confirm.
public List<Integer> domains;
// Identifies the search set to query; callers fill it from the search profile.
public SearchSetIdentifier searchSetIdentifier;
// NOTE(review): limitByDomain, limitTotal, timeoutMs and fetchSize carry the same
// values as the components of queryLimits below — confirm whether the flat
// fields are still intended to exist alongside it.
public final int limitByDomain;
public final int limitTotal;
// The query string as entered by the user; some callers pass an empty string.
public final String humanQuery;
public final int timeoutMs;
public final int fetchSize;
// Limits on quality, year and size; SpecificationLimit.none() is passed when unused.
public final SpecificationLimit quality;
public final SpecificationLimit year;
public final SpecificationLimit size;
// Grouped per-domain/total result caps, timeout and fetch size.
public final QueryLimits queryLimits;
// Lookup strategy for the index; the query factory sets it from a "qs=" term, otherwise AUTO.
public final QueryStrategy queryStrategy;
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@ -84,6 +85,8 @@ public class QueryFactory {
List<String> problems = new ArrayList<>();
String domain = null;
QueryStrategy queryStrategy = QueryStrategy.AUTO;
var basicQuery = queryParser.parse(query);
if (basicQuery.size() >= 8) {
@ -113,6 +116,9 @@ public class QueryFactory {
if (t.type == TokenType.SIZE_TERM) {
size = parseSpecificationLimit(t.str);
}
if (t.type == TokenType.QS_TERM) {
queryStrategy = parseQueryStrategy(t.str);
}
}
var queryPermutations = queryParser.permuteQueriesNew(basicQuery);
@ -179,25 +185,24 @@ public class QueryFactory {
}
}
int domainLimit;
if (domain != null) {
domainLimit = 100;
} else {
domainLimit = 2;
}
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
.subqueries(subqueries)
.limitTotal(100)
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
.humanQuery(query)
.timeoutMs(250)
.fetchSize(4096)
.quality(qualityLimit)
.year(year)
.size(size)
.domains(domains)
.queryStrategy(QueryStrategy.AUTO)
.queryStrategy(queryStrategy)
.searchSetIdentifier(profile.searchSetIdentifier);
if (domain != null) {
specsBuilder = specsBuilder.limitByDomain(100);
} else {
specsBuilder = specsBuilder.limitByDomain(2);
}
EdgeSearchSpecification specs = specsBuilder.build();
return new EdgeSearchQuery(specs, searchTermsHuman, domain);
@ -210,10 +215,10 @@ public class QueryFactory {
if (startChar == '=') {
return SpecificationLimit.equals(val);
}
else if (startChar == '<'){
else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
}
else if (startChar == '>'){
else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
}
else {
@ -221,6 +226,17 @@ public class QueryFactory {
}
}
/**
 * Translates the value of a {@code qs=} query term into a {@link QueryStrategy}.
 *
 * <p>Matching is case-insensitive. Recognised values are {@code RF_TITLE},
 * {@code RF_SUBJECT}, {@code RF_SITE}, {@code SENTENCE} and {@code TOPIC};
 * anything else yields {@link QueryStrategy#AUTO}.
 *
 * @param str the text following {@code qs=}; must not be {@code null}
 * @return the matching strategy, or {@code AUTO} for unrecognised input
 */
private QueryStrategy parseQueryStrategy(String str) {
    // Upper-case with Locale.ROOT: under a Turkish/Azeri default locale the
    // no-arg toUpperCase() maps 'i' to dotted 'İ', so "rf_title", "rf_site"
    // and "topic" would never match and would silently become AUTO.
    return switch (str.toUpperCase(java.util.Locale.ROOT)) {
        case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
        case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
        case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
        case "SENTENCE" -> QueryStrategy.SENTENCE;
        case "TOPIC" -> QueryStrategy.TOPIC;
        default -> QueryStrategy.AUTO;
    };
}
/**
 * Lower-cases a domain name so that differently-cased spellings compare equal.
 *
 * @param str the domain name as typed; must not be {@code null}
 * @return the domain name in lower case
 */
private String normalizeDomainName(String str) {
    // Locale.ROOT avoids locale-specific mappings: with a Turkish default
    // locale, 'I' would lower-case to dotless 'ı' and corrupt ASCII domains.
    return str.toLowerCase(java.util.Locale.ROOT);
}

View File

@ -93,6 +93,8 @@ public class QueryParser {
entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("qs=")) {
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
} else if (t.str.contains(":")) {
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
}
@ -508,6 +510,8 @@ enum TokenType implements Predicate<Token> {
SIZE_TERM,
NEAR_TERM,
QS_TERM,
QUOT,
MINUS,
QMARK,

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
@ -47,11 +48,8 @@ public class EdgeSearchQueryIndexService {
.subqueries(sqs)
.domains(Collections.emptyList())
.searchSetIdentifier(profile.searchSetIdentifier)
.limitByDomain(limitPerDomain)
.limitTotal(limitTotal)
.queryLimits(new QueryLimits(limitPerDomain, limitTotal, 150, 2048))
.humanQuery("")
.timeoutMs(150)
.fetchSize(2048)
.year(SpecificationLimit.none())
.size(SpecificationLimit.none())
.quality(SpecificationLimit.none())
@ -76,11 +74,13 @@ public class EdgeSearchQueryIndexService {
resultList.sort(resultListComparator);
UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain);
List<EdgeUrlDetails> retList = new ArrayList<>(processedQuery.specs.limitTotal);
var limits = processedQuery.specs.queryLimits;
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
List<EdgeUrlDetails> retList = new ArrayList<>(limits.resultsTotal());
for (var item : resultList) {
if (retList.size() >= processedQuery.specs.limitTotal)
if (retList.size() >= limits.resultsTotal())
break;
if (!deduplicator.shouldRemove(item)) {

View File

@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.index.service;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.index.model.*;
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
@ -80,10 +77,7 @@ public class EdgeIndexIntegrationTest {
var rsp = queryService.query(
EdgeSearchSpecification.builder()
.timeoutMs(Integer.MAX_VALUE)
.fetchSize(4000)
.limitTotal(10)
.limitByDomain(10)
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.queryStrategy(QueryStrategy.SENTENCE)
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
@ -115,10 +109,7 @@ public class EdgeIndexIntegrationTest {
var rsp = queryService.query(
EdgeSearchSpecification.builder()
.timeoutMs(Integer.MAX_VALUE)
.fetchSize(4000)
.limitTotal(10)
.limitByDomain(10)
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
@ -144,10 +135,7 @@ public class EdgeIndexIntegrationTest {
var rsp = queryService.query(
EdgeSearchSpecification.builder()
.timeoutMs(Integer.MAX_VALUE)
.fetchSize(4000)
.limitTotal(10)
.limitByDomain(10)
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.quality(SpecificationLimit.none())
.year(SpecificationLimit.equals(1998))
.size(SpecificationLimit.none())