mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Debug query strategy options
This commit is contained in:
parent
b18cd0bc36
commit
4a07eda61c
@ -0,0 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.model;
|
||||
|
||||
/**
 * Immutable bundle of the numeric limits that accompany a single index query.
 *
 * @param resultsByDomain maximum number of results to keep per domain
 * @param resultsTotal    maximum number of results to return in total
 * @param timeoutMs       time budget for executing the query, in milliseconds
 * @param fetchSize       fetch size handed to the index query
 *                        (NOTE(review): exact unit is defined by the index service; confirm there)
 */
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
}
|
@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model;
|
||||
/**
 * Selects how the index should interpret the search terms of a query.
 * <p>
 * The constant order is kept stable; do not reorder without checking for
 * serialized or ordinal-based uses elsewhere in the project.
 */
public enum QueryStrategy {
    /** Look the terms up as a sentence. */
    SENTENCE,
    /** Look the terms up as a topic. */
    TOPIC,

    /** Topic lookup where the terms are additionally expected to carry the "site" word flag. */
    REQUIRE_FIELD_SITE,
    /** Topic lookup where the terms are additionally expected to carry the "title" word flag. */
    REQUIRE_FIELD_TITLE,
    /** Topic lookup where the terms are additionally expected to carry the "subjects" word flag. */
    REQUIRE_FIELD_SUBJECT,

    /** Let the index pick the lookup mode dynamically; this is the default when nothing is requested. */
    AUTO
}
|
||||
|
@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.index.query.IndexQueryParams;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
|
||||
@ -17,6 +19,7 @@ import java.util.Objects;
|
||||
public class IndexResultValuator {
|
||||
private final IndexMetadataService metadataService;
|
||||
private final List<List<String>> searchTermVariants;
|
||||
private final IndexQueryParams queryParams;
|
||||
private final int[] termIdsAll;
|
||||
|
||||
private final TLongHashSet resultsWithPriorityTerms;
|
||||
@ -24,9 +27,10 @@ public class IndexResultValuator {
|
||||
private final TObjectIntHashMap<String> termToId = new TObjectIntHashMap<>(10, 0.75f, -1);
|
||||
private final TermMetadata termMetadata;
|
||||
|
||||
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries) {
|
||||
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries, IndexQueryParams queryParams) {
|
||||
this.metadataService = new IndexMetadataService(indexes);
|
||||
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
||||
this.queryParams = queryParams;
|
||||
|
||||
var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader());
|
||||
IntArrayList termIdsList = new IntArrayList();
|
||||
@ -114,10 +118,15 @@ public class IndexResultValuator {
|
||||
docMetadata,
|
||||
resultsWithPriorityTerms.contains(searchResult.combinedId)
|
||||
);
|
||||
|
||||
searchResult.scores.add(score);
|
||||
|
||||
setScore += score.termValue();
|
||||
|
||||
if (!filterRequired(metadata, queryParams.queryStrategy())) {
|
||||
setScore += 1000;
|
||||
}
|
||||
|
||||
if (termIdx == 0) {
|
||||
setScore += score.documentValue();
|
||||
}
|
||||
@ -130,6 +139,19 @@ public class IndexResultValuator {
|
||||
return setScore/setSize;
|
||||
}
|
||||
|
||||
private boolean filterRequired(long metadata, QueryStrategy queryStrategy) {
|
||||
if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
|
||||
return EdgePageWordFlags.Site.isPresent(metadata);
|
||||
}
|
||||
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
|
||||
return EdgePageWordFlags.Subjects.isPresent(metadata);
|
||||
}
|
||||
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
|
||||
return EdgePageWordFlags.Title.isPresent(metadata);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap<String> termToId, List<String> termList) {
|
||||
long maskDirectGenerous = ~0;
|
||||
long maskDirectRaw = ~0;
|
||||
|
@ -92,7 +92,8 @@ public class SearchIndex {
|
||||
SearchIndexReader.IndexQueryBuilder query =
|
||||
switch(params.queryStrategy()) {
|
||||
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
|
||||
case TOPIC -> indexReader.findWordAsTopic(orderedIncludes);
|
||||
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
|
||||
-> indexReader.findWordAsTopic(orderedIncludes);
|
||||
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
|
||||
};
|
||||
|
||||
|
@ -115,11 +115,13 @@ public class EdgeIndexQueryService {
|
||||
TLongHashSet consideredUrlIds;
|
||||
|
||||
public SearchQuery(EdgeSearchSpecification specsSet) {
|
||||
this.fetchSize = specsSet.fetchSize;
|
||||
this.budget = new IndexSearchBudget(specsSet.timeoutMs);
|
||||
var limits = specsSet.queryLimits;
|
||||
|
||||
this.fetchSize = limits.fetchSize();
|
||||
this.budget = new IndexSearchBudget(limits.timeoutMs());
|
||||
this.subqueries = specsSet.subqueries;
|
||||
this.limitByDomain = specsSet.limitByDomain;
|
||||
this.limitTotal = specsSet.limitTotal;
|
||||
this.limitByDomain = limits.resultsByDomain();
|
||||
this.limitTotal = limits.resultsTotal();
|
||||
|
||||
this.consideredUrlIds = new TLongHashSet(fetchSize * 4);
|
||||
|
||||
@ -151,7 +153,7 @@ public class EdgeIndexQueryService {
|
||||
}
|
||||
}
|
||||
|
||||
final var evaluator = new IndexResultValuator(indexes, results, subqueries);
|
||||
final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams);
|
||||
|
||||
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
|
||||
ArrayList<EdgeSearchResultItem> refusedItems = new ArrayList<>(results.size());
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.model.search;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
|
||||
@ -9,23 +10,17 @@ import java.util.List;
|
||||
|
||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||
public class EdgeSearchSpecification {
|
||||
|
||||
public List<EdgeSearchSubquery> subqueries;
|
||||
public List<Integer> domains;
|
||||
public SearchSetIdentifier searchSetIdentifier;
|
||||
|
||||
public final int limitByDomain;
|
||||
public final int limitTotal;
|
||||
|
||||
public final String humanQuery;
|
||||
|
||||
public final int timeoutMs;
|
||||
public final int fetchSize;
|
||||
|
||||
public final SpecificationLimit quality;
|
||||
public final SpecificationLimit year;
|
||||
public final SpecificationLimit size;
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
public final QueryStrategy queryStrategy;
|
||||
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
|
||||
@ -84,6 +85,8 @@ public class QueryFactory {
|
||||
List<String> problems = new ArrayList<>();
|
||||
String domain = null;
|
||||
|
||||
QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
||||
|
||||
var basicQuery = queryParser.parse(query);
|
||||
|
||||
if (basicQuery.size() >= 8) {
|
||||
@ -113,6 +116,9 @@ public class QueryFactory {
|
||||
if (t.type == TokenType.SIZE_TERM) {
|
||||
size = parseSpecificationLimit(t.str);
|
||||
}
|
||||
if (t.type == TokenType.QS_TERM) {
|
||||
queryStrategy = parseQueryStrategy(t.str);
|
||||
}
|
||||
}
|
||||
|
||||
var queryPermutations = queryParser.permuteQueriesNew(basicQuery);
|
||||
@ -179,25 +185,24 @@ public class QueryFactory {
|
||||
}
|
||||
}
|
||||
|
||||
int domainLimit;
|
||||
if (domain != null) {
|
||||
domainLimit = 100;
|
||||
} else {
|
||||
domainLimit = 2;
|
||||
}
|
||||
|
||||
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
|
||||
.subqueries(subqueries)
|
||||
.limitTotal(100)
|
||||
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
|
||||
.humanQuery(query)
|
||||
.timeoutMs(250)
|
||||
.fetchSize(4096)
|
||||
.quality(qualityLimit)
|
||||
.year(year)
|
||||
.size(size)
|
||||
.domains(domains)
|
||||
.queryStrategy(QueryStrategy.AUTO)
|
||||
.queryStrategy(queryStrategy)
|
||||
.searchSetIdentifier(profile.searchSetIdentifier);
|
||||
|
||||
if (domain != null) {
|
||||
specsBuilder = specsBuilder.limitByDomain(100);
|
||||
} else {
|
||||
specsBuilder = specsBuilder.limitByDomain(2);
|
||||
}
|
||||
|
||||
EdgeSearchSpecification specs = specsBuilder.build();
|
||||
|
||||
return new EdgeSearchQuery(specs, searchTermsHuman, domain);
|
||||
@ -210,10 +215,10 @@ public class QueryFactory {
|
||||
if (startChar == '=') {
|
||||
return SpecificationLimit.equals(val);
|
||||
}
|
||||
else if (startChar == '<'){
|
||||
else if (startChar == '<') {
|
||||
return SpecificationLimit.lessThan(val);
|
||||
}
|
||||
else if (startChar == '>'){
|
||||
else if (startChar == '>') {
|
||||
return SpecificationLimit.greaterThan(val);
|
||||
}
|
||||
else {
|
||||
@ -221,6 +226,17 @@ public class QueryFactory {
|
||||
}
|
||||
}
|
||||
|
||||
private QueryStrategy parseQueryStrategy(String str) {
|
||||
return switch (str.toUpperCase()) {
|
||||
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
|
||||
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
|
||||
case "SENTENCE" -> QueryStrategy.SENTENCE;
|
||||
case "TOPIC" -> QueryStrategy.TOPIC;
|
||||
default -> QueryStrategy.AUTO;
|
||||
};
|
||||
}
|
||||
|
||||
private String normalizeDomainName(String str) {
|
||||
return str.toLowerCase();
|
||||
}
|
||||
|
@ -93,6 +93,8 @@ public class QueryParser {
|
||||
entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
|
||||
} else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
|
||||
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
|
||||
} else if (t.str.startsWith("qs=")) {
|
||||
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
|
||||
} else if (t.str.contains(":")) {
|
||||
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
|
||||
}
|
||||
@ -508,6 +510,8 @@ enum TokenType implements Predicate<Token> {
|
||||
SIZE_TERM,
|
||||
NEAR_TERM,
|
||||
|
||||
QS_TERM,
|
||||
|
||||
QUOT,
|
||||
MINUS,
|
||||
QMARK,
|
||||
|
@ -4,6 +4,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.model.search.*;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
|
||||
@ -47,11 +48,8 @@ public class EdgeSearchQueryIndexService {
|
||||
.subqueries(sqs)
|
||||
.domains(Collections.emptyList())
|
||||
.searchSetIdentifier(profile.searchSetIdentifier)
|
||||
.limitByDomain(limitPerDomain)
|
||||
.limitTotal(limitTotal)
|
||||
.queryLimits(new QueryLimits(limitPerDomain, limitTotal, 150, 2048))
|
||||
.humanQuery("")
|
||||
.timeoutMs(150)
|
||||
.fetchSize(2048)
|
||||
.year(SpecificationLimit.none())
|
||||
.size(SpecificationLimit.none())
|
||||
.quality(SpecificationLimit.none())
|
||||
@ -76,11 +74,13 @@ public class EdgeSearchQueryIndexService {
|
||||
|
||||
resultList.sort(resultListComparator);
|
||||
|
||||
UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain);
|
||||
List<EdgeUrlDetails> retList = new ArrayList<>(processedQuery.specs.limitTotal);
|
||||
var limits = processedQuery.specs.queryLimits;
|
||||
|
||||
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
List<EdgeUrlDetails> retList = new ArrayList<>(limits.resultsTotal());
|
||||
|
||||
for (var item : resultList) {
|
||||
if (retList.size() >= processedQuery.specs.limitTotal)
|
||||
if (retList.size() >= limits.resultsTotal())
|
||||
break;
|
||||
|
||||
if (!deduplicator.shouldRemove(item)) {
|
||||
|
@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.index.service;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.index.model.*;
|
||||
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader;
|
||||
@ -80,10 +77,7 @@ public class EdgeIndexIntegrationTest {
|
||||
|
||||
var rsp = queryService.query(
|
||||
EdgeSearchSpecification.builder()
|
||||
.timeoutMs(Integer.MAX_VALUE)
|
||||
.fetchSize(4000)
|
||||
.limitTotal(10)
|
||||
.limitByDomain(10)
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.year(SpecificationLimit.none())
|
||||
.quality(SpecificationLimit.none())
|
||||
@ -115,10 +109,7 @@ public class EdgeIndexIntegrationTest {
|
||||
|
||||
var rsp = queryService.query(
|
||||
EdgeSearchSpecification.builder()
|
||||
.timeoutMs(Integer.MAX_VALUE)
|
||||
.fetchSize(4000)
|
||||
.limitTotal(10)
|
||||
.limitByDomain(10)
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.year(SpecificationLimit.none())
|
||||
.quality(SpecificationLimit.none())
|
||||
.size(SpecificationLimit.none())
|
||||
@ -144,10 +135,7 @@ public class EdgeIndexIntegrationTest {
|
||||
|
||||
var rsp = queryService.query(
|
||||
EdgeSearchSpecification.builder()
|
||||
.timeoutMs(Integer.MAX_VALUE)
|
||||
.fetchSize(4000)
|
||||
.limitTotal(10)
|
||||
.limitByDomain(10)
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.quality(SpecificationLimit.none())
|
||||
.year(SpecificationLimit.equals(1998))
|
||||
.size(SpecificationLimit.none())
|
||||
|
Loading…
Reference in New Issue
Block a user