diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java new file mode 100644 index 00000000..dce78343 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryLimits.java @@ -0,0 +1,4 @@ +package nu.marginalia.wmsa.edge.index.model; + +public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) { +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java index 7f3632c1..d8682a61 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/QueryStrategy.java @@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model; public enum QueryStrategy { SENTENCE, TOPIC, + + REQUIRE_FIELD_SITE, + REQUIRE_FIELD_TITLE, + REQUIRE_FIELD_SUBJECT, + AUTO } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java index 1c928fc5..da2e92f8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/IndexResultValuator.java @@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; +import nu.marginalia.wmsa.edge.index.model.QueryStrategy; +import nu.marginalia.wmsa.edge.index.query.IndexQueryParams; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; @@ -17,6 +19,7 @@ import java.util.Objects; public class IndexResultValuator { private final IndexMetadataService metadataService; private final List> searchTermVariants; + private final IndexQueryParams queryParams; private final int[] termIdsAll; private final TLongHashSet resultsWithPriorityTerms; @@ -24,9 +27,10 @@ public class IndexResultValuator { private final TObjectIntHashMap termToId = new TObjectIntHashMap<>(10, 0.75f, -1); private final TermMetadata termMetadata; - public IndexResultValuator(SearchIndexControl indexes, TLongList results, List subqueries) { + public IndexResultValuator(SearchIndexControl indexes, TLongList results, List subqueries, IndexQueryParams queryParams) { this.metadataService = new IndexMetadataService(indexes); this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + this.queryParams = queryParams; var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader()); IntArrayList termIdsList = new IntArrayList(); @@ -114,10 +118,15 @@ public class IndexResultValuator { docMetadata, resultsWithPriorityTerms.contains(searchResult.combinedId) ); + searchResult.scores.add(score); setScore += score.termValue(); + if (!filterRequired(metadata, queryParams.queryStrategy())) { + setScore += 1000; + } + if (termIdx == 0) { setScore += score.documentValue(); } @@ -130,6 +139,19 @@ public class IndexResultValuator { return setScore/setSize; } + private boolean filterRequired(long metadata, QueryStrategy queryStrategy) { + if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { + return EdgePageWordFlags.Site.isPresent(metadata); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { + return EdgePageWordFlags.Subjects.isPresent(metadata); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { + return EdgePageWordFlags.Title.isPresent(metadata); + } + return true; + } + private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap termToId, List termList) { long maskDirectGenerous = ~0; long maskDirectRaw = ~0; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java index 6d70fab6..72c0e13f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/postings/SearchIndex.java @@ -92,7 +92,8 @@ public class SearchIndex { SearchIndexReader.IndexQueryBuilder query = switch(params.queryStrategy()) { case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes); - case TOPIC -> indexReader.findWordAsTopic(orderedIncludes); + case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT + -> indexReader.findWordAsTopic(orderedIncludes); case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes); }; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java index 16f100de..d04b37b6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -115,11 +115,13 @@ public class EdgeIndexQueryService { TLongHashSet consideredUrlIds; public SearchQuery(EdgeSearchSpecification specsSet) { - this.fetchSize = specsSet.fetchSize; - this.budget = new IndexSearchBudget(specsSet.timeoutMs); + var limits = specsSet.queryLimits; + + this.fetchSize = limits.fetchSize(); + this.budget = new IndexSearchBudget(limits.timeoutMs()); this.subqueries = specsSet.subqueries; - this.limitByDomain = specsSet.limitByDomain; - this.limitTotal = specsSet.limitTotal; + this.limitByDomain = limits.resultsByDomain(); + this.limitTotal = limits.resultsTotal(); this.consideredUrlIds = new TLongHashSet(fetchSize * 4); @@ -151,7 +153,7 @@ public class EdgeIndexQueryService { } } - final var evaluator = new IndexResultValuator(indexes, results, subqueries); + final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams); ArrayList items = new ArrayList<>(results.size()); ArrayList refusedItems = new ArrayList<>(results.size()); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java index a1289a42..f60a4b8f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchSpecification.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.model.search; import lombok.*; +import nu.marginalia.wmsa.edge.index.model.QueryLimits; import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier; import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; @@ -9,23 +10,17 @@ import java.util.List; @ToString @Getter @Builder @With @AllArgsConstructor public class EdgeSearchSpecification { - public List subqueries; public List domains; public SearchSetIdentifier searchSetIdentifier; - public final int limitByDomain; - public final int limitTotal; - public final String humanQuery; - public final int timeoutMs; - public final int fetchSize; - public final SpecificationLimit quality; public final SpecificationLimit year; public final SpecificationLimit size; + public final QueryLimits queryLimits; public final QueryStrategy queryStrategy; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java index 6cf8a050..31ec39bc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryFactory.java @@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter; import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict; +import nu.marginalia.wmsa.edge.index.model.QueryLimits; import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery; @@ -84,6 +85,8 @@ public class QueryFactory { List problems = new ArrayList<>(); String domain = null; + QueryStrategy queryStrategy = QueryStrategy.AUTO; + var basicQuery = queryParser.parse(query); if (basicQuery.size() >= 8) { @@ -113,6 +116,9 @@ public class QueryFactory { if (t.type == TokenType.SIZE_TERM) { size = parseSpecificationLimit(t.str); } + if (t.type == TokenType.QS_TERM) { + queryStrategy = parseQueryStrategy(t.str); + } } var queryPermutations = queryParser.permuteQueriesNew(basicQuery); @@ -179,25 +185,24 @@ public class QueryFactory { } } + int domainLimit; + if (domain != null) { + domainLimit = 100; + } else { + domainLimit = 2; + } + EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder() .subqueries(subqueries) - .limitTotal(100) + .queryLimits(new QueryLimits(domainLimit, 100, 250, 4096)) .humanQuery(query) - .timeoutMs(250) - .fetchSize(4096) .quality(qualityLimit) .year(year) .size(size) .domains(domains) - .queryStrategy(QueryStrategy.AUTO) + .queryStrategy(queryStrategy) .searchSetIdentifier(profile.searchSetIdentifier); - if (domain != null) { - specsBuilder = specsBuilder.limitByDomain(100); - } else { - specsBuilder = specsBuilder.limitByDomain(2); - } - EdgeSearchSpecification specs = specsBuilder.build(); return new EdgeSearchQuery(specs, searchTermsHuman, domain); @@ -210,10 +215,10 @@ public class QueryFactory { if (startChar == '=') { return SpecificationLimit.equals(val); } - else if (startChar == '<'){ + else if (startChar == '<') { return SpecificationLimit.lessThan(val); } - else if (startChar == '>'){ + else if (startChar == '>') { return SpecificationLimit.greaterThan(val); } else { @@ -221,6 +226,17 @@ public class QueryFactory { } } + private QueryStrategy parseQueryStrategy(String str) { + return switch (str.toUpperCase()) { + case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; + case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; + case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; + case "SENTENCE" -> QueryStrategy.SENTENCE; + case "TOPIC" -> QueryStrategy.TOPIC; + default -> QueryStrategy.AUTO; + }; + } + private String normalizeDomainName(String str) { return str.toLowerCase(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java index 354ba0ce..04b91c88 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/query/QueryParser.java @@ -93,6 +93,8 @@ public class QueryParser { entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); + } else if (t.str.startsWith("qs=")) { + entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); } else if (t.str.contains(":")) { entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); } @@ -508,6 +510,8 @@ enum TokenType implements Predicate { SIZE_TERM, NEAR_TERM, + QS_TERM, + QUOT, MINUS, QMARK, diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java index 0c4cffc2..c9a63bc5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchQueryIndexService.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.model.QueryLimits; import nu.marginalia.wmsa.edge.index.model.QueryStrategy; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit; @@ -47,11 +48,8 @@ public class EdgeSearchQueryIndexService { .subqueries(sqs) .domains(Collections.emptyList()) .searchSetIdentifier(profile.searchSetIdentifier) - .limitByDomain(limitPerDomain) - .limitTotal(limitTotal) + .queryLimits(new QueryLimits(limitPerDomain, limitTotal, 150, 2048)) .humanQuery("") - .timeoutMs(150) - .fetchSize(2048) .year(SpecificationLimit.none()) .size(SpecificationLimit.none()) .quality(SpecificationLimit.none()) @@ -76,11 +74,13 @@ public class EdgeSearchQueryIndexService { resultList.sort(resultListComparator); - UrlDeduplicator deduplicator = new UrlDeduplicator(processedQuery.specs.limitByDomain); - List retList = new ArrayList<>(processedQuery.specs.limitTotal); + var limits = processedQuery.specs.queryLimits; + + UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); + List retList = new ArrayList<>(limits.resultsTotal()); for (var item : resultList) { - if (retList.size() >= processedQuery.specs.limitTotal) + if (retList.size() >= limits.resultsTotal()) break; if (!deduplicator.shouldRemove(item)) { diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java index a59e4fd0..27d820ea 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexIntegrationTest.java @@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.index.service; import com.google.inject.Guice; import com.google.inject.Inject; import nu.marginalia.wmsa.configuration.server.Initialization; -import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags; -import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata; -import nu.marginalia.wmsa.edge.index.model.QueryStrategy; +import nu.marginalia.wmsa.edge.index.model.*; import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntryHeader; @@ -80,10 +77,7 @@ public class EdgeIndexIntegrationTest { var rsp = queryService.query( EdgeSearchSpecification.builder() - .timeoutMs(Integer.MAX_VALUE) - .fetchSize(4000) - .limitTotal(10) - .limitByDomain(10) + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .queryStrategy(QueryStrategy.SENTENCE) .year(SpecificationLimit.none()) .quality(SpecificationLimit.none()) @@ -115,10 +109,7 @@ public class EdgeIndexIntegrationTest { var rsp = queryService.query( EdgeSearchSpecification.builder() - .timeoutMs(Integer.MAX_VALUE) - .fetchSize(4000) - .limitTotal(10) - .limitByDomain(10) + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .year(SpecificationLimit.none()) .quality(SpecificationLimit.none()) .size(SpecificationLimit.none()) @@ -144,10 +135,7 @@ public class EdgeIndexIntegrationTest { var rsp = queryService.query( EdgeSearchSpecification.builder() - .timeoutMs(Integer.MAX_VALUE) - .fetchSize(4000) - .limitTotal(10) - .limitByDomain(10) + .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .quality(SpecificationLimit.none()) .year(SpecificationLimit.equals(1998)) .size(SpecificationLimit.none())