diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java index 83422fab..3c2c4bbc 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java @@ -10,7 +10,7 @@ import java.util.stream.Collectors; @AllArgsConstructor public class SearchSubquery { - /** These terms must be present in the document */ + /** These terms must be present in the document and are used in ranking*/ public final List searchTermsInclude; /** These terms must be absent from the document */ @@ -21,18 +21,22 @@ public class SearchSubquery { /** If these optional terms are present in the document, rank it highly */ public final List searchTermsPriority; - + + /** Terms that we require to be in the same sentence */ + public final List> searchTermCoherences; + private double value = 0; public SearchSubquery(List searchTermsInclude, List searchTermsExclude, List searchTermsAdvice, - List searchTermsPriority - ) { + List searchTermsPriority, + List> searchTermCoherences) { this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; this.searchTermsAdvice = searchTermsAdvice; this.searchTermsPriority = searchTermsPriority; + this.searchTermCoherences = searchTermCoherences; } public SearchSubquery setValue(double value) { @@ -51,6 +55,7 @@ public class SearchSubquery { if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); + if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", "))); return sb.toString(); } diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java index e2a38e52..b8696a4c 100644 --- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java +++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/results/SearchResultPreliminaryScore.java @@ -6,9 +6,7 @@ import static java.lang.Boolean.compare; import static java.lang.Double.compare; public record SearchResultPreliminaryScore( - boolean anyAllSynthetic, - int minNumberOfFlagsSet, - int minPositionsSet, + boolean disqualified, boolean hasPriorityTerm, double searchRankingScore) implements Comparable @@ -27,16 +25,7 @@ public record SearchResultPreliminaryScore( return PREFER_LOW * compare(searchRankingScore, other.searchRankingScore); } - public boolean isEmpty() { - if (minNumberOfFlagsSet > 0) - return false; - - if (anyAllSynthetic) - return false; - - if (minPositionsSet > 0) - return false; - - return true; + public boolean isDisqualified() { + return disqualified; } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java index 7f4a7fc0..2656263a 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndex.java @@ -110,11 +110,12 @@ public class SearchIndex { // that contain pairs of two search terms if (orderedIncludes.length > 1) { for (int i = 0; i + 1 < orderedIncludes.length; i++) { - var remainingWords = Arrays.copyOfRange(orderedIncludes, i+1, orderedIncludes.length); - var entrySource = indexReader - .findPriorityWord(orderedIncludes[i]) - .alsoPrioAnyOf(remainingWords); - queryHeads.add(entrySource); + for (int j = i + 1; j < orderedIncludes.length; j++) { + var entrySource = indexReader + .findPriorityWord(orderedIncludes[i]) + .alsoPrio(orderedIncludes[j]); + queryHeads.add(entrySource); + } } } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java index fdd5f2ac..aedd7da3 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/index/SearchIndexSearchTerms.java @@ -5,9 +5,18 @@ import it.unimi.dsi.fastutil.ints.IntComparator; import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; -public record SearchIndexSearchTerms(IntList includes, IntList excludes, IntList priority) { +import java.util.Collections; +import java.util.List; + +public record SearchIndexSearchTerms( + IntList includes, + IntList excludes, + IntList priority, + List coherences + ) +{ public SearchIndexSearchTerms() { - this(IntList.of(), IntList.of(), IntList.of()); + this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList()); } public boolean isEmpty() { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java index 1bbca926..105c6f04 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java @@ -8,8 +8,10 @@ import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import nu.marginalia.index.client.model.query.SearchSubquery; import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.svc.SearchTermsService; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.ResultValuator; +import java.util.ArrayList; import java.util.List; import java.util.OptionalInt; @@ -74,7 +76,27 @@ public class IndexMetadataService { } } - return new QuerySearchTerms(termToId, termIdsList.toIntArray()); + + return new QuerySearchTerms(termToId, + termIdsList.toIntArray(), + getTermCoherences(searchTermVariants)); + } + + + private TermCoherences getTermCoherences(List searchTermVariants) { + List coherences = new ArrayList<>(); + + for (var subquery : searchTermVariants) { + for (var coh : subquery.searchTermCoherences) { + int[] ids = coh.stream().map(searchTermsService::lookUpWord).filter(OptionalInt::isPresent).mapToInt(OptionalInt::getAsInt).toArray(); + coherences.add(ids); + } + + // It's assumed each subquery has identical coherences + break; + } + + return new TermCoherences(coherences); } public TLongHashSet getResultsWithPriorityTerms(List subqueries, long[] resultsArray) { @@ -116,15 +138,32 @@ public class IndexMetadataService { return termdocToMeta.getOrDefault(termdocKey(termId, docId), 0); } + public boolean testCoherence(long docId, TermCoherences coherences) { + + for (var coherenceSet : coherences.words()) { + long overlap = 0xFF_FFFF_FFFF_FFFFL; + for (var word : coherenceSet) { + overlap &= WordMetadata.decodePositions(getTermMetadata(word, docId)); + } + if (overlap == 0L) { + return false; + } + } + + return true; + } } public static class QuerySearchTerms { private final TObjectIntHashMap termToId; public final int[] termIdsAll; - public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll) { + public final TermCoherences coherences; + + public QuerySearchTerms(TObjectIntHashMap termToId, int[] termIdsAll, TermCoherences coherences) { this.termToId = termToId; this.termIdsAll = termIdsAll; + this.coherences = coherences; } public int get(String searchTerm) { @@ -132,6 +171,8 @@ public class IndexMetadataService { } } + public record TermCoherences(List words) {} + private static long termdocKey(int termId, long docId) { return (docId << 32) | termId; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index 4cdd6949..dd13e942 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -117,10 +117,15 @@ public class IndexResultValuator { double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, 5000, rankingContext); + boolean disqualified = false; + + if (!termMetadata.testCoherence(urlIdInt, searchTerms.coherences)) + disqualified = true; + else if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) + disqualified = true; + searchResult.setScore(new SearchResultPreliminaryScore( - anyAllSynthetic, - maxFlagsCount, - maxPositionsSet, + disqualified, hasPriorityTerm, score )); @@ -140,6 +145,7 @@ public class IndexResultValuator { return false; } } + return true; } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index d1269749..13acaaf2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -155,6 +155,7 @@ public class IndexQueryService { outer: // These queries are various term combinations for (var subquery : params.subqueries) { + final SearchIndexSearchTerms searchTerms = searchTermsSvc.getSearchTerms(subquery); if (searchTerms.isEmpty()) { @@ -195,16 +196,20 @@ public class IndexQueryService { } var includes = subquery.searchTermsInclude; + var advice = subquery.searchTermsAdvice; var excludes = subquery.searchTermsExclude; var priority = subquery.searchTermsPriority; - for (int i = 0; i < subquery.searchTermsInclude.size(); i++) { + for (int i = 0; i < includes.size(); i++) { logger.info(queryMarker, "{} -> {} I", includes.get(i), searchTerms.includes().getInt(i)); } - for (int i = 0; i < subquery.searchTermsExclude.size(); i++) { + for (int i = 0; i < advice.size(); i++) { + logger.info(queryMarker, "{} -> {} A", advice.get(i), searchTerms.includes().getInt(includes.size() + i)); + } + for (int i = 0; i < excludes.size(); i++) { logger.info(queryMarker, "{} -> {} E", excludes.get(i), searchTerms.excludes().getInt(i)); } - for (int i = 0; i < subquery.searchTermsPriority.size(); i++) { + for (int i = 0; i < priority.size(); i++) { logger.info(queryMarker, "{} -> {} P", priority.get(i), searchTerms.priority().getInt(i)); } } @@ -247,7 +252,7 @@ public class IndexQueryService { return Arrays.stream(resultIds.toArray()) .parallel() .mapToObj(evaluator::calculatePreliminaryScore) - .filter(score -> !score.getScore().isEmpty()) + .filter(score -> !score.getScore().isDisqualified()) .collect(Collectors.toList()); } diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java index 81d561b1..7e5cad50 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/SearchTermsService.java @@ -11,10 +11,7 @@ import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.OptionalInt; +import java.util.*; @Singleton public class SearchTermsService { @@ -30,34 +27,49 @@ public class SearchTermsService { final IntList excludes = new IntArrayList(); final IntList includes = new IntArrayList(); final IntList priority = new IntArrayList(); + final List coherences = new ArrayList<>(); - for (var include : request.searchTermsInclude) { - var word = lookUpWord(include); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + include); + if (!addEachTerm(includes, request.searchTermsInclude)) { + return new SearchIndexSearchTerms(); + } + + // This looks like a bug, but it's not + // v--- ----v + if (!addEachTerm(includes, request.searchTermsAdvice)) { + return new SearchIndexSearchTerms(); + } + + for (var coherence : request.searchTermCoherences) { + IntList parts = new IntArrayList(coherence.size()); + + if (!addEachTerm(parts, coherence)) { return new SearchIndexSearchTerms(); } - includes.add(word.getAsInt()); + + coherences.add(parts); } + // we don't care if we can't find these: + addEachTerm(excludes, request.searchTermsExclude); + addEachTerm(priority, request.searchTermsPriority); - for (var advice : request.searchTermsAdvice) { - var word = lookUpWord(advice); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + advice); - return new SearchIndexSearchTerms(); + return new SearchIndexSearchTerms(includes, excludes, priority, coherences); + } + + private boolean addEachTerm(IntList ret, List words) { + boolean success = true; + + for (var exclude : words) { + var word = lookUpWord(exclude); + + if (word.isPresent()) { + lookUpWord(exclude).ifPresent(ret::add); + } + else { + success = false; } - includes.add(word.getAsInt()); } - - for (var exclude : request.searchTermsExclude) { - lookUpWord(exclude).ifPresent(excludes::add); - } - for (var exclude : request.searchTermsPriority) { - lookUpWord(exclude).ifPresent(priority::add); - } - - return new SearchIndexSearchTerms(includes, excludes, priority); + return success; } diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java index d4a75f80..fd92354b 100644 --- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java +++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java @@ -28,11 +28,9 @@ import spark.Spark; import java.io.IOException; import java.util.*; -import java.util.stream.Collectors; import java.util.stream.IntStream; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; @Execution(SAME_THREAD) @@ -91,8 +89,8 @@ public class IndexQueryServiceIntegrationTest { .domains(new ArrayList<>()) .searchSetIdentifier(SearchSetIdentifier.NONE) .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() - ))).build()); + List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), + Collections.emptyList()))).build()); Assertions.assertArrayEquals( new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 }, @@ -123,8 +121,8 @@ public class IndexQueryServiceIntegrationTest { .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() - ))).build()); + List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), + Collections.emptyList()))).build()); Assertions.assertArrayEquals( new int[] { 210, 270 }, rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray()); @@ -149,8 +147,8 @@ public class IndexQueryServiceIntegrationTest { .searchSetIdentifier(SearchSetIdentifier.NONE) .rankingParams(ResultRankingParameters.sensibleDefaults()) .subqueries(List.of(new SearchSubquery( - List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList() - )) + List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), + Collections.emptyList())) ).build()); @@ -167,8 +165,6 @@ public class IndexQueryServiceIntegrationTest { } - - public void loadData(int id) { int[] factors = IntStream .rangeClosed(1, id) diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java index d61e5681..cb77ca32 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QueryFactory.java @@ -83,8 +83,8 @@ public class QueryFactory { Arrays.asList(termsInclude), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList() - )); + Collections.emptyList(), + Collections.emptyList())); var specs = SearchSpecification.builder() .subqueries(sqs) diff --git a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java index 730b8f99..6fcb62c2 100644 --- a/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java +++ b/code/services-core/search-service/src/main/java/nu/marginalia/search/query/QuerySearchTermsAccumulator.java @@ -9,17 +9,19 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +/** @see SearchSubquery */ public class QuerySearchTermsAccumulator implements TokenVisitor { public List searchTermsExclude = new ArrayList<>(); public List searchTermsInclude = new ArrayList<>(); public List searchTermsAdvice = new ArrayList<>(); public List searchTermsPriority = new ArrayList<>(); + public List> searchTermCoherences = new ArrayList<>(); public String near; public String domain; public SearchSubquery createSubquery() { - return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority); + return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); } public QuerySearchTermsAccumulator(SearchProfile profile, List parts) { @@ -45,11 +47,19 @@ public class QuerySearchTermsAccumulator implements TokenVisitor { public void onQuotTerm(Token token) { String[] parts = token.str.split("_"); if (parts.length > 1) { + // Prefer that the actual n-gram is present searchTermsAdvice.add(token.str); + + // Require that the terms appear in the same sentence + searchTermCoherences.add(Arrays.asList(parts)); + + // Require that each term exists in the document + // (needed for ranking) searchTermsInclude.addAll(Arrays.asList(parts)); } else { searchTermsInclude.add(token.str); + } } diff --git a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java index 8dcde846..cfbaeec6 100644 --- a/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java +++ b/code/tools/load-test/src/main/java/nu/marginalia/load_test/LoadTestMain.java @@ -29,7 +29,7 @@ public class LoadTestMain { for (int i = 0; i < 10000; i++) { String uri = "http://127.0.0.1:8080/search?query=%s&profile=corpo".formatted( - Strings.join(pickNCommonWords(2), '+') + Strings.join(pickNCommonWords(4), '+') ); HttpRequest req = HttpRequest.newBuilder(new URI(uri))