diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 2b5cbaa0..898264e8 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -4,9 +4,6 @@ import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; @@ -147,43 +144,4 @@ public class IndexProtobufCodec { return builder.build(); } - - public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) { - if (rankingDetails == null) { - return null; - } - - return RpcResultRankingDetails.newBuilder() - .setInputs(convertRankingInputs(rankingDetails.inputs())) - .setOutput(convertRankingOutput(rankingDetails.outputs())) - .build(); - } - - private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) { - return RpcResultRankingOutputs.newBuilder() - .setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty()) - .setQualityPenalty(outputs.qualityPenalty()) - .setRankingBonus(outputs.rankingBonus()) - .setTopologyBonus(outputs.topologyBonus()) - .setDocumentLengthPenalty(outputs.documentLengthPenalty()) - .setTemporalBias(outputs.temporalBias()) - .setFlagsPenalty(outputs.flagsPenalty()) - .setOverallPart(outputs.overallPart()) - .setTcfAvgDist(outputs.tcfAvgDist()) - .setTcfFirstPosition(outputs.tcfFirstPosition()) - .setBm25Part(outputs.bm25()) - .build(); - } - - private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) { - return RpcResultRankingInputs.newBuilder() - .setRank(inputs.rank()) - .setAsl(inputs.asl()) - .setQuality(inputs.quality()) - .setSize(inputs.size()) - .setTopology(inputs.topology()) - .setYear(inputs.year()) - .addAllFlags(inputs.flags()) - .build(); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 691d374a..e6e68431 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -9,13 +9,17 @@ import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; +import nu.marginalia.api.searchquery.model.results.debug.DebugFactor; +import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup; +import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup; import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.model.EdgeUrl; import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; public class QueryProtobufCodec { @@ -138,45 +142,109 @@ public class QueryProtobufCodec { private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) { if (rankingDetails == null) return null; - var inputs = rankingDetails.getInputs(); - var outputs = rankingDetails.getOutput(); + + var docData = rankingDetails.getDocumentOutputs(); + var termData = rankingDetails.getTermOutputs(); return new ResultRankingDetails( - convertRankingInputs(inputs), - convertRankingOutputs(outputs) + convertDocumentOutputs(docData), + convertTermData(termData) ); } - private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) { - return new ResultRankingOutputs( - outputs.getAverageSentenceLengthPenalty(), - outputs.getQualityPenalty(), - outputs.getRankingBonus(), - outputs.getTopologyBonus(), - outputs.getDocumentLengthPenalty(), - outputs.getTemporalBias(), - outputs.getFlagsPenalty(), - outputs.getOverallPart(), - outputs.getBm25Part(), - outputs.getTcfAvgDist(), - outputs.getTcfFirstPosition() + private static List convertTermData(RpcResultTermRankingOutputs termData) { + Map termIdByName = new HashMap<>(); + Map> factorsByTerm = new HashMap<>(); - ); + for (int i = 0; i < termData.getTermCount(); i++) { + termIdByName.put(termData.getTerm(i), termData.getTermId(i)); + factorsByTerm.computeIfAbsent(termData.getTerm(i), k -> new ArrayList<>()) + .add(new DebugFactor(termData.getFactor(i), termData.getValue(i))); + } + + Map> factorGroupsByTerm = new HashMap<>(); + for (var entry : factorsByTerm.entrySet()) { + String term = entry.getKey(); + var factorsList = entry.getValue(); + + Map> factorsByGroup = new HashMap<>(); + + for (var factor : factorsList) { + String[] parts = factor.factor().split("\\."); + + String group, name; + + if (parts.length != 2) { + group = "unknown"; + name = parts[0]; + } else { + group = parts[0]; + name = parts[1]; + } + + + factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>()) + .add(new DebugFactor(name, factor.value())); + } + + factorsByGroup.forEach((groupName, groupData) -> { + factorGroupsByTerm.computeIfAbsent(term, k -> new ArrayList<>()) + .add(new DebugFactorGroup(groupName, groupData)); + }); + + } + + List groups = new ArrayList<>(); + + for (var entry : factorGroupsByTerm.entrySet()) { + groups.add(new DebugTermFactorGroup(entry.getKey(), termIdByName.get(entry.getKey()), entry.getValue())); + } + + return groups; } - private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) { - return new ResultRankingInputs( - inputs.getRank(), - inputs.getAsl(), - inputs.getQuality(), - inputs.getSize(), - inputs.getTopology(), - inputs.getYear(), - inputs.getFlagsList() - ); + private static List convertDocumentOutputs(RpcResultDocumentRankingOutputs docData) { + + List unclusteredFactors = new ArrayList<>(); + for (int i = 0; i < docData.getFactorCount(); i++) { + String factor = docData.getFactor(i); + String value = docData.getValue(i); + unclusteredFactors.add(new DebugFactor(factor, value)); + } + + Map> factorsByGroup = new HashMap<>(); + + for (var factor : unclusteredFactors) { + String factorName = factor.factor(); + String value = factor.value(); + + String[] parts = factorName.split("\\."); + + String group, name; + + if (parts.length != 2) { + group = "unknown"; + name = factorName; + } + else { + group = parts[0]; + name = parts[1]; + } + + factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>()) + .add(new DebugFactor(name, value)); + } + + List groups = new ArrayList<>(); + for (var entry : factorsByGroup.entrySet()) { + groups.add(new DebugFactorGroup(entry.getKey(), entry.getValue())); + } + + return groups; } + private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) { var keywordScores = new ArrayList(rawItem.getKeywordScoresCount()); @@ -189,6 +257,7 @@ public class QueryProtobufCodec { rawItem.getHtmlFeatures(), keywordScores, rawItem.getHasPriorityTerms(), + null, // Not set Double.NaN // Not set ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index dbd94638..c9599b2e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.results; import lombok.AllArgsConstructor; import lombok.Getter; +import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors; import nu.marginalia.model.id.UrlIdCodec; import org.jetbrains.annotations.NotNull; @@ -27,6 +28,8 @@ public class SearchResultItem implements Comparable { public boolean hasPrioTerm; + public DebugRankingFactors debugRankingFactors; + public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactor.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactor.java new file mode 100644 index 00000000..9eb2f6c6 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactor.java @@ -0,0 +1,4 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +public record DebugFactor(String factor, String value) { +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactorGroup.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactorGroup.java new file mode 100644 index 00000000..245cdb8c --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactorGroup.java @@ -0,0 +1,5 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +import java.util.List; + +public record DebugFactorGroup(String name, List factors) {} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugRankingFactors.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugRankingFactors.java new file mode 100644 index 00000000..25d012d3 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugRankingFactors.java @@ -0,0 +1,38 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +import it.unimi.dsi.fastutil.ints.IntIterator; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringJoiner; + +public class DebugRankingFactors { + private final List documentFactors = new ArrayList<>(); + private final List termFactors = new ArrayList<>(); + + public DebugRankingFactors() {} + + public void addDocumentFactor(String factor, String value) { + documentFactors.add(new DebugFactor(factor, value)); + } + + public void addTermFactor(long termId, String factor, String value) { + termFactors.add(new DebugTermFactor(termId, null, factor, value)); + } + public void addTermFactor(long termId, String factor, IntIterator sequenceIter) { + if (!sequenceIter.hasNext()) return; + + StringJoiner joiner = new StringJoiner(","); + while (sequenceIter.hasNext()) { + joiner.add(String.valueOf(sequenceIter.nextInt())); + } + termFactors.add(new DebugTermFactor(termId, null, factor, joiner.toString())); + } + public List getDocumentFactors() { + return documentFactors; + } + + public List getTermFactors() { + return termFactors; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactor.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactor.java new file mode 100644 index 00000000..84b944f3 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactor.java @@ -0,0 +1,4 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +public record DebugTermFactor(long termId, String term, String factor, String value) { +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactorGroup.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactorGroup.java new file mode 100644 index 00000000..303b7eec --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactorGroup.java @@ -0,0 +1,6 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +import java.util.List; + +public record DebugTermFactorGroup(String term, long termId, List factorList) { +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java index c94200e2..e4bca962 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java @@ -1,6 +1,9 @@ package nu.marginalia.api.searchquery.model.results.debug; -public record ResultRankingDetails(ResultRankingInputs inputs, ResultRankingOutputs outputs) +import java.util.List; + +public record ResultRankingDetails(List docFactorGroups, + List termFactorGroups) { } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java deleted file mode 100644 index 86169416..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.api.searchquery.model.results.debug; - -import java.util.List; - -public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List flags) {} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java deleted file mode 100644 index e9c490e8..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.api.searchquery.model.results.debug; - - -public record ResultRankingOutputs(double averageSentenceLengthPenalty, - double qualityPenalty, - double rankingBonus, - double topologyBonus, - double documentLengthPenalty, - double temporalBias, - double flagsPenalty, - double overallPart, - double bm25, - double tcfAvgDist, - double tcfFirstPosition) -{ -} diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index ee6e669b..640e5fdb 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -143,8 +143,8 @@ message RpcResultRankingParameters { } message RpcResultRankingDetails { - RpcResultRankingInputs inputs = 1; - RpcResultRankingOutputs output = 2; + RpcResultDocumentRankingOutputs documentOutputs = 1; + RpcResultTermRankingOutputs termOutputs = 2; } message RpcResultRankingInputs { @@ -158,19 +158,16 @@ message RpcResultRankingInputs { } /** Summary of the output of the ranking function */ -message RpcResultRankingOutputs { - double averageSentenceLengthPenalty = 1; - double qualityPenalty = 2; - double rankingBonus = 3; - double topologyBonus = 4; - double documentLengthPenalty = 5; - double temporalBias = 6; - double flagsPenalty = 7; - double overallPart = 8; - double bm25Part = 9; - // 10-14 unused - double tcfAvgDist = 15; - double tcfFirstPosition = 16; +message RpcResultDocumentRankingOutputs { + repeated string factor = 1; + repeated string value = 2; +} + +message RpcResultTermRankingOutputs { + repeated int64 termId = 1; + repeated string term = 2; + repeated string factor = 3; + repeated string value = 4; } /* Defines a single subquery */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 68e077a4..e1614166 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -118,7 +118,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .labels(nodeName, "GRPC") .time(() -> { // Perform the search - return executeSearch(params); + try { + return executeSearch(params); + } + catch (Exception ex) { + logger.error("Error in handling request", ex); + return List.of(); + } }); // Prometheus bookkeeping @@ -286,7 +292,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { awaitCompletion(); // Return the best results - return resultValuator.selectBestResults(parameters, resultHeap); + return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap); } /** Wait for all tasks to complete */ @@ -399,6 +405,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } } } + private boolean execute() throws InterruptedException { long start = System.currentTimeMillis(); @@ -417,7 +424,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { stallTime.addAndGet(System.currentTimeMillis() - start); resultHeap.addAll( - resultValuator.rankResults(parameters, rankingContext, resultIds) + resultValuator.rankResults(parameters, false, rankingContext, resultIds) ); } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java index 8c94cefd..810a1880 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java @@ -6,13 +6,13 @@ import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import gnu.trove.map.hash.TObjectLongHashMap; import it.unimi.dsi.fastutil.longs.LongArrayList; -import nu.marginalia.api.searchquery.RpcDecoratedResultItem; -import nu.marginalia.api.searchquery.RpcRawResultItem; -import nu.marginalia.api.searchquery.RpcResultKeywordScore; +import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; @@ -48,6 +48,7 @@ public class IndexResultRankingService { } public List rankResults(SearchParameters params, + boolean exportDebugData, ResultRankingContext rankingContext, CombinedDocIdList resultIds) { @@ -99,10 +100,19 @@ public class IndexResultRankingService { continue; } - // Calculate the preliminary score - var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions); - if (score != null) { - results.add(score); + if (!exportDebugData) { + var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions); + if (score != null) { + results.add(score); + } + } + else { + var rankingFactors = new DebugRankingFactors(); + var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions); + if (score != null) { + score.debugRankingFactors = rankingFactors; + results.add(score); + } } } @@ -112,6 +122,7 @@ public class IndexResultRankingService { public List selectBestResults(SearchParameters params, + ResultRankingContext resultRankingContext, Collection results) throws SQLException { var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); @@ -136,6 +147,25 @@ public class IndexResultRankingService { } } + // If we're exporting debug data from the ranking, we need to re-run the ranking calculation + // for the selected results, as this would be comically expensive to do for all the results we + // discard along the way + + if (params.rankingParams.exportDebugData) { + var combinedIdsList = new LongArrayList(resultsList.size()); + for (var item : resultsList) { + combinedIdsList.add(item.combinedId); + } + + resultsList.clear(); + resultsList.addAll(this.rankResults( + params, + true, + resultRankingContext, + new CombinedDocIdList(combinedIdsList)) + ); + } + // Fetch the document details for the selected results in one go, from the local document database // for this index partition Map detailsById = new HashMap<>(idsList.size()); @@ -189,11 +219,45 @@ public class IndexResultRankingService { decoratedBuilder.setPubYear(docData.pubYear()); } - /* FIXME - var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails); - if (rankingDetails != null) { - decoratedBuilder.setRankingDetails(rankingDetails); - }*/ + if (result.debugRankingFactors != null) { + var debugFactors = result.debugRankingFactors; + var detailsBuilder = RpcResultRankingDetails.newBuilder(); + var documentOutputs = RpcResultDocumentRankingOutputs.newBuilder(); + + for (var factor : debugFactors.getDocumentFactors()) { + documentOutputs.addFactor(factor.factor()); + documentOutputs.addValue(factor.value()); + } + + detailsBuilder.setDocumentOutputs(documentOutputs); + + var termOutputs = RpcResultTermRankingOutputs.newBuilder(); + + CqDataLong termIds = params.compiledQueryIds.data;; + + for (var entry : debugFactors.getTermFactors()) { + String term = "[ERROR IN LOOKUP]"; + + // CURSED: This is a linear search, but the number of terms is small, and it's in a debug path + for (int i = 0; i < termIds.size(); i++) { + if (termIds.get(i) == entry.termId()) { + term = params.compiledQuery.at(i); + break; + } + } + + termOutputs + .addTermId(entry.termId()) + .addTerm(term) + .addFactor(entry.factor()) + .addValue(entry.value()); + } + + detailsBuilder.setTermOutputs(termOutputs); + decoratedBuilder.setRankingDetails(detailsBuilder); + } + + resultItems.add(decoratedBuilder.build()); } return resultItems; diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 1d52e2c4..9b5d0e33 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -7,6 +7,7 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors; import nu.marginalia.index.forward.spans.DocumentSpans; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; @@ -57,6 +58,7 @@ public class IndexResultScoreCalculator { @Nullable public SearchResultItem calculateScore(Arena arena, + @Nullable DebugRankingFactors rankingFactors, long combinedId, QuerySearchTerms searchTerms, long[] wordFlags, @@ -88,6 +90,8 @@ public class IndexResultScoreCalculator { DocumentSpans spans = index.getDocumentSpans(arena, docId); double score = calculateSearchResultValue( + rankingFactors, + searchTerms, wordFlagsQuery, positionsQuery, docMetadata, @@ -157,7 +161,9 @@ public class IndexResultScoreCalculator { return true; } - public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, + public double calculateSearchResultValue(DebugRankingFactors rankingFactors, + QuerySearchTerms searchTerms, + CompiledQueryLong wordFlagsQuery, CompiledQuery positionsQuery, long documentMetadata, int features, @@ -344,12 +350,82 @@ public class IndexResultScoreCalculator { + verbatimMatchScore + keywordMinDistFac; + + double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx)); double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.sqrt(firstPosition)); double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx)); double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx)); + if (rankingFactors != null) { + rankingFactors.addDocumentFactor("overall.averageSentenceLengthPenalty", Double.toString(averageSentenceLengthPenalty)); + rankingFactors.addDocumentFactor("overall.documentLengthPenalty", Double.toString(documentLengthPenalty)); + rankingFactors.addDocumentFactor("overall.qualityPenalty", Double.toString(qualityPenalty)); + rankingFactors.addDocumentFactor("overall.rankingBonus", Double.toString(rankingBonus)); + rankingFactors.addDocumentFactor("overall.topologyBonus", Double.toString(topologyBonus)); + rankingFactors.addDocumentFactor("overall.temporalBias", Double.toString(temporalBias)); + rankingFactors.addDocumentFactor("overall.flagsPenalty", Double.toString(flagsPenalty)); + rankingFactors.addDocumentFactor("overall.verbatimMatchScore", Double.toString(verbatimMatchScore)); + rankingFactors.addDocumentFactor("overall.keywordMinDistFac", Double.toString(keywordMinDistFac)); + + rankingFactors.addDocumentFactor("tcf.avgDist", Double.toString(tcfAvgDist)); + rankingFactors.addDocumentFactor("tcf.firstPosition", Double.toString(tcfFirstPosition)); + + rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25)); + rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags)); + + for (int i = 0; i < searchTerms.termIdsAll.size(); i++) { + long termId = searchTerms.termIdsAll.at(i); + + rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i])); + byte flags = (byte) wordFlagsQuery.at(i); + + for (var flag : WordFlags.values()) { + if (flag.isPresent(flags)) { + rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true"); + } + } + + if (verbatimMatchInAnchor) { + rankingFactors.addTermFactor(termId, "verbatim.anchor", "true"); + } + if (verbatimMatchInBody) { + rankingFactors.addTermFactor(termId, "verbatim.body", "true"); + } + if (verbatimMatchInCode) { + rankingFactors.addTermFactor(termId, "verbatim.code", "true"); + } + if (verbatimMatchInExtLink) { + rankingFactors.addTermFactor(termId, "verbatim.extLink", "true"); + } + if (verbatimMatchInHeading) { + rankingFactors.addTermFactor(termId, "verbatim.heading", "true"); + } + if (verbatimMatchInNav) { + rankingFactors.addTermFactor(termId, "verbatim.nav", "true"); + } + if (verbatimMatchInTitle) { + rankingFactors.addTermFactor(termId, "verbatim.title", "true"); + } + + rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount)); + rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount)); + + if (positions[i] != null) { + rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator()); + rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator()); + rankingFactors.addTermFactor(termId, "positions.heading", SequenceOperations.findIntersections(spans.heading.iterator(), positions[i].iterator()).iterator()); + rankingFactors.addTermFactor(termId, "positions.anchor", SequenceOperations.findIntersections(spans.anchor.iterator(), positions[i].iterator()).iterator()); + rankingFactors.addTermFactor(termId, "positions.code", SequenceOperations.findIntersections(spans.code.iterator(), positions[i].iterator()).iterator()); + rankingFactors.addTermFactor(termId, "positions.nav", SequenceOperations.findIntersections(spans.nav.iterator(), positions[i].iterator()).iterator()); + rankingFactors.addTermFactor(termId, "positions.externalLinkText", SequenceOperations.findIntersections(spans.externalLinkText.iterator(), positions[i].iterator()).iterator()); + } + + } + + } + // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function double ret = normalize( diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 4966e5f0..de538945 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, null, Double.NaN); } } \ No newline at end of file diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java index 4ed3b838..73a989bf 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java @@ -3,12 +3,12 @@ package nu.marginalia.query; import com.google.common.base.Strings; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.QueryGRPCService; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import spark.Request; @@ -82,7 +82,7 @@ public class QueryBasicInterface { domainCount, count, 250, 8192 ), set); - var rankingParams = rankingParamsFromRequest(request); + var rankingParams = debugRankingParamsFromRequest(request); var detailedDirectResult = queryGRPCService.executeDirect( queryString, queryParams, rankingParams @@ -98,7 +98,7 @@ public class QueryBasicInterface { ); } - private ResultRankingParameters rankingParamsFromRequest(Request request) { + private ResultRankingParameters debugRankingParamsFromRequest(Request request) { var sensibleDefaults = ResultRankingParameters.sensibleDefaults(); return ResultRankingParameters.builder() diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb index 4d2e7e41..ddcbfcdc 100644 --- a/code/services-core/query-service/resources/templates/qdebug.hdb +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -102,27 +102,26 @@

{{description}}

dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}
- {{#with rankingDetails.inputs}} -
Rank: {{rank}}
-
ASL: {{asl}}
-
Quality: {{quality}}
-
Size: {{size}}
-
Topology: {{topology}}
-
Year: {{year}}
-
Flags: {{#each flags}} {{.}} {{/each}}
+ {{#with rankingDetails.docFactorGroups}} + {{#each .}} +
{{name}}
+ {{#each factors}} +
{{factor}}: {{value}}
+ {{/each}} + {{/each}} {{/with}} - {{#with rankingDetails.outputs}} -
Average Sentence Length Penalty: {{averageSentenceLengthPenalty}}
-
Quality Penalty: {{qualityPenalty}}
-
Ranking Bonus: {{rankingBonus}}
-
Topology Bonus: {{topologyBonus}}
-
Document Length Penalty: {{documentLengthPenalty}}
-
Temporal Bias: {{temporalBias}}
-
Flags Penalty: {{flagsPenalty}}
-
Overall Part: {{overallPart}}
-
TCF Avg Distance: {{tcfAvgDist}}
-
TCF First Position: {{tcfFirstPosition}}
-
BM25: {{bM25}}
+ + {{#with rankingDetails.termFactorGroups}} + {{#each .}} +
{{termId}}:{{term}}
+ {{#each factorList}} +
{{name}}
+ {{#each factors}} +
{{factor}}: {{value}}
+ {{/each}} + + {{/each}} + {{/each}} {{/with}}