mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(wip) Repair qdebug utility and show new ranking details
This commit is contained in:
parent
7babdb87d5
commit
2e89b55593
@ -4,9 +4,6 @@ import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
@ -147,43 +144,4 @@ public class IndexProtobufCodec {
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
|
||||
public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) {
|
||||
if (rankingDetails == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return RpcResultRankingDetails.newBuilder()
|
||||
.setInputs(convertRankingInputs(rankingDetails.inputs()))
|
||||
.setOutput(convertRankingOutput(rankingDetails.outputs()))
|
||||
.build();
|
||||
}
|
||||
|
||||
private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) {
|
||||
return RpcResultRankingOutputs.newBuilder()
|
||||
.setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty())
|
||||
.setQualityPenalty(outputs.qualityPenalty())
|
||||
.setRankingBonus(outputs.rankingBonus())
|
||||
.setTopologyBonus(outputs.topologyBonus())
|
||||
.setDocumentLengthPenalty(outputs.documentLengthPenalty())
|
||||
.setTemporalBias(outputs.temporalBias())
|
||||
.setFlagsPenalty(outputs.flagsPenalty())
|
||||
.setOverallPart(outputs.overallPart())
|
||||
.setTcfAvgDist(outputs.tcfAvgDist())
|
||||
.setTcfFirstPosition(outputs.tcfFirstPosition())
|
||||
.setBm25Part(outputs.bm25())
|
||||
.build();
|
||||
}
|
||||
|
||||
private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) {
|
||||
return RpcResultRankingInputs.newBuilder()
|
||||
.setRank(inputs.rank())
|
||||
.setAsl(inputs.asl())
|
||||
.setQuality(inputs.quality())
|
||||
.setSize(inputs.size())
|
||||
.setTopology(inputs.topology())
|
||||
.setYear(inputs.year())
|
||||
.addAllFlags(inputs.flags())
|
||||
.build();
|
||||
}
|
||||
}
|
||||
|
@ -9,13 +9,17 @@ import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
public class QueryProtobufCodec {
|
||||
|
||||
@ -138,45 +142,109 @@ public class QueryProtobufCodec {
|
||||
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
|
||||
if (rankingDetails == null)
|
||||
return null;
|
||||
var inputs = rankingDetails.getInputs();
|
||||
var outputs = rankingDetails.getOutput();
|
||||
|
||||
var docData = rankingDetails.getDocumentOutputs();
|
||||
var termData = rankingDetails.getTermOutputs();
|
||||
|
||||
return new ResultRankingDetails(
|
||||
convertRankingInputs(inputs),
|
||||
convertRankingOutputs(outputs)
|
||||
convertDocumentOutputs(docData),
|
||||
convertTermData(termData)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) {
|
||||
return new ResultRankingOutputs(
|
||||
outputs.getAverageSentenceLengthPenalty(),
|
||||
outputs.getQualityPenalty(),
|
||||
outputs.getRankingBonus(),
|
||||
outputs.getTopologyBonus(),
|
||||
outputs.getDocumentLengthPenalty(),
|
||||
outputs.getTemporalBias(),
|
||||
outputs.getFlagsPenalty(),
|
||||
outputs.getOverallPart(),
|
||||
outputs.getBm25Part(),
|
||||
outputs.getTcfAvgDist(),
|
||||
outputs.getTcfFirstPosition()
|
||||
private static List<DebugTermFactorGroup> convertTermData(RpcResultTermRankingOutputs termData) {
|
||||
Map<String, Long> termIdByName = new HashMap<>();
|
||||
Map<String, List<DebugFactor>> factorsByTerm = new HashMap<>();
|
||||
|
||||
);
|
||||
for (int i = 0; i < termData.getTermCount(); i++) {
|
||||
termIdByName.put(termData.getTerm(i), termData.getTermId(i));
|
||||
factorsByTerm.computeIfAbsent(termData.getTerm(i), k -> new ArrayList<>())
|
||||
.add(new DebugFactor(termData.getFactor(i), termData.getValue(i)));
|
||||
}
|
||||
|
||||
Map<String, List<DebugFactorGroup>> factorGroupsByTerm = new HashMap<>();
|
||||
for (var entry : factorsByTerm.entrySet()) {
|
||||
String term = entry.getKey();
|
||||
var factorsList = entry.getValue();
|
||||
|
||||
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
|
||||
|
||||
for (var factor : factorsList) {
|
||||
String[] parts = factor.factor().split("\\.");
|
||||
|
||||
String group, name;
|
||||
|
||||
if (parts.length != 2) {
|
||||
group = "unknown";
|
||||
name = parts[0];
|
||||
} else {
|
||||
group = parts[0];
|
||||
name = parts[1];
|
||||
}
|
||||
|
||||
|
||||
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
|
||||
.add(new DebugFactor(name, factor.value()));
|
||||
}
|
||||
|
||||
factorsByGroup.forEach((groupName, groupData) -> {
|
||||
factorGroupsByTerm.computeIfAbsent(term, k -> new ArrayList<>())
|
||||
.add(new DebugFactorGroup(groupName, groupData));
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
List<DebugTermFactorGroup> groups = new ArrayList<>();
|
||||
|
||||
for (var entry : factorGroupsByTerm.entrySet()) {
|
||||
groups.add(new DebugTermFactorGroup(entry.getKey(), termIdByName.get(entry.getKey()), entry.getValue()));
|
||||
}
|
||||
|
||||
return groups;
|
||||
}
|
||||
|
||||
private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) {
|
||||
return new ResultRankingInputs(
|
||||
inputs.getRank(),
|
||||
inputs.getAsl(),
|
||||
inputs.getQuality(),
|
||||
inputs.getSize(),
|
||||
inputs.getTopology(),
|
||||
inputs.getYear(),
|
||||
inputs.getFlagsList()
|
||||
);
|
||||
private static List<DebugFactorGroup> convertDocumentOutputs(RpcResultDocumentRankingOutputs docData) {
|
||||
|
||||
List<DebugFactor> unclusteredFactors = new ArrayList<>();
|
||||
for (int i = 0; i < docData.getFactorCount(); i++) {
|
||||
String factor = docData.getFactor(i);
|
||||
String value = docData.getValue(i);
|
||||
unclusteredFactors.add(new DebugFactor(factor, value));
|
||||
}
|
||||
|
||||
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
|
||||
|
||||
for (var factor : unclusteredFactors) {
|
||||
String factorName = factor.factor();
|
||||
String value = factor.value();
|
||||
|
||||
String[] parts = factorName.split("\\.");
|
||||
|
||||
String group, name;
|
||||
|
||||
if (parts.length != 2) {
|
||||
group = "unknown";
|
||||
name = factorName;
|
||||
}
|
||||
else {
|
||||
group = parts[0];
|
||||
name = parts[1];
|
||||
}
|
||||
|
||||
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
|
||||
.add(new DebugFactor(name, value));
|
||||
}
|
||||
|
||||
List<DebugFactorGroup> groups = new ArrayList<>();
|
||||
for (var entry : factorsByGroup.entrySet()) {
|
||||
groups.add(new DebugFactorGroup(entry.getKey(), entry.getValue()));
|
||||
}
|
||||
|
||||
return groups;
|
||||
}
|
||||
|
||||
|
||||
private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) {
|
||||
var keywordScores = new ArrayList<SearchResultKeywordScore>(rawItem.getKeywordScoresCount());
|
||||
|
||||
@ -189,6 +257,7 @@ public class QueryProtobufCodec {
|
||||
rawItem.getHtmlFeatures(),
|
||||
keywordScores,
|
||||
rawItem.getHasPriorityTerms(),
|
||||
null, // Not set
|
||||
Double.NaN // Not set
|
||||
);
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@ -27,6 +28,8 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
|
||||
public boolean hasPrioTerm;
|
||||
|
||||
public DebugRankingFactors debugRankingFactors;
|
||||
|
||||
public SearchResultItem(long combinedId,
|
||||
long encodedDocMetadata,
|
||||
int htmlFeatures) {
|
||||
|
@ -0,0 +1,4 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
/**
 * A single named debug value.
 *
 * @param factor name of the factor
 * @param value  the factor's value, already rendered as a string
 */
public record DebugFactor(String factor, String value) {}
|
@ -0,0 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
 * A named collection of debug factors.
 *
 * @param name    name of the group
 * @param factors the factors belonging to the group
 */
public record DebugFactorGroup(String name, List<DebugFactor> factors) {
}
|
@ -0,0 +1,38 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class DebugRankingFactors {
|
||||
private final List<DebugFactor> documentFactors = new ArrayList<>();
|
||||
private final List<DebugTermFactor> termFactors = new ArrayList<>();
|
||||
|
||||
public DebugRankingFactors() {}
|
||||
|
||||
public void addDocumentFactor(String factor, String value) {
|
||||
documentFactors.add(new DebugFactor(factor, value));
|
||||
}
|
||||
|
||||
public void addTermFactor(long termId, String factor, String value) {
|
||||
termFactors.add(new DebugTermFactor(termId, null, factor, value));
|
||||
}
|
||||
public void addTermFactor(long termId, String factor, IntIterator sequenceIter) {
|
||||
if (!sequenceIter.hasNext()) return;
|
||||
|
||||
StringJoiner joiner = new StringJoiner(",");
|
||||
while (sequenceIter.hasNext()) {
|
||||
joiner.add(String.valueOf(sequenceIter.nextInt()));
|
||||
}
|
||||
termFactors.add(new DebugTermFactor(termId, null, factor, joiner.toString()));
|
||||
}
|
||||
public List<DebugFactor> getDocumentFactors() {
|
||||
return documentFactors;
|
||||
}
|
||||
|
||||
public List<DebugTermFactor> getTermFactors() {
|
||||
return termFactors;
|
||||
}
|
||||
}
|
@ -0,0 +1,4 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
/**
 * A single named debug value attached to a query term.
 *
 * @param termId numeric id of the term
 * @param term   text of the term; may be null
 * @param factor name of the factor
 * @param value  the factor's value, already rendered as a string
 */
public record DebugTermFactor(long termId, String term, String factor, String value) {}
|
@ -0,0 +1,6 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record DebugTermFactorGroup(String term, long termId, List<DebugFactorGroup> factorList) {
|
||||
}
|
@ -1,6 +1,9 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
public record ResultRankingDetails(ResultRankingInputs inputs, ResultRankingOutputs outputs)
|
||||
import java.util.List;
|
||||
|
||||
public record ResultRankingDetails(List<DebugFactorGroup> docFactorGroups,
|
||||
List<DebugTermFactorGroup> termFactorGroups)
|
||||
{
|
||||
|
||||
}
|
||||
|
@ -1,5 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List<String> flags) {}
|
@ -1,16 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.results.debug;
|
||||
|
||||
|
||||
public record ResultRankingOutputs(double averageSentenceLengthPenalty,
|
||||
double qualityPenalty,
|
||||
double rankingBonus,
|
||||
double topologyBonus,
|
||||
double documentLengthPenalty,
|
||||
double temporalBias,
|
||||
double flagsPenalty,
|
||||
double overallPart,
|
||||
double bm25,
|
||||
double tcfAvgDist,
|
||||
double tcfFirstPosition)
|
||||
{
|
||||
}
|
@ -143,8 +143,8 @@ message RpcResultRankingParameters {
|
||||
}
|
||||
|
||||
message RpcResultRankingDetails {
|
||||
RpcResultRankingInputs inputs = 1;
|
||||
RpcResultRankingOutputs output = 2;
|
||||
RpcResultDocumentRankingOutputs documentOutputs = 1;
|
||||
RpcResultTermRankingOutputs termOutputs = 2;
|
||||
}
|
||||
|
||||
message RpcResultRankingInputs {
|
||||
@ -158,19 +158,16 @@ message RpcResultRankingInputs {
|
||||
}
|
||||
|
||||
/** Summary of the output of the ranking function */
|
||||
message RpcResultRankingOutputs {
|
||||
double averageSentenceLengthPenalty = 1;
|
||||
double qualityPenalty = 2;
|
||||
double rankingBonus = 3;
|
||||
double topologyBonus = 4;
|
||||
double documentLengthPenalty = 5;
|
||||
double temporalBias = 6;
|
||||
double flagsPenalty = 7;
|
||||
double overallPart = 8;
|
||||
double bm25Part = 9;
|
||||
// 10-14 unused
|
||||
double tcfAvgDist = 15;
|
||||
double tcfFirstPosition = 16;
|
||||
message RpcResultDocumentRankingOutputs {
|
||||
repeated string factor = 1;
|
||||
repeated string value = 2;
|
||||
}
|
||||
|
||||
message RpcResultTermRankingOutputs {
|
||||
repeated int64 termId = 1;
|
||||
repeated string term = 2;
|
||||
repeated string factor = 3;
|
||||
repeated string value = 4;
|
||||
}
|
||||
|
||||
/* Defines a single subquery */
|
||||
|
@ -118,7 +118,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
||||
.labels(nodeName, "GRPC")
|
||||
.time(() -> {
|
||||
// Perform the search
|
||||
return executeSearch(params);
|
||||
try {
|
||||
return executeSearch(params);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
return List.of();
|
||||
}
|
||||
});
|
||||
|
||||
// Prometheus bookkeeping
|
||||
@ -286,7 +292,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
||||
awaitCompletion();
|
||||
|
||||
// Return the best results
|
||||
return resultValuator.selectBestResults(parameters, resultHeap);
|
||||
return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
|
||||
}
|
||||
|
||||
/** Wait for all tasks to complete */
|
||||
@ -399,6 +405,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean execute() throws InterruptedException {
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
@ -417,7 +424,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
||||
stallTime.addAndGet(System.currentTimeMillis() - start);
|
||||
|
||||
resultHeap.addAll(
|
||||
resultValuator.rankResults(parameters, rankingContext, resultIds)
|
||||
resultValuator.rankResults(parameters, false, rankingContext, resultIds)
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -6,13 +6,13 @@ import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import gnu.trove.map.hash.TObjectLongHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcRawResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcResultKeywordScore;
|
||||
import nu.marginalia.api.searchquery.*;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
@ -48,6 +48,7 @@ public class IndexResultRankingService {
|
||||
}
|
||||
|
||||
public List<SearchResultItem> rankResults(SearchParameters params,
|
||||
boolean exportDebugData,
|
||||
ResultRankingContext rankingContext,
|
||||
CombinedDocIdList resultIds)
|
||||
{
|
||||
@ -99,10 +100,19 @@ public class IndexResultRankingService {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Calculate the preliminary score
|
||||
var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions);
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
if (!exportDebugData) {
|
||||
var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions);
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
else {
|
||||
var rankingFactors = new DebugRankingFactors();
|
||||
var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions);
|
||||
if (score != null) {
|
||||
score.debugRankingFactors = rankingFactors;
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -112,6 +122,7 @@ public class IndexResultRankingService {
|
||||
|
||||
|
||||
public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
|
||||
ResultRankingContext resultRankingContext,
|
||||
Collection<SearchResultItem> results) throws SQLException {
|
||||
|
||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||
@ -136,6 +147,25 @@ public class IndexResultRankingService {
|
||||
}
|
||||
}
|
||||
|
||||
// If we're exporting debug data from the ranking, we need to re-run the ranking calculation
|
||||
// for the selected results, as this would be comically expensive to do for all the results we
|
||||
// discard along the way
|
||||
|
||||
if (params.rankingParams.exportDebugData) {
|
||||
var combinedIdsList = new LongArrayList(resultsList.size());
|
||||
for (var item : resultsList) {
|
||||
combinedIdsList.add(item.combinedId);
|
||||
}
|
||||
|
||||
resultsList.clear();
|
||||
resultsList.addAll(this.rankResults(
|
||||
params,
|
||||
true,
|
||||
resultRankingContext,
|
||||
new CombinedDocIdList(combinedIdsList))
|
||||
);
|
||||
}
|
||||
|
||||
// Fetch the document details for the selected results in one go, from the local document database
|
||||
// for this index partition
|
||||
Map<Long, DocdbUrlDetail> detailsById = new HashMap<>(idsList.size());
|
||||
@ -189,11 +219,45 @@ public class IndexResultRankingService {
|
||||
decoratedBuilder.setPubYear(docData.pubYear());
|
||||
}
|
||||
|
||||
/* FIXME
|
||||
var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails);
|
||||
if (rankingDetails != null) {
|
||||
decoratedBuilder.setRankingDetails(rankingDetails);
|
||||
}*/
|
||||
if (result.debugRankingFactors != null) {
|
||||
var debugFactors = result.debugRankingFactors;
|
||||
var detailsBuilder = RpcResultRankingDetails.newBuilder();
|
||||
var documentOutputs = RpcResultDocumentRankingOutputs.newBuilder();
|
||||
|
||||
for (var factor : debugFactors.getDocumentFactors()) {
|
||||
documentOutputs.addFactor(factor.factor());
|
||||
documentOutputs.addValue(factor.value());
|
||||
}
|
||||
|
||||
detailsBuilder.setDocumentOutputs(documentOutputs);
|
||||
|
||||
var termOutputs = RpcResultTermRankingOutputs.newBuilder();
|
||||
|
||||
CqDataLong termIds = params.compiledQueryIds.data;;
|
||||
|
||||
for (var entry : debugFactors.getTermFactors()) {
|
||||
String term = "[ERROR IN LOOKUP]";
|
||||
|
||||
// CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
|
||||
for (int i = 0; i < termIds.size(); i++) {
|
||||
if (termIds.get(i) == entry.termId()) {
|
||||
term = params.compiledQuery.at(i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
termOutputs
|
||||
.addTermId(entry.termId())
|
||||
.addTerm(term)
|
||||
.addFactor(entry.factor())
|
||||
.addValue(entry.value());
|
||||
}
|
||||
|
||||
detailsBuilder.setTermOutputs(termOutputs);
|
||||
decoratedBuilder.setRankingDetails(detailsBuilder);
|
||||
}
|
||||
|
||||
resultItems.add(decoratedBuilder.build());
|
||||
}
|
||||
|
||||
return resultItems;
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
@ -57,6 +58,7 @@ public class IndexResultScoreCalculator {
|
||||
|
||||
@Nullable
|
||||
public SearchResultItem calculateScore(Arena arena,
|
||||
@Nullable DebugRankingFactors rankingFactors,
|
||||
long combinedId,
|
||||
QuerySearchTerms searchTerms,
|
||||
long[] wordFlags,
|
||||
@ -88,6 +90,8 @@ public class IndexResultScoreCalculator {
|
||||
DocumentSpans spans = index.getDocumentSpans(arena, docId);
|
||||
|
||||
double score = calculateSearchResultValue(
|
||||
rankingFactors,
|
||||
searchTerms,
|
||||
wordFlagsQuery,
|
||||
positionsQuery,
|
||||
docMetadata,
|
||||
@ -157,7 +161,9 @@ public class IndexResultScoreCalculator {
|
||||
return true;
|
||||
}
|
||||
|
||||
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
||||
public double calculateSearchResultValue(DebugRankingFactors rankingFactors,
|
||||
QuerySearchTerms searchTerms,
|
||||
CompiledQueryLong wordFlagsQuery,
|
||||
CompiledQuery<CodedSequence> positionsQuery,
|
||||
long documentMetadata,
|
||||
int features,
|
||||
@ -344,12 +350,82 @@ public class IndexResultScoreCalculator {
|
||||
+ verbatimMatchScore
|
||||
+ keywordMinDistFac;
|
||||
|
||||
|
||||
|
||||
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
||||
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.sqrt(firstPosition));
|
||||
|
||||
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
|
||||
double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
|
||||
|
||||
if (rankingFactors != null) {
|
||||
rankingFactors.addDocumentFactor("overall.averageSentenceLengthPenalty", Double.toString(averageSentenceLengthPenalty));
|
||||
rankingFactors.addDocumentFactor("overall.documentLengthPenalty", Double.toString(documentLengthPenalty));
|
||||
rankingFactors.addDocumentFactor("overall.qualityPenalty", Double.toString(qualityPenalty));
|
||||
rankingFactors.addDocumentFactor("overall.rankingBonus", Double.toString(rankingBonus));
|
||||
rankingFactors.addDocumentFactor("overall.topologyBonus", Double.toString(topologyBonus));
|
||||
rankingFactors.addDocumentFactor("overall.temporalBias", Double.toString(temporalBias));
|
||||
rankingFactors.addDocumentFactor("overall.flagsPenalty", Double.toString(flagsPenalty));
|
||||
rankingFactors.addDocumentFactor("overall.verbatimMatchScore", Double.toString(verbatimMatchScore));
|
||||
rankingFactors.addDocumentFactor("overall.keywordMinDistFac", Double.toString(keywordMinDistFac));
|
||||
|
||||
rankingFactors.addDocumentFactor("tcf.avgDist", Double.toString(tcfAvgDist));
|
||||
rankingFactors.addDocumentFactor("tcf.firstPosition", Double.toString(tcfFirstPosition));
|
||||
|
||||
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
|
||||
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
|
||||
|
||||
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
|
||||
long termId = searchTerms.termIdsAll.at(i);
|
||||
|
||||
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
|
||||
byte flags = (byte) wordFlagsQuery.at(i);
|
||||
|
||||
for (var flag : WordFlags.values()) {
|
||||
if (flag.isPresent(flags)) {
|
||||
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
|
||||
}
|
||||
}
|
||||
|
||||
if (verbatimMatchInAnchor) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.anchor", "true");
|
||||
}
|
||||
if (verbatimMatchInBody) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.body", "true");
|
||||
}
|
||||
if (verbatimMatchInCode) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.code", "true");
|
||||
}
|
||||
if (verbatimMatchInExtLink) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.extLink", "true");
|
||||
}
|
||||
if (verbatimMatchInHeading) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.heading", "true");
|
||||
}
|
||||
if (verbatimMatchInNav) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.nav", "true");
|
||||
}
|
||||
if (verbatimMatchInTitle) {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
|
||||
}
|
||||
|
||||
rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount));
|
||||
rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
|
||||
|
||||
if (positions[i] != null) {
|
||||
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.heading", SequenceOperations.findIntersections(spans.heading.iterator(), positions[i].iterator()).iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.anchor", SequenceOperations.findIntersections(spans.anchor.iterator(), positions[i].iterator()).iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.code", SequenceOperations.findIntersections(spans.code.iterator(), positions[i].iterator()).iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.nav", SequenceOperations.findIntersections(spans.nav.iterator(), positions[i].iterator()).iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.externalLinkText", SequenceOperations.findIntersections(spans.externalLinkText.iterator(), positions[i].iterator()).iterator());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Renormalize to 0...15, where 0 is the best possible score;
|
||||
// this is a historical artifact of the original ranking function
|
||||
double ret = normalize(
|
||||
|
@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
|
||||
}
|
||||
|
||||
SearchResultItem forId(int domain, int ordinal) {
|
||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, Double.NaN);
|
||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, null, Double.NaN);
|
||||
}
|
||||
|
||||
}
|
@ -3,12 +3,12 @@ package nu.marginalia.query;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.QueryGRPCService;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import spark.Request;
|
||||
@ -82,7 +82,7 @@ public class QueryBasicInterface {
|
||||
domainCount, count, 250, 8192
|
||||
), set);
|
||||
|
||||
var rankingParams = rankingParamsFromRequest(request);
|
||||
var rankingParams = debugRankingParamsFromRequest(request);
|
||||
|
||||
var detailedDirectResult = queryGRPCService.executeDirect(
|
||||
queryString, queryParams, rankingParams
|
||||
@ -98,7 +98,7 @@ public class QueryBasicInterface {
|
||||
);
|
||||
}
|
||||
|
||||
private ResultRankingParameters rankingParamsFromRequest(Request request) {
|
||||
private ResultRankingParameters debugRankingParamsFromRequest(Request request) {
|
||||
var sensibleDefaults = ResultRankingParameters.sensibleDefaults();
|
||||
|
||||
return ResultRankingParameters.builder()
|
||||
|
@ -102,27 +102,26 @@
|
||||
<p>{{description}}</p>
|
||||
|
||||
<div><small class="text-muted">dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}</small></div>
|
||||
{{#with rankingDetails.inputs}}
|
||||
<div><small class="text-muted">Rank: {{rank}}</small></div>
|
||||
<div><small class="text-muted">ASL: {{asl}}</small></div>
|
||||
<div><small class="text-muted">Quality: {{quality}}</small></div>
|
||||
<div><small class="text-muted">Size: {{size}}</small></div>
|
||||
<div><small class="text-muted">Topology: {{topology}}</small></div>
|
||||
<div><small class="text-muted">Year: {{year}}</small></div>
|
||||
<div><small class="text-muted">Flags: {{#each flags}} {{.}} {{/each}}</small></div>
|
||||
{{#with rankingDetails.docFactorGroups}}
|
||||
{{#each .}}
|
||||
<div><small>{{name}}</small></div>
|
||||
{{#each factors}}
|
||||
<div><small class="text-muted">{{factor}}: {{value}}</small></div>
|
||||
{{/each}}
|
||||
{{/each}}
|
||||
{{/with}}
|
||||
{{#with rankingDetails.outputs}}
|
||||
<div><small class="text-muted">Average Sentence Length Penalty: {{averageSentenceLengthPenalty}}</small></div>
|
||||
<div><small class="text-muted">Quality Penalty: {{qualityPenalty}}</small></div>
|
||||
<div><small class="text-muted">Ranking Bonus: {{rankingBonus}}</small></div>
|
||||
<div><small class="text-muted">Topology Bonus: {{topologyBonus}}</small></div>
|
||||
<div><small class="text-muted">Document Length Penalty: {{documentLengthPenalty}}</small></div>
|
||||
<div><small class="text-muted">Temporal Bias: {{temporalBias}}</small></div>
|
||||
<div><small class="text-muted">Flags Penalty: {{flagsPenalty}}</small></div>
|
||||
<div><small class="text-muted">Overall Part: {{overallPart}}</small></div>
|
||||
<div><small class="text-muted">TCF Avg Distance: {{tcfAvgDist}}</small></div>
|
||||
<div><small class="text-muted">TCF First Position: {{tcfFirstPosition}}</small></div>
|
||||
<div><small class="text-muted">BM25: {{bM25}}</small></div>
|
||||
|
||||
{{#with rankingDetails.termFactorGroups}}
|
||||
{{#each .}}
|
||||
<div>{{termId}}:{{term}}</div>
|
||||
{{#each factorList}}
|
||||
<div>{{name}}</div>
|
||||
{{#each factors}}
|
||||
<div><small class="text-muted">{{factor}}: {{value}}</small></div>
|
||||
{{/each}}
|
||||
|
||||
{{/each}}
|
||||
{{/each}}
|
||||
{{/with}}
|
||||
|
||||
</div>
|
||||
|
Loading…
Reference in New Issue
Block a user