mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(wip) Repair qdebug utility and show new ranking details
This commit is contained in:
parent
7babdb87d5
commit
2e89b55593
@ -4,9 +4,6 @@ import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
|||||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
|
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
@ -147,43 +144,4 @@ public class IndexProtobufCodec {
|
|||||||
return builder.build();
|
return builder.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) {
|
|
||||||
if (rankingDetails == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return RpcResultRankingDetails.newBuilder()
|
|
||||||
.setInputs(convertRankingInputs(rankingDetails.inputs()))
|
|
||||||
.setOutput(convertRankingOutput(rankingDetails.outputs()))
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) {
|
|
||||||
return RpcResultRankingOutputs.newBuilder()
|
|
||||||
.setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty())
|
|
||||||
.setQualityPenalty(outputs.qualityPenalty())
|
|
||||||
.setRankingBonus(outputs.rankingBonus())
|
|
||||||
.setTopologyBonus(outputs.topologyBonus())
|
|
||||||
.setDocumentLengthPenalty(outputs.documentLengthPenalty())
|
|
||||||
.setTemporalBias(outputs.temporalBias())
|
|
||||||
.setFlagsPenalty(outputs.flagsPenalty())
|
|
||||||
.setOverallPart(outputs.overallPart())
|
|
||||||
.setTcfAvgDist(outputs.tcfAvgDist())
|
|
||||||
.setTcfFirstPosition(outputs.tcfFirstPosition())
|
|
||||||
.setBm25Part(outputs.bm25())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) {
|
|
||||||
return RpcResultRankingInputs.newBuilder()
|
|
||||||
.setRank(inputs.rank())
|
|
||||||
.setAsl(inputs.asl())
|
|
||||||
.setQuality(inputs.quality())
|
|
||||||
.setSize(inputs.size())
|
|
||||||
.setTopology(inputs.topology())
|
|
||||||
.setYear(inputs.year())
|
|
||||||
.addAllFlags(inputs.flags())
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -9,13 +9,17 @@ import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
|||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.debug.DebugFactor;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup;
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
|
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
public class QueryProtobufCodec {
|
public class QueryProtobufCodec {
|
||||||
|
|
||||||
@ -138,45 +142,109 @@ public class QueryProtobufCodec {
|
|||||||
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
|
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
|
||||||
if (rankingDetails == null)
|
if (rankingDetails == null)
|
||||||
return null;
|
return null;
|
||||||
var inputs = rankingDetails.getInputs();
|
|
||||||
var outputs = rankingDetails.getOutput();
|
var docData = rankingDetails.getDocumentOutputs();
|
||||||
|
var termData = rankingDetails.getTermOutputs();
|
||||||
|
|
||||||
return new ResultRankingDetails(
|
return new ResultRankingDetails(
|
||||||
convertRankingInputs(inputs),
|
convertDocumentOutputs(docData),
|
||||||
convertRankingOutputs(outputs)
|
convertTermData(termData)
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) {
|
private static List<DebugTermFactorGroup> convertTermData(RpcResultTermRankingOutputs termData) {
|
||||||
return new ResultRankingOutputs(
|
Map<String, Long> termIdByName = new HashMap<>();
|
||||||
outputs.getAverageSentenceLengthPenalty(),
|
Map<String, List<DebugFactor>> factorsByTerm = new HashMap<>();
|
||||||
outputs.getQualityPenalty(),
|
|
||||||
outputs.getRankingBonus(),
|
|
||||||
outputs.getTopologyBonus(),
|
|
||||||
outputs.getDocumentLengthPenalty(),
|
|
||||||
outputs.getTemporalBias(),
|
|
||||||
outputs.getFlagsPenalty(),
|
|
||||||
outputs.getOverallPart(),
|
|
||||||
outputs.getBm25Part(),
|
|
||||||
outputs.getTcfAvgDist(),
|
|
||||||
outputs.getTcfFirstPosition()
|
|
||||||
|
|
||||||
);
|
for (int i = 0; i < termData.getTermCount(); i++) {
|
||||||
|
termIdByName.put(termData.getTerm(i), termData.getTermId(i));
|
||||||
|
factorsByTerm.computeIfAbsent(termData.getTerm(i), k -> new ArrayList<>())
|
||||||
|
.add(new DebugFactor(termData.getFactor(i), termData.getValue(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
Map<String, List<DebugFactorGroup>> factorGroupsByTerm = new HashMap<>();
|
||||||
|
for (var entry : factorsByTerm.entrySet()) {
|
||||||
|
String term = entry.getKey();
|
||||||
|
var factorsList = entry.getValue();
|
||||||
|
|
||||||
|
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
|
||||||
|
|
||||||
|
for (var factor : factorsList) {
|
||||||
|
String[] parts = factor.factor().split("\\.");
|
||||||
|
|
||||||
|
String group, name;
|
||||||
|
|
||||||
|
if (parts.length != 2) {
|
||||||
|
group = "unknown";
|
||||||
|
name = parts[0];
|
||||||
|
} else {
|
||||||
|
group = parts[0];
|
||||||
|
name = parts[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
|
||||||
|
.add(new DebugFactor(name, factor.value()));
|
||||||
|
}
|
||||||
|
|
||||||
|
factorsByGroup.forEach((groupName, groupData) -> {
|
||||||
|
factorGroupsByTerm.computeIfAbsent(term, k -> new ArrayList<>())
|
||||||
|
.add(new DebugFactorGroup(groupName, groupData));
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
List<DebugTermFactorGroup> groups = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var entry : factorGroupsByTerm.entrySet()) {
|
||||||
|
groups.add(new DebugTermFactorGroup(entry.getKey(), termIdByName.get(entry.getKey()), entry.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return groups;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) {
|
private static List<DebugFactorGroup> convertDocumentOutputs(RpcResultDocumentRankingOutputs docData) {
|
||||||
return new ResultRankingInputs(
|
|
||||||
inputs.getRank(),
|
List<DebugFactor> unclusteredFactors = new ArrayList<>();
|
||||||
inputs.getAsl(),
|
for (int i = 0; i < docData.getFactorCount(); i++) {
|
||||||
inputs.getQuality(),
|
String factor = docData.getFactor(i);
|
||||||
inputs.getSize(),
|
String value = docData.getValue(i);
|
||||||
inputs.getTopology(),
|
unclusteredFactors.add(new DebugFactor(factor, value));
|
||||||
inputs.getYear(),
|
}
|
||||||
inputs.getFlagsList()
|
|
||||||
);
|
Map<String, List<DebugFactor>> factorsByGroup = new HashMap<>();
|
||||||
|
|
||||||
|
for (var factor : unclusteredFactors) {
|
||||||
|
String factorName = factor.factor();
|
||||||
|
String value = factor.value();
|
||||||
|
|
||||||
|
String[] parts = factorName.split("\\.");
|
||||||
|
|
||||||
|
String group, name;
|
||||||
|
|
||||||
|
if (parts.length != 2) {
|
||||||
|
group = "unknown";
|
||||||
|
name = factorName;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
group = parts[0];
|
||||||
|
name = parts[1];
|
||||||
|
}
|
||||||
|
|
||||||
|
factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>())
|
||||||
|
.add(new DebugFactor(name, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
List<DebugFactorGroup> groups = new ArrayList<>();
|
||||||
|
for (var entry : factorsByGroup.entrySet()) {
|
||||||
|
groups.add(new DebugFactorGroup(entry.getKey(), entry.getValue()));
|
||||||
|
}
|
||||||
|
|
||||||
|
return groups;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) {
|
private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) {
|
||||||
var keywordScores = new ArrayList<SearchResultKeywordScore>(rawItem.getKeywordScoresCount());
|
var keywordScores = new ArrayList<SearchResultKeywordScore>(rawItem.getKeywordScoresCount());
|
||||||
|
|
||||||
@ -189,6 +257,7 @@ public class QueryProtobufCodec {
|
|||||||
rawItem.getHtmlFeatures(),
|
rawItem.getHtmlFeatures(),
|
||||||
keywordScores,
|
keywordScores,
|
||||||
rawItem.getHasPriorityTerms(),
|
rawItem.getHasPriorityTerms(),
|
||||||
|
null, // Not set
|
||||||
Double.NaN // Not set
|
Double.NaN // Not set
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.results;
|
|||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
@ -27,6 +28,8 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
|
|
||||||
public boolean hasPrioTerm;
|
public boolean hasPrioTerm;
|
||||||
|
|
||||||
|
public DebugRankingFactors debugRankingFactors;
|
||||||
|
|
||||||
public SearchResultItem(long combinedId,
|
public SearchResultItem(long combinedId,
|
||||||
long encodedDocMetadata,
|
long encodedDocMetadata,
|
||||||
int htmlFeatures) {
|
int htmlFeatures) {
|
||||||
|
@ -0,0 +1,4 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.results.debug;
|
||||||
|
|
||||||
|
public record DebugFactor(String factor, String value) {
|
||||||
|
}
|
@ -0,0 +1,5 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.results.debug;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public record DebugFactorGroup(String name, List<DebugFactor> factors) {}
|
@ -0,0 +1,38 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.results.debug;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
|
public class DebugRankingFactors {
|
||||||
|
private final List<DebugFactor> documentFactors = new ArrayList<>();
|
||||||
|
private final List<DebugTermFactor> termFactors = new ArrayList<>();
|
||||||
|
|
||||||
|
public DebugRankingFactors() {}
|
||||||
|
|
||||||
|
public void addDocumentFactor(String factor, String value) {
|
||||||
|
documentFactors.add(new DebugFactor(factor, value));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void addTermFactor(long termId, String factor, String value) {
|
||||||
|
termFactors.add(new DebugTermFactor(termId, null, factor, value));
|
||||||
|
}
|
||||||
|
public void addTermFactor(long termId, String factor, IntIterator sequenceIter) {
|
||||||
|
if (!sequenceIter.hasNext()) return;
|
||||||
|
|
||||||
|
StringJoiner joiner = new StringJoiner(",");
|
||||||
|
while (sequenceIter.hasNext()) {
|
||||||
|
joiner.add(String.valueOf(sequenceIter.nextInt()));
|
||||||
|
}
|
||||||
|
termFactors.add(new DebugTermFactor(termId, null, factor, joiner.toString()));
|
||||||
|
}
|
||||||
|
public List<DebugFactor> getDocumentFactors() {
|
||||||
|
return documentFactors;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<DebugTermFactor> getTermFactors() {
|
||||||
|
return termFactors;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,4 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.results.debug;
|
||||||
|
|
||||||
|
public record DebugTermFactor(long termId, String term, String factor, String value) {
|
||||||
|
}
|
@ -0,0 +1,6 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.results.debug;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public record DebugTermFactorGroup(String term, long termId, List<DebugFactorGroup> factorList) {
|
||||||
|
}
|
@ -1,6 +1,9 @@
|
|||||||
package nu.marginalia.api.searchquery.model.results.debug;
|
package nu.marginalia.api.searchquery.model.results.debug;
|
||||||
|
|
||||||
public record ResultRankingDetails(ResultRankingInputs inputs, ResultRankingOutputs outputs)
|
import java.util.List;
|
||||||
|
|
||||||
|
public record ResultRankingDetails(List<DebugFactorGroup> docFactorGroups,
|
||||||
|
List<DebugTermFactorGroup> termFactorGroups)
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,5 +0,0 @@
|
|||||||
package nu.marginalia.api.searchquery.model.results.debug;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List<String> flags) {}
|
|
@ -1,16 +0,0 @@
|
|||||||
package nu.marginalia.api.searchquery.model.results.debug;
|
|
||||||
|
|
||||||
|
|
||||||
public record ResultRankingOutputs(double averageSentenceLengthPenalty,
|
|
||||||
double qualityPenalty,
|
|
||||||
double rankingBonus,
|
|
||||||
double topologyBonus,
|
|
||||||
double documentLengthPenalty,
|
|
||||||
double temporalBias,
|
|
||||||
double flagsPenalty,
|
|
||||||
double overallPart,
|
|
||||||
double bm25,
|
|
||||||
double tcfAvgDist,
|
|
||||||
double tcfFirstPosition)
|
|
||||||
{
|
|
||||||
}
|
|
@ -143,8 +143,8 @@ message RpcResultRankingParameters {
|
|||||||
}
|
}
|
||||||
|
|
||||||
message RpcResultRankingDetails {
|
message RpcResultRankingDetails {
|
||||||
RpcResultRankingInputs inputs = 1;
|
RpcResultDocumentRankingOutputs documentOutputs = 1;
|
||||||
RpcResultRankingOutputs output = 2;
|
RpcResultTermRankingOutputs termOutputs = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
message RpcResultRankingInputs {
|
message RpcResultRankingInputs {
|
||||||
@ -158,19 +158,16 @@ message RpcResultRankingInputs {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** Summary of the output of the ranking function */
|
/** Summary of the output of the ranking function */
|
||||||
message RpcResultRankingOutputs {
|
message RpcResultDocumentRankingOutputs {
|
||||||
double averageSentenceLengthPenalty = 1;
|
repeated string factor = 1;
|
||||||
double qualityPenalty = 2;
|
repeated string value = 2;
|
||||||
double rankingBonus = 3;
|
}
|
||||||
double topologyBonus = 4;
|
|
||||||
double documentLengthPenalty = 5;
|
message RpcResultTermRankingOutputs {
|
||||||
double temporalBias = 6;
|
repeated int64 termId = 1;
|
||||||
double flagsPenalty = 7;
|
repeated string term = 2;
|
||||||
double overallPart = 8;
|
repeated string factor = 3;
|
||||||
double bm25Part = 9;
|
repeated string value = 4;
|
||||||
// 10-14 unused
|
|
||||||
double tcfAvgDist = 15;
|
|
||||||
double tcfFirstPosition = 16;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Defines a single subquery */
|
/* Defines a single subquery */
|
||||||
|
@ -118,7 +118,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
.labels(nodeName, "GRPC")
|
.labels(nodeName, "GRPC")
|
||||||
.time(() -> {
|
.time(() -> {
|
||||||
// Perform the search
|
// Perform the search
|
||||||
return executeSearch(params);
|
try {
|
||||||
|
return executeSearch(params);
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Error in handling request", ex);
|
||||||
|
return List.of();
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Prometheus bookkeeping
|
// Prometheus bookkeeping
|
||||||
@ -286,7 +292,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
awaitCompletion();
|
awaitCompletion();
|
||||||
|
|
||||||
// Return the best results
|
// Return the best results
|
||||||
return resultValuator.selectBestResults(parameters, resultHeap);
|
return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Wait for all tasks to complete */
|
/** Wait for all tasks to complete */
|
||||||
@ -399,6 +405,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean execute() throws InterruptedException {
|
private boolean execute() throws InterruptedException {
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
@ -417,7 +424,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
stallTime.addAndGet(System.currentTimeMillis() - start);
|
stallTime.addAndGet(System.currentTimeMillis() - start);
|
||||||
|
|
||||||
resultHeap.addAll(
|
resultHeap.addAll(
|
||||||
resultValuator.rankResults(parameters, rankingContext, resultIds)
|
resultValuator.rankResults(parameters, false, rankingContext, resultIds)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,13 +6,13 @@ import gnu.trove.list.TLongList;
|
|||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
import gnu.trove.map.hash.TObjectLongHashMap;
|
import gnu.trove.map.hash.TObjectLongHashMap;
|
||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
import nu.marginalia.api.searchquery.*;
|
||||||
import nu.marginalia.api.searchquery.RpcRawResultItem;
|
|
||||||
import nu.marginalia.api.searchquery.RpcResultKeywordScore;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
@ -48,6 +48,7 @@ public class IndexResultRankingService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public List<SearchResultItem> rankResults(SearchParameters params,
|
public List<SearchResultItem> rankResults(SearchParameters params,
|
||||||
|
boolean exportDebugData,
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
CombinedDocIdList resultIds)
|
CombinedDocIdList resultIds)
|
||||||
{
|
{
|
||||||
@ -99,10 +100,19 @@ public class IndexResultRankingService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate the preliminary score
|
if (!exportDebugData) {
|
||||||
var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions);
|
var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions);
|
||||||
if (score != null) {
|
if (score != null) {
|
||||||
results.add(score);
|
results.add(score);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
var rankingFactors = new DebugRankingFactors();
|
||||||
|
var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions);
|
||||||
|
if (score != null) {
|
||||||
|
score.debugRankingFactors = rankingFactors;
|
||||||
|
results.add(score);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -112,6 +122,7 @@ public class IndexResultRankingService {
|
|||||||
|
|
||||||
|
|
||||||
public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
|
public List<RpcDecoratedResultItem> selectBestResults(SearchParameters params,
|
||||||
|
ResultRankingContext resultRankingContext,
|
||||||
Collection<SearchResultItem> results) throws SQLException {
|
Collection<SearchResultItem> results) throws SQLException {
|
||||||
|
|
||||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||||
@ -136,6 +147,25 @@ public class IndexResultRankingService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we're exporting debug data from the ranking, we need to re-run the ranking calculation
|
||||||
|
// for the selected results, as this would be comically expensive to do for all the results we
|
||||||
|
// discard along the way
|
||||||
|
|
||||||
|
if (params.rankingParams.exportDebugData) {
|
||||||
|
var combinedIdsList = new LongArrayList(resultsList.size());
|
||||||
|
for (var item : resultsList) {
|
||||||
|
combinedIdsList.add(item.combinedId);
|
||||||
|
}
|
||||||
|
|
||||||
|
resultsList.clear();
|
||||||
|
resultsList.addAll(this.rankResults(
|
||||||
|
params,
|
||||||
|
true,
|
||||||
|
resultRankingContext,
|
||||||
|
new CombinedDocIdList(combinedIdsList))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
// Fetch the document details for the selected results in one go, from the local document database
|
// Fetch the document details for the selected results in one go, from the local document database
|
||||||
// for this index partition
|
// for this index partition
|
||||||
Map<Long, DocdbUrlDetail> detailsById = new HashMap<>(idsList.size());
|
Map<Long, DocdbUrlDetail> detailsById = new HashMap<>(idsList.size());
|
||||||
@ -189,11 +219,45 @@ public class IndexResultRankingService {
|
|||||||
decoratedBuilder.setPubYear(docData.pubYear());
|
decoratedBuilder.setPubYear(docData.pubYear());
|
||||||
}
|
}
|
||||||
|
|
||||||
/* FIXME
|
if (result.debugRankingFactors != null) {
|
||||||
var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails);
|
var debugFactors = result.debugRankingFactors;
|
||||||
if (rankingDetails != null) {
|
var detailsBuilder = RpcResultRankingDetails.newBuilder();
|
||||||
decoratedBuilder.setRankingDetails(rankingDetails);
|
var documentOutputs = RpcResultDocumentRankingOutputs.newBuilder();
|
||||||
}*/
|
|
||||||
|
for (var factor : debugFactors.getDocumentFactors()) {
|
||||||
|
documentOutputs.addFactor(factor.factor());
|
||||||
|
documentOutputs.addValue(factor.value());
|
||||||
|
}
|
||||||
|
|
||||||
|
detailsBuilder.setDocumentOutputs(documentOutputs);
|
||||||
|
|
||||||
|
var termOutputs = RpcResultTermRankingOutputs.newBuilder();
|
||||||
|
|
||||||
|
CqDataLong termIds = params.compiledQueryIds.data;;
|
||||||
|
|
||||||
|
for (var entry : debugFactors.getTermFactors()) {
|
||||||
|
String term = "[ERROR IN LOOKUP]";
|
||||||
|
|
||||||
|
// CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
|
||||||
|
for (int i = 0; i < termIds.size(); i++) {
|
||||||
|
if (termIds.get(i) == entry.termId()) {
|
||||||
|
term = params.compiledQuery.at(i);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
termOutputs
|
||||||
|
.addTermId(entry.termId())
|
||||||
|
.addTerm(term)
|
||||||
|
.addFactor(entry.factor())
|
||||||
|
.addValue(entry.value());
|
||||||
|
}
|
||||||
|
|
||||||
|
detailsBuilder.setTermOutputs(termOutputs);
|
||||||
|
decoratedBuilder.setRankingDetails(detailsBuilder);
|
||||||
|
}
|
||||||
|
|
||||||
|
resultItems.add(decoratedBuilder.build());
|
||||||
}
|
}
|
||||||
|
|
||||||
return resultItems;
|
return resultItems;
|
||||||
|
@ -7,6 +7,7 @@ import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
|||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
|
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||||
import nu.marginalia.index.forward.spans.DocumentSpans;
|
import nu.marginalia.index.forward.spans.DocumentSpans;
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
@ -57,6 +58,7 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public SearchResultItem calculateScore(Arena arena,
|
public SearchResultItem calculateScore(Arena arena,
|
||||||
|
@Nullable DebugRankingFactors rankingFactors,
|
||||||
long combinedId,
|
long combinedId,
|
||||||
QuerySearchTerms searchTerms,
|
QuerySearchTerms searchTerms,
|
||||||
long[] wordFlags,
|
long[] wordFlags,
|
||||||
@ -88,6 +90,8 @@ public class IndexResultScoreCalculator {
|
|||||||
DocumentSpans spans = index.getDocumentSpans(arena, docId);
|
DocumentSpans spans = index.getDocumentSpans(arena, docId);
|
||||||
|
|
||||||
double score = calculateSearchResultValue(
|
double score = calculateSearchResultValue(
|
||||||
|
rankingFactors,
|
||||||
|
searchTerms,
|
||||||
wordFlagsQuery,
|
wordFlagsQuery,
|
||||||
positionsQuery,
|
positionsQuery,
|
||||||
docMetadata,
|
docMetadata,
|
||||||
@ -157,7 +161,9 @@ public class IndexResultScoreCalculator {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
public double calculateSearchResultValue(DebugRankingFactors rankingFactors,
|
||||||
|
QuerySearchTerms searchTerms,
|
||||||
|
CompiledQueryLong wordFlagsQuery,
|
||||||
CompiledQuery<CodedSequence> positionsQuery,
|
CompiledQuery<CodedSequence> positionsQuery,
|
||||||
long documentMetadata,
|
long documentMetadata,
|
||||||
int features,
|
int features,
|
||||||
@ -344,12 +350,82 @@ public class IndexResultScoreCalculator {
|
|||||||
+ verbatimMatchScore
|
+ verbatimMatchScore
|
||||||
+ keywordMinDistFac;
|
+ keywordMinDistFac;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
|
||||||
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.sqrt(firstPosition));
|
double tcfFirstPosition = rankingParams.tcfFirstPosition * (1.0 / Math.sqrt(firstPosition));
|
||||||
|
|
||||||
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
|
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, weightedCounts, length, ctx));
|
||||||
double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
|
double bFlags = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(rankingParams.bm25Params, wordFlagsQuery.data, weightedCounts, ctx));
|
||||||
|
|
||||||
|
if (rankingFactors != null) {
|
||||||
|
rankingFactors.addDocumentFactor("overall.averageSentenceLengthPenalty", Double.toString(averageSentenceLengthPenalty));
|
||||||
|
rankingFactors.addDocumentFactor("overall.documentLengthPenalty", Double.toString(documentLengthPenalty));
|
||||||
|
rankingFactors.addDocumentFactor("overall.qualityPenalty", Double.toString(qualityPenalty));
|
||||||
|
rankingFactors.addDocumentFactor("overall.rankingBonus", Double.toString(rankingBonus));
|
||||||
|
rankingFactors.addDocumentFactor("overall.topologyBonus", Double.toString(topologyBonus));
|
||||||
|
rankingFactors.addDocumentFactor("overall.temporalBias", Double.toString(temporalBias));
|
||||||
|
rankingFactors.addDocumentFactor("overall.flagsPenalty", Double.toString(flagsPenalty));
|
||||||
|
rankingFactors.addDocumentFactor("overall.verbatimMatchScore", Double.toString(verbatimMatchScore));
|
||||||
|
rankingFactors.addDocumentFactor("overall.keywordMinDistFac", Double.toString(keywordMinDistFac));
|
||||||
|
|
||||||
|
rankingFactors.addDocumentFactor("tcf.avgDist", Double.toString(tcfAvgDist));
|
||||||
|
rankingFactors.addDocumentFactor("tcf.firstPosition", Double.toString(tcfFirstPosition));
|
||||||
|
|
||||||
|
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
|
||||||
|
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
|
||||||
|
|
||||||
|
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
|
||||||
|
long termId = searchTerms.termIdsAll.at(i);
|
||||||
|
|
||||||
|
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
|
||||||
|
byte flags = (byte) wordFlagsQuery.at(i);
|
||||||
|
|
||||||
|
for (var flag : WordFlags.values()) {
|
||||||
|
if (flag.isPresent(flags)) {
|
||||||
|
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (verbatimMatchInAnchor) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.anchor", "true");
|
||||||
|
}
|
||||||
|
if (verbatimMatchInBody) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.body", "true");
|
||||||
|
}
|
||||||
|
if (verbatimMatchInCode) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.code", "true");
|
||||||
|
}
|
||||||
|
if (verbatimMatchInExtLink) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.extLink", "true");
|
||||||
|
}
|
||||||
|
if (verbatimMatchInHeading) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.heading", "true");
|
||||||
|
}
|
||||||
|
if (verbatimMatchInNav) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.nav", "true");
|
||||||
|
}
|
||||||
|
if (verbatimMatchInTitle) {
|
||||||
|
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
|
||||||
|
}
|
||||||
|
|
||||||
|
rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount));
|
||||||
|
rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
|
||||||
|
|
||||||
|
if (positions[i] != null) {
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.heading", SequenceOperations.findIntersections(spans.heading.iterator(), positions[i].iterator()).iterator());
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.anchor", SequenceOperations.findIntersections(spans.anchor.iterator(), positions[i].iterator()).iterator());
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.code", SequenceOperations.findIntersections(spans.code.iterator(), positions[i].iterator()).iterator());
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.nav", SequenceOperations.findIntersections(spans.nav.iterator(), positions[i].iterator()).iterator());
|
||||||
|
rankingFactors.addTermFactor(termId, "positions.externalLinkText", SequenceOperations.findIntersections(spans.externalLinkText.iterator(), positions[i].iterator()).iterator());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
// Renormalize to 0...15, where 0 is the best possible score;
|
// Renormalize to 0...15, where 0 is the best possible score;
|
||||||
// this is a historical artifact of the original ranking function
|
// this is a historical artifact of the original ranking function
|
||||||
double ret = normalize(
|
double ret = normalize(
|
||||||
|
@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SearchResultItem forId(int domain, int ordinal) {
|
SearchResultItem forId(int domain, int ordinal) {
|
||||||
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, Double.NaN);
|
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, null, Double.NaN);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
@ -3,12 +3,12 @@ package nu.marginalia.query;
|
|||||||
import com.google.common.base.Strings;
|
import com.google.common.base.Strings;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.functions.searchquery.QueryGRPCService;
|
import nu.marginalia.functions.searchquery.QueryGRPCService;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
|
||||||
import nu.marginalia.renderer.MustacheRenderer;
|
import nu.marginalia.renderer.MustacheRenderer;
|
||||||
import nu.marginalia.renderer.RendererFactory;
|
import nu.marginalia.renderer.RendererFactory;
|
||||||
import spark.Request;
|
import spark.Request;
|
||||||
@ -82,7 +82,7 @@ public class QueryBasicInterface {
|
|||||||
domainCount, count, 250, 8192
|
domainCount, count, 250, 8192
|
||||||
), set);
|
), set);
|
||||||
|
|
||||||
var rankingParams = rankingParamsFromRequest(request);
|
var rankingParams = debugRankingParamsFromRequest(request);
|
||||||
|
|
||||||
var detailedDirectResult = queryGRPCService.executeDirect(
|
var detailedDirectResult = queryGRPCService.executeDirect(
|
||||||
queryString, queryParams, rankingParams
|
queryString, queryParams, rankingParams
|
||||||
@ -98,7 +98,7 @@ public class QueryBasicInterface {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ResultRankingParameters rankingParamsFromRequest(Request request) {
|
private ResultRankingParameters debugRankingParamsFromRequest(Request request) {
|
||||||
var sensibleDefaults = ResultRankingParameters.sensibleDefaults();
|
var sensibleDefaults = ResultRankingParameters.sensibleDefaults();
|
||||||
|
|
||||||
return ResultRankingParameters.builder()
|
return ResultRankingParameters.builder()
|
||||||
|
@ -102,27 +102,26 @@
|
|||||||
<p>{{description}}</p>
|
<p>{{description}}</p>
|
||||||
|
|
||||||
<div><small class="text-muted">dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}</small></div>
|
<div><small class="text-muted">dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}</small></div>
|
||||||
{{#with rankingDetails.inputs}}
|
{{#with rankingDetails.docFactorGroups}}
|
||||||
<div><small class="text-muted">Rank: {{rank}}</small></div>
|
{{#each .}}
|
||||||
<div><small class="text-muted">ASL: {{asl}}</small></div>
|
<div><small>{{name}}</small></div>
|
||||||
<div><small class="text-muted">Quality: {{quality}}</small></div>
|
{{#each factors}}
|
||||||
<div><small class="text-muted">Size: {{size}}</small></div>
|
<div><small class="text-muted">{{factor}}: {{value}}</small></div>
|
||||||
<div><small class="text-muted">Topology: {{topology}}</small></div>
|
{{/each}}
|
||||||
<div><small class="text-muted">Year: {{year}}</small></div>
|
{{/each}}
|
||||||
<div><small class="text-muted">Flags: {{#each flags}} {{.}} {{/each}}</small></div>
|
|
||||||
{{/with}}
|
{{/with}}
|
||||||
{{#with rankingDetails.outputs}}
|
|
||||||
<div><small class="text-muted">Average Sentence Length Penalty: {{averageSentenceLengthPenalty}}</small></div>
|
{{#with rankingDetails.termFactorGroups}}
|
||||||
<div><small class="text-muted">Quality Penalty: {{qualityPenalty}}</small></div>
|
{{#each .}}
|
||||||
<div><small class="text-muted">Ranking Bonus: {{rankingBonus}}</small></div>
|
<div>{{termId}}:{{term}}</div>
|
||||||
<div><small class="text-muted">Topology Bonus: {{topologyBonus}}</small></div>
|
{{#each factorList}}
|
||||||
<div><small class="text-muted">Document Length Penalty: {{documentLengthPenalty}}</small></div>
|
<div>{{name}}</div>
|
||||||
<div><small class="text-muted">Temporal Bias: {{temporalBias}}</small></div>
|
{{#each factors}}
|
||||||
<div><small class="text-muted">Flags Penalty: {{flagsPenalty}}</small></div>
|
<div><small class="text-muted">{{factor}}: {{value}}</small></div>
|
||||||
<div><small class="text-muted">Overall Part: {{overallPart}}</small></div>
|
{{/each}}
|
||||||
<div><small class="text-muted">TCF Avg Distance: {{tcfAvgDist}}</small></div>
|
|
||||||
<div><small class="text-muted">TCF First Position: {{tcfFirstPosition}}</small></div>
|
{{/each}}
|
||||||
<div><small class="text-muted">BM25: {{bM25}}</small></div>
|
{{/each}}
|
||||||
{{/with}}
|
{{/with}}
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
|
Loading…
Reference in New Issue
Block a user