(index) Reduce the number of abstractions around result ranking

The change also restructures the internal API a bit, moving resultsFromDomain from RpcRawResultItem into RpcDecoratedResultItem: the previous arrangement was driving complexity in the code that generates these objects, and the consumer side puts all this data into the same object regardless.
This commit is contained in:
Viktor Lofgren 2024-07-15 05:18:10 +02:00
parent 8ed5b51a32
commit dfd19b5eb9
21 changed files with 633 additions and 920 deletions

View File

@ -22,6 +22,12 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/** Reads the document database, which is a SQLite database
* containing the URLs and metadata of the documents in the
* index.
* <p></p>
* The database is created by the DocumentDbWriter class.
* */
@Singleton
public class DocumentDbReader {
private final Path dbFile;
@ -52,6 +58,11 @@ public class DocumentDbReader {
}
}
/** Switches the input database file to a new file.
* <p></p>
* This is used to switch over to a new database file
* when the index is re-indexed.
* */
public void switchInput(Path newDbFile) throws IOException, SQLException {
if (!Files.isRegularFile(newDbFile)) {
logger.error("Source is not a file, refusing switch-over {}", newDbFile);
@ -78,35 +89,11 @@ public class DocumentDbReader {
connection = createConnection();
}
/** Return the URLs of all documents belonging to the given domain.
 *
 * @throws RuntimeException if the database connection is unavailable,
 *         e.g. while a database switch-over is in progress
 * */
public List<String> getUrlsFromDomain(int domainId) throws SQLException {
    if (null == connection || connection.isClosed()) {
        throw new RuntimeException("URL query temporarily unavailable due to database switch");
    }

    // Document ids embed the domain id in their high bits, so every document
    // of a domain falls in the half-open id range
    // [encodeId(domainId, 0), encodeId(domainId + 1, 0))
    final long lowerBound = UrlIdCodec.encodeId(domainId, 0);
    final long upperBound = UrlIdCodec.encodeId(domainId + 1, 0);

    final List<String> urls = new ArrayList<>();

    try (var stmt = connection.prepareStatement("""
            SELECT URL
            FROM DOCUMENT
            WHERE ID >= ? AND ID < ?
            """))
    {
        stmt.setLong(1, lowerBound);
        stmt.setLong(2, upperBound);

        var rs = stmt.executeQuery();
        while (rs.next()) {
            urls.add(rs.getString(1));
        }
    }

    return urls;
}
/** Returns the URL details for the given document ids.
* <p></p>
* This is used to get the URL details for the search
* results.
* */
public List<DocdbUrlDetail> getUrlDetails(TLongList ids) throws SQLException {
List<DocdbUrlDetail> ret = new ArrayList<>(ids.size());

View File

@ -9,6 +9,10 @@ import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.List;
/** Writes the document database, which is a SQLite database
* containing the URLs and metadata of the documents in the
* index.
* */
public class DocumentDbWriter {
private final Connection connection;

View File

@ -130,6 +130,7 @@ public class QueryProtobufCodec {
results.getWordsTotal(),
results.getBestPositions(),
results.getRankingScore(),
results.getResultsFromDomain(),
convertRankingDetails(results.getRankingDetails())
);
}
@ -187,7 +188,6 @@ public class QueryProtobufCodec {
rawItem.getEncodedDocMetadata(),
rawItem.getHtmlFeatures(),
keywordScores,
rawItem.getResultsFromDomain(),
rawItem.getHasPriorityTerms(),
Double.NaN // Not set
);
@ -256,6 +256,7 @@ public class QueryProtobufCodec {
rpcDecoratedResultItem.getWordsTotal(),
rpcDecoratedResultItem.getBestPositions(),
rpcDecoratedResultItem.getRankingScore(),
rpcDecoratedResultItem.getResultsFromDomain(),
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
);
}

View File

@ -34,6 +34,8 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public final long bestPositions;
public final double rankingScore;
public final int resultsFromDomain;
@Nullable
public ResultRankingDetails rankingDetails;
@ -43,9 +45,6 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public int domainId() {
return rawIndexResult.getDomainId();
}
public int resultsFromDomain() {
return rawIndexResult.getResultsFromDomain();
}
public List<SearchResultKeywordScore> keywordScores() {
return rawIndexResult.getKeywordScores();
@ -72,6 +71,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
int wordsTotal,
long bestPositions,
double rankingScore,
int resultsFromDomain,
@Nullable
ResultRankingDetails rankingDetails
)
@ -88,6 +88,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
this.wordsTotal = wordsTotal;
this.bestPositions = bestPositions;
this.rankingScore = rankingScore;
this.resultsFromDomain = resultsFromDomain;
this.rankingDetails = rankingDetails;
}

View File

@ -25,9 +25,6 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
/** How did the subqueries match against the document ? */
public final List<SearchResultKeywordScore> keywordScores;
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public boolean hasPrioTerm;
public SearchResultItem(long combinedId,

View File

@ -93,12 +93,12 @@ message RpcDecoratedResultItem {
double rankingScore = 11; // The ranking score of this search result item, lower is better
int64 bestPositions = 12;
RpcResultRankingDetails rankingDetails = 13; // optional, only present if exportDebugData is true in RpcResultRankingParameters
int32 resultsFromDomain = 14;
}
/** A raw index-service view of a search result */
message RpcRawResultItem {
int64 combinedId = 1; // raw ID with bit-encoded ranking information still present
int32 resultsFromDomain = 2; // number of other results from the same domain
int64 encodedDocMetadata = 3; // bit encoded document metadata
int32 htmlFeatures = 4; // bitmask encoding features of the document
repeated RpcResultKeywordScore keywordScores = 5;

View File

@ -20,7 +20,7 @@ import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
import nu.marginalia.index.results.IndexResultValuatorService;
import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
@ -81,7 +81,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
private final IndexResultValuatorService resultValuator;
private final IndexResultRankingService resultValuator;
private final String nodeName;
@ -91,7 +91,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
IndexResultValuatorService resultValuator)
IndexResultRankingService resultValuator)
{
var nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
@ -135,7 +135,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
var rawItem = RpcRawResultItem.newBuilder();
rawItem.setCombinedId(rawResult.combinedId);
rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
rawItem.setHasPriorityTerms(rawResult.hasPrioTerm);
@ -159,6 +158,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
.setUrlQuality(result.urlQuality)
.setWordsTotal(result.wordsTotal)
.setBestPositions(result.bestPositions)
.setResultsFromDomain(result.resultsFromDomain)
.setRawItem(rawItem);
var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails);

View File

@ -3,7 +3,6 @@ package nu.marginalia.index.index;
import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;

View File

@ -1,33 +1,38 @@
package nu.marginalia.ranking.results.factors;
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.model.idx.WordMetadata;
import java.util.BitSet;
import java.util.List;
public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
/** Visitor for calculating the best BM25 score for a graph representing a search query
*/
public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
private static final long AVG_LENGTH = 5000;
private final CqDataInt counts;
private final CqDataInt frequencies;
private final Bm25Parameters bm25Parameters;
private final double k1;
private final double b;
private final int docCount;
private final int length;
private final BitSet mask;
public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
CqDataInt counts,
int length,
ResultRankingContext ctx) {
public Bm25GraphVisitor(Bm25Parameters bm25Parameters,
CqDataInt counts,
int length,
ResultRankingContext ctx) {
this.length = length;
this.bm25Parameters = bm25Parameters;
this.k1 = bm25Parameters.k();
this.b = bm25Parameters.b();
this.docCount = ctx.termFreqDocCount();
this.counts = counts;
this.frequencies = ctx.fullCounts;
@ -37,9 +42,11 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
@Override
public double onAnd(List<? extends CqExpression> parts) {
double value = 0;
for (var part : parts) {
value += part.visit(this);
}
return value;
}
@ -59,10 +66,9 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
}
double count = counts.get(idx);
int freq = frequencies.get(idx);
return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
return invFreq(docCount, freq) * f(count, length);
}
/**
@ -76,14 +82,12 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
/**
*
* @param k determines the size of the impact of a single term
* @param b determines the magnitude of the length normalization
* @param count number of occurrences in the document
* @param length document length
*/
private double f(double k, double b, double count, int length) {
private double f(double count, int length) {
final double lengthRatio = (double) length / AVG_LENGTH;
return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
return (count * (k1 + 1)) / (count + k1 * (1 - b + b * lengthRatio));
}
}

View File

@ -1,96 +0,0 @@
package nu.marginalia.index.results;
import com.google.inject.Inject;
import gnu.trove.map.hash.TObjectLongHashMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.results.model.ids.TermIdList;
import java.lang.foreign.Arena;
import java.util.ArrayList;
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
/** Helper service for the index that resolves per-document term metadata
 * and assembles the search-term representation used by the ranking code. */
public class IndexMetadataService {
    private final StatefulIndex statefulIndex;

    @Inject
    public IndexMetadataService(StatefulIndex index) {
        this.statefulIndex = index;
    }

    /** For each term id in termIdsList, fetch the term metadata for all
     * documents in combinedIdsAll, keyed by term id. */
    public Long2ObjectArrayMap<TermMetadataList>
    getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList)
    {
        var indexReader = statefulIndex.get();

        var metadataByTermId = new Long2ObjectArrayMap<TermMetadataList>(termIdsList.size());

        for (long termId : termIdsList.array()) {
            metadataByTermId.put(termId, indexReader.getTermMetadata(arena, termId, combinedIdsAll));
        }

        return metadataByTermId;
    }

    /** Assemble the QuerySearchTerms for the given query: the full term id
     * list, the priority term id list, the word-to-id mapping, and the
     * coherence constraint groups. */
    public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
        LongArrayList allTermIds = new LongArrayList();
        LongArrayList prioTermIds = new LongArrayList();

        // -1 serves as the map's no-entry sentinel
        TObjectLongHashMap<String> idByWord = new TObjectLongHashMap<>(10, 0.75f, -1);

        // Every term of the compiled query goes in the full list
        for (String word : compiledQuery) {
            long wordId = SearchTermsUtil.getWordId(word);
            allTermIds.add(wordId);
            idByWord.put(word, wordId);
        }

        // Advice terms are appended unless already seen
        for (var term : searchQuery.searchTermsAdvice) {
            if (!idByWord.containsKey(term)) {
                long wordId = SearchTermsUtil.getWordId(term);
                allTermIds.add(wordId);
                idByWord.put(term, wordId);
            }
        }

        // Priority terms always go in the priority list; terms not yet
        // seen are also appended to the full list
        for (var term : searchQuery.searchTermsPriority) {
            long wordId = SearchTermsUtil.getWordId(term);
            if (!idByWord.containsKey(term)) {
                allTermIds.add(wordId);
                idByWord.put(term, wordId);
            }
            prioTermIds.add(wordId);
        }

        var idsAll = new TermIdList(allTermIds);
        var idsPrio = new TermIdList(prioTermIds);

        var constraints = new ArrayList<TermCoherenceGroup>();
        for (var coherence : searchQuery.searchTermCoherences) {
            constraints.add(new TermCoherenceGroup(coherence, idsAll));
        }

        return new QuerySearchTerms(idByWord, idsAll, idsPrio, new TermCoherenceGroupList(constraints));
    }
}

View File

@ -0,0 +1,229 @@
package nu.marginalia.index.results;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import gnu.trove.map.hash.TObjectLongHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.foreign.Arena;
import java.sql.SQLException;
import java.util.*;
/** Ranks and decorates search results for the index service.
 * <p></p>
 * Scores candidate documents against the query, deduplicates results
 * by domain, and joins the survivors with URL details from the local
 * document database.
 * */
@Singleton
public class IndexResultRankingService {
private static final Logger logger = LoggerFactory.getLogger(IndexResultRankingService.class);
// Local (per-partition) document database, used to decorate results with URL details
private final DocumentDbReader documentDbReader;
private final StatefulIndex statefulIndex;
@Inject
public IndexResultRankingService(DocumentDbReader documentDbReader,
StatefulIndex statefulIndex)
{
this.documentDbReader = documentDbReader;
this.statefulIndex = statefulIndex;
}
/** Calculate preliminary scores for the given candidate documents.
 * <p></p>
 * Term flags and positions are fetched for all candidates up front, then
 * each document is scored via IndexResultScoreCalculator.  Documents that
 * fail the mandatory coherence constraints, or that the calculator rejects
 * as irrelevant, are omitted from the returned list.
 * */
public List<SearchResultItem> rankResults(SearchParameters params,
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
{
IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params);
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
// Get the current index reader, which is the one we'll use for this calculation,
// this may change during the calculation, but we don't want to switch over mid-calculation
final CombinedIndexReader currentIndex = statefulIndex.get();
final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query);
final int termCount = searchTerms.termIdsAll.size();
// We use an arena for the position data to avoid gc pressure
// from the gamma coded sequences, which can be large and have a lifetime
// that matches the try block here
try (var arena = Arena.ofConfined()) {
TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
for (int ti = 0; ti < termCount; ti++) {
termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds);
}
// Data for the document. We arrange this in arrays outside the calculation function to avoid
// hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache
// thrashing in there; out here we can rely on implicit array ordering to match up the data.
long[] flags = new long[termCount];
GammaCodedSequence[] positions = new GammaCodedSequence[termCount];
// Iterate over documents by their index in the combinedDocIds, as we need the index for the
// term data arrays as well
for (int i = 0; i < resultIds.size(); i++) {
// Prepare term-level data for the document
for (int ti = 0; ti < flags.length; ti++) {
var tfd = termsForDocs[ti];
assert tfd != null : "No term data for term " + ti;
flags[ti] = tfd.flag(i);
positions[ti] = tfd.position(i);
}
// Ignore documents that don't match the mandatory constraints
if (!searchTerms.coherences.testMandatory(positions)) {
continue;
}
// Calculate the preliminary score
var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions);
if (score != null) {
results.add(score);
}
}
return results;
}
}
/** Pick at most params.limitTotal results, at most params.limitByDomain per
 * domain, and decorate them with document details from the local database.
 * <p></p>
 * The per-domain counts gathered by the deduplicator are attached to the
 * decorated results, which is why the filter must see every result and the
 * selection loop never breaks early.
 * */
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
Collection<SearchResultItem> results) throws SQLException {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
TLongList idsList = new TLongArrayList(params.limitTotal);
for (var item : results) {
if (domainCountFilter.test(item)) {
if (resultsList.size() < params.limitTotal) {
resultsList.add(item);
idsList.add(item.getDocumentId());
}
//
// else { break; } <-- don't add this even though it looks like it should be present!
//
// It's important that this filter runs across all results, not just the top N,
// so we shouldn't break the loop in a putative else-case here!
//
}
}
// Fetch the document details for the selected results in one go, from the local document database
// for this index partition
Map<Long, DocdbUrlDetail> detailsById = new HashMap<>(idsList.size());
for (var item : documentDbReader.getUrlDetails(idsList)) {
detailsById.put(item.urlId(), item);
}
List<DecoratedSearchResultItem> resultItems = new ArrayList<>(resultsList.size());
// Decorate the results with the document details
for (var result : resultsList) {
final long id = result.getDocumentId();
final DocdbUrlDetail docData = detailsById.get(id);
if (docData == null) {
// presumably possible if the document database and the index are out of sync -- TODO confirm
logger.warn("No document data for id {}", id);
continue;
}
// Create a decorated search result item from the result and the document data
resultItems.add(new DecoratedSearchResultItem(
result,
docData.url(),
docData.title(),
docData.description(),
docData.urlQuality(),
docData.format(),
docData.features(),
docData.pubYear(),
docData.dataHash(),
docData.wordsTotal(),
0L, //bestPositions(wordMetas) -- not computed here; NOTE(review): confirm the 0L placeholder is intended
result.getScore(),
domainCountFilter.getCount(result),
null // rankingDetails, not populated at this stage
));
}
return resultItems;
}
/** Assemble the search-term representation for the query: the full term id
 * list, the priority term id list, the word-to-id mapping, and the
 * coherence constraint groups.
 * */
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
LongArrayList termIdsList = new LongArrayList();
LongArrayList termIdsPrio = new LongArrayList();
// -1 serves as the map's no-entry sentinel
TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
// Every term of the compiled query goes in the full list
for (String word : compiledQuery) {
long id = SearchTermsUtil.getWordId(word);
termIdsList.add(id);
termToId.put(word, id);
}
// Advice terms are appended unless already seen
for (var term : searchQuery.searchTermsAdvice) {
if (termToId.containsKey(term)) {
continue;
}
long id = SearchTermsUtil.getWordId(term);
termIdsList.add(id);
termToId.put(term, id);
}
// Priority terms always go in the priority list; terms not yet seen
// are also appended to the full list
for (var term : searchQuery.searchTermsPriority) {
if (termToId.containsKey(term)) {
long id = SearchTermsUtil.getWordId(term);
termIdsPrio.add(id);
}
else {
long id = SearchTermsUtil.getWordId(term);
termIdsList.add(id);
termIdsPrio.add(id);
termToId.put(term, id);
}
}
var idsAll = new TermIdList(termIdsList);
var idsPrio = new TermIdList(termIdsPrio);
var constraints = new ArrayList<TermCoherenceGroupList.TermCoherenceGroup>();
for (var coherence : searchQuery.searchTermCoherences) {
constraints.add(new TermCoherenceGroupList.TermCoherenceGroup(coherence, idsAll));
}
return new QuerySearchTerms(termToId,
idsAll,
idsPrio,
new TermCoherenceGroupList(constraints)
);
}
}

View File

@ -0,0 +1,349 @@
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.*;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
/** This class is responsible for calculating the score of a search result.
 * It holds the data required to perform the scoring, as there is strong
 * reasons to cache this data, and performs the calculations */
public class IndexResultScoreCalculator {
private final CombinedIndexReader index;
private final QueryParams queryParams;
private final ResultRankingContext rankingContext;
private final CompiledQuery<String> compiledQuery;
public IndexResultScoreCalculator(StatefulIndex statefulIndex,
ResultRankingContext rankingContext,
SearchParameters params)
{
// Capture the current reader once; the stateful index may switch readers later
this.index = statefulIndex.get();
this.rankingContext = rankingContext;
this.queryParams = params.queryParams;
this.compiledQuery = params.compiledQuery;
}
// Flags marking terms that appear in a "significant" field of the document
// (title, subjects, url domain/path, external link text)
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
/** Score a single candidate document against the query.
 *
 * @param combinedId document id with ranking bits still encoded
 * @param searchTerms the query's term ids and coherence constraints
 * @param wordFlags per-term word flag bitmasks for this document
 * @param positions per-term position sequences for this document; null entries
 *                  mean the term is absent
 * @return the scored result, or null if the document is deemed irrelevant
 * */
@Nullable
public SearchResultItem calculateScore(long combinedId,
QuerySearchTerms searchTerms,
long[] wordFlags,
GammaCodedSequence[] positions)
{
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
// Per-term occurrence counts, derived from the position sequences
int[] counts = new int[compiledQuery.size()];
for (int i = 0; i < counts.length; i++) {
if (positions[i] != null) {
counts[i] = positions[i].valueCount();
}
}
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
// If the document is not relevant to the query, abort early to reduce allocations and
// avoid unnecessary calculations
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
return null;
}
long docId = UrlIdCodec.removeRank(combinedId);
long docMetadata = index.getDocumentMetadata(docId);
int htmlFeatures = index.getHtmlFeatures(docId);
int docSize = index.getDocumentSize(docId);
int bestCoherence = searchTerms.coherences.testOptional(positions);
double score = calculateSearchResultValue(
wordFlagsQuery,
positionsCountQuery,
positionsQuery,
docMetadata,
htmlFeatures,
docSize,
bestCoherence,
rankingContext);
SearchResultItem searchResult = new SearchResultItem(docId,
docMetadata,
htmlFeatures);
// Lower scores are better, so the presence of a priority term improves the result
if (hasPrioTerm(searchTerms, positions)) {
score = 0.75 * score;
}
searchResult.setScore(score);
return searchResult;
}
/** Returns true if the document should be REJECTED as irrelevant to the
 * query; note the inverted sense of the return value. */
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return true;
}
// Reject documents with no significant-field hits, no positions, and no synthetic terms
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
return true;
}
return false;
}
/** Returns true if any of the query's priority terms is present in the document. */
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
var allTerms = searchTerms.termIdsAll;
var prioTerms = searchTerms.termIdsPrio;
for (int i = 0; i < allTerms.size(); i++) {
// positions[i] != null means term i is present in the document
if (positions[i] != null && prioTerms.contains(allTerms.at(i))) {
return true;
}
}
return false;
}
/** Check the query strategy's field requirements against the word flags of
 * the query graph.  The permissive strategies (AUTO, SENTENCE, TOPIC)
 * always pass. */
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
QueryStrategy queryStrategy)
{
if (queryStrategy == QueryStrategy.AUTO ||
queryStrategy == QueryStrategy.SENTENCE ||
queryStrategy == QueryStrategy.TOPIC) {
return true;
}
return booleanAggregate(queryGraphScores,
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
}
/** Check a single term's word flags against a REQUIRE_FIELD_* strategy;
 * strategies without a field requirement pass trivially. */
private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
return WordFlags.Site.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
return WordFlags.Subjects.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
return WordFlags.Title.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
return WordFlags.UrlPath.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
return WordFlags.UrlDomain.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
return WordFlags.ExternalLink.isPresent(wordMeta);
}
return true;
}
/** Combine document-level signals (quality, rank, topology, age, flags) and
 * term-level relevance signals (BM25, term proximity, coherence) into a
 * single score.  Lower values are better.
 * */
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
CompiledQueryInt positionsCountQuery,
CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
int features,
int length,
int bestCoherence,
ResultRankingContext ctx)
{
// Fall back to a default document length when it is unknown
if (length < 0) {
length = 5000;
}
var rankingParams = ctx.params;
int rank = DocumentMetadata.decodeRank(documentMetadata);
int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
int quality = DocumentMetadata.decodeQuality(documentMetadata);
int size = DocumentMetadata.decodeSize(documentMetadata);
int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
int topology = DocumentMetadata.decodeTopology(documentMetadata);
int year = DocumentMetadata.decodeYear(documentMetadata);
double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
final double topologyBonus = Math.log(1 + topology);
final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
// Bias toward recent or old documents depending on the configured temporal bias
final double temporalBias;
if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) {
temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight;
} else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) {
temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight;
} else {
temporalBias = 0;
}
// Aggregate document-level component; positive is good, negative is bad
double overallPart = averageSentenceLengthPenalty
+ documentLengthPenalty
+ qualityPenalty
+ rankingBonus
+ topologyBonus
+ temporalBias
+ flagsPenalty
+ bestCoherence;
// Term proximity: smaller average min-distance between terms gives a larger value
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
double tcfFirstPosition = 0.;
// BM25 over the occurrence counts; the flags graph is visited, but the visitor
// scores positionsCountQuery.data -- both queries share the same root, so the
// indices line up
double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx));
// Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function
double ret = normalize(
tcfAvgDist + tcfFirstPosition
+ bM25
+ Math.max(0, overallPart),
-Math.min(0, overallPart));
if (Double.isNaN(ret)) { // This should never happen but if it does, we want to know about it
if (getClass().desiredAssertionStatus()) {
throw new IllegalStateException("NaN in result value calculation");
}
return Double.MAX_VALUE;
}
else {
return ret;
}
}
/** Penalty for low "quality" documents; documents larger than 400 units
 * are penalized 20x harder, and small high-quality documents are exempt. */
private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
if (size < 400) {
if (quality < 5)
return 0;
return -quality * rankingParams.qualityPenalty;
}
else {
return -quality * rankingParams.qualityPenalty * 20;
}
}
/** Penalty derived from html feature flags (adtech, affiliate links,
 * cookies, tracking, url shape), scaled up for large non-forum/wiki/docs
 * sites; the return value is negative or zero except for the forum/wiki
 * adjustment below. */
private int flagsPenalty(int featureFlags, long docFlags, int size) {
// Short-circuit for index-service, which does not have the feature flags
if (featureFlags == 0)
return 0;
double penalty = 0;
boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);
// Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
double largeSiteFactor = 1.;
if (!isForum && !isWiki && !isDocs && size > 400) {
// Long urls-that-look-like-this tend to be poor search results
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
penalty += 30.0;
else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
penalty += 30.;
else penalty += 5.;
largeSiteFactor = 2;
}
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
penalty += 7.5 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
penalty += 5.0 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
penalty += 2.5 * largeSiteFactor;
if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
penalty += 2.5 * largeSiteFactor;
if (isForum || isWiki) {
// Forums and wikis have their penalty clamped to <= 0, which after the
// negation below becomes a bonus; NOTE(review): confirm the sign here is intended
penalty = Math.min(0, penalty - 2);
}
return (int) -penalty;
}
/** Normalize a value to the range 0...15, where 0 is the best possible score
 *
 * @param value The value to normalize, must be positive or zero
 * @param penalty Any negative component of the value
 * */
public static double normalize(double value, double penalty) {
if (value < 0)
value = 0;
return Math.sqrt((1.0 + 500. + 10 * penalty) / (1.0 + value));
}
/** Average, over all pairs of regular (non-synthetic-mask) terms present in
 * the document, of the minimum distance between the two terms' position
 * lists; returns 1000 when no pair could be evaluated. */
public static double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
double sum = 0;
int cnt = 0;
for (int i = 0; i < positions.size(); i++) {
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(i))
continue;
var posi = positions.at(i);
// Skip terms that are not in the document
if (posi == null)
continue;
for (int j = i + 1; j < positions.size(); j++) {
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(j))
continue;
var posj = positions.at(j);
// Skip terms that are not in the document
if (posj == null)
continue;
int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator());
sum += distance;
cnt++;
}
}
if (cnt > 0) {
return sum / cnt;
} else {
// No evaluable pair; fall back to a large default distance
return 1000.;
}
}
}

View File

@ -1,165 +0,0 @@
package nu.marginalia.index.results;

import nu.marginalia.api.searchquery.model.compiled.*;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.ranking.results.ResultValuator;
import nu.marginalia.sequence.GammaCodedSequence;

import javax.annotation.Nullable;

import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;

/** This class is responsible for calculating the score of a search result.
 * It holds the data required to perform the scoring, as there is strong
 * reasons to cache this data, and performs the calculations */
public class IndexResultValuationContext {
    private final CombinedIndexReader index;
    private final QueryParams queryParams;

    private final ResultRankingContext rankingContext;
    private final ResultValuator searchResultValuator;
    private final CompiledQuery<String> compiledQuery;

    public IndexResultValuationContext(ResultValuator searchResultValuator,
                                       StatefulIndex statefulIndex,
                                       ResultRankingContext rankingContext,
                                       SearchParameters params)
    {
        this.index = statefulIndex.get();
        this.rankingContext = rankingContext;
        this.searchResultValuator = searchResultValuator;

        this.queryParams = params.queryParams;
        this.compiledQuery = params.compiledQuery;
    }

    // Word flags indicating that a term matched in a high-signal field
    // (title, subjects, url path/domain, or external link anchor text);
    // used by testRelevance() to decide whether a hit is worth keeping.
    private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();

    /** Calculate a preliminary score for a single candidate document.
     *
     * @param combinedId the rank-encoded document id (see UrlIdCodec)
     * @param searchTerms the query's term ids and coherence constraints
     * @param wordFlags per-term word flag bitmasks for this document,
     *                  ordered to match the compiled query
     * @param positions per-term position sequences for this document;
     *                  null entries mean the term is absent
     * @return a scored SearchResultItem, or null if the document fails
     *         the mandatory coherence test or the relevance pre-checks
     * */
    @Nullable
    public SearchResultItem calculatePreliminaryScore(long combinedId,
                                                      QuerySearchTerms searchTerms,
                                                      long[] wordFlags,
                                                      GammaCodedSequence[] positions)
    {
        // All mandatory term coherences must be satisfied for the document to qualify
        if (!searchTerms.coherences.testMandatory(positions))
            return null;

        CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
        CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);

        // Per-term occurrence counts, derived from the position sequences
        int[] counts = new int[compiledQuery.size()];
        for (int i = 0; i < counts.length; i++) {
            if (positions[i] != null) {
                counts[i] = positions[i].valueCount();
            }
        }
        CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);

        // If the document is not relevant to the query, abort early to reduce allocations and
        // avoid unnecessary calculations
        // (note: testRelevance returns true when the document should be REJECTED)
        if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
            return null;
        }

        long docId = UrlIdCodec.removeRank(combinedId);
        long docMetadata = index.getDocumentMetadata(docId);
        int htmlFeatures = index.getHtmlFeatures(docId);
        int docSize = index.getDocumentSize(docId);
        int bestCoherence = searchTerms.coherences.testOptional(positions);

        double score = searchResultValuator.calculateSearchResultValue(
                wordFlagsQuery,
                positionsCountQuery,
                positionsQuery,
                docMetadata,
                htmlFeatures,
                docSize,
                bestCoherence,
                rankingContext, null);

        SearchResultItem searchResult = new SearchResultItem(docId,
                docMetadata,
                htmlFeatures);

        // A hit on a priority term improves the score; scores appear to be
        // "smaller is better" here -- TODO confirm against ResultValuator
        if (hasPrioTerm(searchTerms, positions)) {
            score = 0.75 * score;
        }

        searchResult.setScore(score);

        return searchResult;
    }

    /** Returns true if the document should be REJECTED: either it fails the
     * query strategy's field requirements, or no term matched a high-signal
     * field, is synthetic, or has any recorded positions. */
    private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
        boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
        int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
        int positionsCount = intMaxMinAggregate(countsQuery, p -> p);

        if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
            return true;
        }
        if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
            return true;
        }

        return false;
    }

    /** Returns true if any query term that is present in the document
     * (non-null positions) is a priority term. */
    private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
        var allTerms = searchTerms.termIdsAll;
        var prioTerms = searchTerms.termIdsPrio;

        for (int i = 0; i < allTerms.size(); i++) {
            if (positions[i] != null && prioTerms.contains(allTerms.at(i))) {
                return true;
            }
        }

        return false;
    }

    /** Checks whether the word flags satisfy the query strategy's field
     * requirements (e.g. REQUIRE_FIELD_TITLE). AUTO/SENTENCE/TOPIC impose
     * no requirements and always pass. */
    private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
                                                   QueryStrategy queryStrategy)
    {
        if (queryStrategy == QueryStrategy.AUTO ||
                queryStrategy == QueryStrategy.SENTENCE ||
                queryStrategy == QueryStrategy.TOPIC) {
            return true;
        }

        return booleanAggregate(queryGraphScores,
                docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
    }

    /** Per-term check: does this term's word metadata carry the flag the
     * query strategy requires? */
    private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
            return WordFlags.Site.isPresent(wordMeta);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
            return WordFlags.Subjects.isPresent(wordMeta);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
            return WordFlags.Title.isPresent(wordMeta);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
            return WordFlags.UrlPath.isPresent(wordMeta);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
            return WordFlags.UrlDomain.isPresent(wordMeta);
        }
        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
            return WordFlags.ExternalLink.isPresent(wordMeta);
        }
        return true;
    }
}

View File

@ -1,210 +0,0 @@
package nu.marginalia.index.results;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultValuator;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.foreign.Arena;
import java.sql.SQLException;
import java.util.*;

/** Ranks candidate documents, selects the best results within the query's
 * domain and total limits, and decorates them with URL details from the
 * document database. */
@Singleton
public class IndexResultValuatorService {
    private static final Logger logger = LoggerFactory.getLogger(IndexResultValuatorService.class);

    private final IndexMetadataService metadataService;
    private final DocumentDbReader documentDbReader;
    private final ResultValuator resultValuator;
    private final StatefulIndex statefulIndex;

    @Inject
    public IndexResultValuatorService(IndexMetadataService metadataService,
                                      DocumentDbReader documentDbReader,
                                      ResultValuator resultValuator,
                                      StatefulIndex statefulIndex)
    {
        this.metadataService = metadataService;
        this.documentDbReader = documentDbReader;
        this.resultValuator = resultValuator;
        this.statefulIndex = statefulIndex;
    }

    /** Calculate preliminary scores for all candidate documents.
     * Candidates that fail the relevance pre-checks (null score) are dropped.
     *
     * @param resultIds the candidate documents, as rank-encoded ids
     * @return the surviving candidates with preliminary scores set
     * */
    public List<SearchResultItem> rankResults(SearchParameters params,
                                              ResultRankingContext rankingContext,
                                              CombinedDocIdList resultIds)
    {
        IndexResultValuationContext evaluator =
                new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params);

        List<SearchResultItem> results = new ArrayList<>(resultIds.size());

        // The arena scopes the off-heap term metadata buffers to this method
        try (var arena = Arena.ofConfined()) {

            // Batch-fetch the word metadata for the documents
            var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
            var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll);

            // Prepare data for the document.  We do this outside of the calculation function to avoid
            // hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there;
            // out here we can rely on implicit array ordering to match up the data.

            var ra = resultIds.array();
            long[] flags = new long[searchTerms.termIdsAll.size()];
            GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()];

            for (int i = 0; i < ra.length; i++) {
                long id = ra[i];

                // Prepare term-level data for the document
                // (flags[ti]/positions[ti] are keyed by term index, matching the compiled query order)
                for (int ti = 0; ti < flags.length; ti++) {
                    long tid = searchTerms.termIdsAll.at(ti);
                    var tfd = termsForDocs.get(tid);

                    assert tfd != null : "No term data for term " + ti;

                    flags[ti] = tfd.flag(i);
                    positions[ti] = tfd.position(i);
                }

                // Calculate the preliminary score

                var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions);
                if (score != null) {
                    results.add(score);
                }
            }

            return results;
        }
    }

    /** Apply per-domain deduplication and the total result limit, then
     * decorate the survivors with link database details.
     *
     * @param results the scored candidates from rankResults()
     * */
    public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
                                                             Collection<SearchResultItem> results) throws SQLException {

        var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);

        List<SearchResultItem> resultsList = new ArrayList<>(results.size());

        for (var item : results) {
            if (domainCountFilter.test(item)) {
                // It's important that this filter runs across all results, not just the top N
                if (resultsList.size() < params.limitTotal) {
                    resultsList.add(item);
                }
            }
        }

        // Record the total per-domain hit count on each kept item
        for (var item : resultsList) {
            item.resultsFromDomain = domainCountFilter.getCount(item);
        }

        return decorateResults(resultsList, params.compiledQuery);
    }

    /** Decorate the result items with additional information from the link database
     * and calculate an updated ranking with the additional information */
    public List<DecoratedSearchResultItem> decorateResults(List<SearchResultItem> rawResults,
                                                           CompiledQuery<String> compiledQuery)
            throws SQLException
    {
        TLongList idsList = new TLongArrayList(rawResults.size());

        for (var result : rawResults)
            idsList.add(result.getDocumentId());

        // Index the url details by document id for O(1) lookup below
        Map<Long, DocdbUrlDetail> urlDetailsById = new HashMap<>(rawResults.size());

        for (var item : documentDbReader.getUrlDetails(idsList))
            urlDetailsById.put(item.urlId(), item);

        List<DecoratedSearchResultItem> resultItems = new ArrayList<>(rawResults.size());

        // Results that are missing from the document database are dropped with a warning
        for (var result : rawResults) {
            var id = result.getDocumentId();
            var docData = urlDetailsById.get(id);

            if (docData == null) {
                logger.warn("No document data for id {}", id);
                continue;
            }

            resultItems.add(createCombinedItem(
                    result,
                    docData));
        }
        return resultItems;
    }

    /** Combine a scored result with its link database details into the final
     * result representation. Ranking details export is currently disabled
     * (the detail consumer is never wired up). */
    private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
                                                         DocdbUrlDetail docData) {

        ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor();

        // Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;

        return new DecoratedSearchResultItem(
                result,
                docData.url(),
                docData.title(),
                docData.description(),
                docData.urlQuality(),
                docData.format(),
                docData.features(),
                docData.pubYear(),
                docData.dataHash(),
                docData.wordsTotal(),
                0L, //bestPositions(wordMetas),
                result.getScore(),
                detailsExtractor.get()
        );
    }

    /** Holder for ranking debug details; get() returns null unless set()
     * was invoked (which currently never happens, see createCombinedItem). */
    private static class ResultRankingDetailsExtractor {
        private ResultRankingDetails value = null;

        public ResultRankingDetails get() {
            return value;
        }
        public void set(ResultRankingDetails value) {
            this.value = value;
        }
    }

    // NOTE(review): currently unused -- the only call site is commented out
    // in createCombinedItem(); consider removing or re-wiring.
    private long bestPositions(CompiledQueryLong wordMetas) {
        LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions);

        int bestPc = 0;
        long bestPositions = 0;

        var li = positionsSet.longIterator();

        while (li.hasNext()) {
            long pos = li.nextLong();
            int pc = Long.bitCount(pos);
            if (pc > bestPc) {
                bestPc = pc;
                bestPositions = pos;
            }
        }

        return bestPositions;
    }
}

View File

@ -32,6 +32,7 @@ public final class CombinedDocIdList {
/** Number of document ids in the list. */
public int size() {
    return data.length;
}
/** Random access to the i'th combined document id. */
public long at(int i) { return data[i]; }
public LongStream stream() {
return Arrays.stream(data);

View File

@ -1,209 +0,0 @@
package nu.marginalia.ranking.results;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.ranking.results.factors.*;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.util.function.Consumer;

/** Computes the final ranking value of a search result from document
 * metadata, term flags, positions, and the ranking parameters. Scores are
 * normalized so that smaller values are better (0 is the best possible). */
@Singleton
public class ResultValuator {
    final static double scalingFactor = 500.;

    private final TermCoherenceFactor termCoherenceFactor;

    private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class);

    @Inject
    public ResultValuator(TermCoherenceFactor termCoherenceFactor) {
        this.termCoherenceFactor = termCoherenceFactor;
    }

    /** Calculate the ranking value of a search result.
     *
     * @param wordFlagsQuery per-term word flag bitmasks, in query order
     * @param positionsCountQuery per-term occurrence counts
     * @param positionsQuery per-term position sequences
     * @param documentMetadata packed document metadata (rank, quality, year, ...)
     * @param features HTML feature bitmask; 0 when unavailable (index-service)
     * @param length document length; negative values fall back to a default
     * @param bestCoherence bonus from the best satisfied optional coherence
     * @param ctx ranking context holding parameters and term frequencies
     * @param detailsConsumer optional sink for ranking debug details
     * @return the normalized score (smaller is better), or Double.MAX_VALUE
     *         for an empty query or a NaN result (with assertions disabled)
     * */
    public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
                                             CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
                                             int features,
                                             int length,
                                             int bestCoherence,
                                             ResultRankingContext ctx,
                                             @Nullable Consumer<ResultRankingDetails> detailsConsumer
    )
    {
        if (wordFlagsQuery.isEmpty())
            return Double.MAX_VALUE;

        // Unknown length: assume an average-ish document
        if (length < 0) {
            length = 5000;
        }

        var rankingParams = ctx.params;

        // Unpack the packed document metadata fields
        int rank = DocumentMetadata.decodeRank(documentMetadata);
        int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
        int quality = DocumentMetadata.decodeQuality(documentMetadata);
        int size = DocumentMetadata.decodeSize(documentMetadata);
        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
        int topology = DocumentMetadata.decodeTopology(documentMetadata);
        int year = DocumentMetadata.decodeYear(documentMetadata);

        double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);

        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
        final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
        final double topologyBonus = Math.log(1 + topology);
        final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
        final double temporalBias;

        // Bias towards recent or old documents depending on the query's preference
        if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) {
            temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight;
        } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) {
            temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight;
        } else {
            temporalBias = 0;
        }

        double overallPart = averageSentenceLengthPenalty
                + documentLengthPenalty
                + qualityPenalty
                + rankingBonus
                + topologyBonus
                + temporalBias
                + flagsPenalty
                + bestCoherence;

        // FIXME: need a weighting factor here
        double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx));
        // NOTE(review): tcfFirstPosition is currently always 0 -- placeholder?
        double tcfFirstPosition = 0.;

        double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx));

        // Split the additive part into its positive (value) and negative (penalty) components
        double overallPartPositive = Math.max(0, overallPart);
        double overallPartNegative = -Math.min(0, overallPart);

        if (null != detailsConsumer) {
            var details = new ResultRankingDetails(
                    new ResultRankingInputs(
                            rank,
                            asl,
                            quality,
                            size,
                            topology,
                            year,
                            DocumentFlags.decode(documentMetadata).stream().map(Enum::name).toList()
                    ),
                    new ResultRankingOutputs(
                            averageSentenceLengthPenalty,
                            qualityPenalty,
                            rankingBonus,
                            topologyBonus,
                            documentLengthPenalty,
                            temporalBias,
                            flagsPenalty,
                            overallPart,
                            bM25,
                            tcfAvgDist,
                            tcfFirstPosition)
            );

            detailsConsumer.accept(details);
        }

        // Renormalize to 0...15, where 0 is the best possible score;
        // this is a historical artifact of the original ranking function
        double ret = normalize(
                tcfAvgDist + tcfFirstPosition
                        + bM25
                        + overallPartPositive,
                overallPartNegative);

        if (Double.isNaN(ret)) {
            if (getClass().desiredAssertionStatus()) {
                throw new IllegalStateException("NaN in result value calculation");
            }

            return Double.MAX_VALUE;
        }
        else {
            return ret;
        }
    }

    /** Quality penalty, scaled up sharply (20x) for documents over 400
     * units in size; very small low-quality documents are exempt. */
    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
        if (size < 400) {
            if (quality < 5)
                return 0;
            return -quality * rankingParams.qualityPenalty;
        }
        else {
            return -quality * rankingParams.qualityPenalty * 20;
        }
    }

    /** Penalty derived from HTML feature flags (ads, tracking, cookies...).
     * Returns the negated accumulated penalty, so the result is usually
     * negative (a penalty), but can be positive (a bonus) for forums/wikis. */
    private int flagsPenalty(int featureFlags, long docFlags, int size) {

        // Short-circuit for index-service, which does not have the feature flags
        if (featureFlags == 0)
            return 0;

        double penalty = 0;

        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
        boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
        boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);

        // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
        double largeSiteFactor = 1.;

        if (!isForum && !isWiki && !isDocs && size > 400) {
            // Long urls-that-look-like-this tend to be poor search results
            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
                penalty += 30.0;
            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
                penalty += 30.;
            else penalty += 5.;

            largeSiteFactor = 2;
        }

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
            penalty += 7.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
            penalty += 5.0 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
            penalty += 2.5 * largeSiteFactor;

        // NOTE(review): min(0, penalty - 2) means forums/wikis are never
        // penalized and may get up to a +2 bonus after negation below --
        // confirm this is intentional (max() would cap the bonus instead)
        if (isForum || isWiki) {
            penalty = Math.min(0, penalty - 2);
        }

        return (int) -penalty;
    }

    /** Normalize a value to a small "smaller is better" score.
     *
     * @param value The positive (value) component; larger is better
     * @param penalty The negative (penalty) component; larger is worse
     * */
    public static double normalize(double value, double penalty) {
        if (value < 0)
            value = 0;

        return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value));
    }
}

View File

@ -1,127 +0,0 @@
package nu.marginalia.ranking.results.factors;

import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;

import java.util.List;

/** BM25-style scorer for priority terms, evaluated over the compiled query
 * graph. Instead of raw term counts, the per-term "count" is a synthetic
 * score derived from word flags (title, url, site, subject hits, ...), and
 * length normalization is disabled (b = 0). AND nodes sum, OR nodes max. */
public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor {
    private static final long AVG_LENGTH = 5000;

    private final CqDataLong wordMetaData;
    // Per-term document frequencies from the full (non-priority) counts
    private final CqDataInt frequencies;
    private final Bm25Parameters bm25Parameters;

    private final int docCount;

    public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters,
                                CqDataLong wordMetaData,
                                ResultRankingContext ctx) {
        this.bm25Parameters = bm25Parameters;
        this.docCount = ctx.termFreqDocCount();
        this.wordMetaData = wordMetaData;
        this.frequencies = ctx.fullCounts;
    }

    @Override
    public double onAnd(List<? extends CqExpression> parts) {
        // AND: all parts contribute; sum their scores
        double value = 0;
        for (var part : parts) {
            value += part.visit(this);
        }
        return value;
    }

    @Override
    public double onOr(List<? extends CqExpression> parts) {
        // OR: take the best-scoring alternative
        double value = 0;
        for (var part : parts) {
            value = Math.max(value, part.visit(this));
        }
        return value;
    }

    @Override
    public double onLeaf(int idx) {
        double count = evaluatePriorityScore(wordMetaData.get(idx));

        int freq = frequencies.get(idx);

        // note we override b to zero for priority terms as they are independent of document length
        return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
    }

    /** Convert a term's word flags into a synthetic occurrence score.
     * Anchor-text (ExternalLink) hits use a separate, higher-valued flag
     * weighting; subject/name/tf-idf flags only count when the term occurs
     * in more than two positions. */
    private static double evaluatePriorityScore(long wordMeta) {
        int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta));

        double qcount = 0.;

        if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) {

            qcount += 2.5;

            if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
                qcount += 2.5;
            else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
                qcount += 1.5;

            if ((wordMeta & WordFlags.Site.asBit()) != 0)
                qcount += 1.25;
            if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
                qcount += 1.25;
        }
        else {
            if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
                qcount += 3;
            else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
                qcount += 1;

            if ((wordMeta & WordFlags.Site.asBit()) != 0)
                qcount += 0.5;
            if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
                qcount += 0.5;
        }

        if ((wordMeta & WordFlags.Title.asBit()) != 0)
            qcount += 1.5;

        if (pcount > 2) {
            if ((wordMeta & WordFlags.Subjects.asBit()) != 0)
                qcount += 1.25;
            if ((wordMeta & WordFlags.NamesWords.asBit()) != 0)
                qcount += 0.25;
            if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0)
                qcount += 0.5;
        }

        return qcount;
    }

    /**
     * Inverse document frequency (BM25 idf with +1 smoothing inside the log).
     *
     * @param docCount Number of documents
     * @param freq Number of matching documents
     */
    private double invFreq(int docCount, int freq) {
        return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
    }

    /**
     * BM25 term saturation function.
     *
     * @param k determines the size of the impact of a single term
     * @param b determines the magnitude of the length normalization
     * @param count number of occurrences in the document
     * @param length document length
     */
    private double f(double k, double b, double count, int length) {
        final double lengthRatio = (double) length / AVG_LENGTH;

        return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
    }
}

View File

@ -1,53 +0,0 @@
package nu.marginalia.ranking.results.factors;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;

/** Rewards documents where terms appear frequently within the same sentences
 */
public class TermCoherenceFactor {

    /** Average of the pairwise minimum distances between the positions of
     * all regular query terms present in the document. Terms outside the
     * regular mask or absent from the document (null positions) are skipped;
     * with no qualifying pair, a large sentinel (1000) is returned. */
    public double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
        double distanceSum = 0;
        int pairCount = 0;

        final int numTerms = positions.size();

        for (int a = 0; a < numTerms; a++) {
            // Only regular terms participate
            if (!ctx.regularMask.get(a))
                continue;

            var positionsA = positions.at(a);

            // Only terms present in the document participate
            if (positionsA == null)
                continue;

            for (int b = a + 1; b < numTerms; b++) {
                if (!ctx.regularMask.get(b))
                    continue;

                var positionsB = positions.at(b);
                if (positionsB == null)
                    continue;

                distanceSum += SequenceOperations.minDistance(positionsA.iterator(), positionsB.iterator());
                pairCount++;
            }
        }

        return pairCount > 0 ? (distanceSum / pairCount) : 1000.;
    }

}

View File

@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
}
SearchResultItem forId(int domain, int ordinal) {
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, false, Double.NaN);
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, Double.NaN);
}
}

View File

@ -87,7 +87,7 @@ public class SearchQueryIndexService {
detail.features,
DomainIndexingState.ACTIVE,
detail.rankingScore, // termScore
detail.resultsFromDomain(),
detail.resultsFromDomain,
getPositionsString(detail),
Long.bitCount(detail.bestPositions),
detail.rawIndexResult,

View File

@ -103,6 +103,7 @@ public class SearchServicePaperDoll extends AbstractModule {
400,
positions,
score,
4,
null)
);
}