diff --git a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java index ba48f3ec..5b6112fe 100644 --- a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java +++ b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java @@ -22,6 +22,12 @@ import java.sql.SQLException; import java.util.ArrayList; import java.util.List; +/** Reads the document database, which is a SQLite database + * containing the URLs and metadata of the documents in the + * index. + *

+ * The database is created by the DocumentDbWriter class. + * */ @Singleton public class DocumentDbReader { private final Path dbFile; @@ -52,6 +58,11 @@ public class DocumentDbReader { } } + /** Switches the input database file to a new file. + *

+ * This is used to switch over to a new database file + * when the index is re-indexed. + * */ public void switchInput(Path newDbFile) throws IOException, SQLException { if (!Files.isRegularFile(newDbFile)) { logger.error("Source is not a file, refusing switch-over {}", newDbFile); @@ -78,35 +89,11 @@ public class DocumentDbReader { connection = createConnection(); } - public List getUrlsFromDomain(int domainId) throws SQLException { - if (connection == null || - connection.isClosed()) - { - throw new RuntimeException("URL query temporarily unavailable due to database switch"); - } - - long minId = UrlIdCodec.encodeId(domainId, 0); - long maxId = UrlIdCodec.encodeId(domainId+1, 0); - - List ret = new ArrayList<>(); - - try (var stmt = connection.prepareStatement(""" - SELECT URL - FROM DOCUMENT - WHERE ID >= ? AND ID < ? - """)) - { - stmt.setLong(1, minId); - stmt.setLong(2, maxId); - var rs = stmt.executeQuery(); - while (rs.next()) { - ret.add(rs.getString(1)); - } - } - - return ret; - } - + /** Returns the URL details for the given document ids. + *

+ * This is used to get the URL details for the search + * results. + * */ public List getUrlDetails(TLongList ids) throws SQLException { List ret = new ArrayList<>(ids.size()); diff --git a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java index e843e826..faa98bf5 100644 --- a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java +++ b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java @@ -9,6 +9,10 @@ import java.sql.DriverManager; import java.sql.SQLException; import java.util.List; +/** Writes the document database, which is a SQLite database + * containing the URLs and metadata of the documents in the + * index. + * */ public class DocumentDbWriter { private final Connection connection; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 5d79cfea..46681de4 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -130,6 +130,7 @@ public class QueryProtobufCodec { results.getWordsTotal(), results.getBestPositions(), results.getRankingScore(), + results.getResultsFromDomain(), convertRankingDetails(results.getRankingDetails()) ); } @@ -187,7 +188,6 @@ public class QueryProtobufCodec { rawItem.getEncodedDocMetadata(), rawItem.getHtmlFeatures(), keywordScores, - rawItem.getResultsFromDomain(), rawItem.getHasPriorityTerms(), Double.NaN // Not set ); @@ -256,6 +256,7 @@ public class QueryProtobufCodec { rpcDecoratedResultItem.getWordsTotal(), rpcDecoratedResultItem.getBestPositions(), rpcDecoratedResultItem.getRankingScore(), + rpcDecoratedResultItem.getResultsFromDomain(), convertRankingDetails(rpcDecoratedResultItem.getRankingDetails()) ); 
} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java index 0522e7bc..8a9b690b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java @@ -34,6 +34,8 @@ public class DecoratedSearchResultItem implements Comparable keywordScores() { return rawIndexResult.getKeywordScores(); @@ -72,6 +71,7 @@ public class DecoratedSearchResultItem implements Comparable { /** How did the subqueries match against the document ? */ public final List keywordScores; - /** How many other potential results existed in the same domain */ - public int resultsFromDomain; - public boolean hasPrioTerm; public SearchResultItem(long combinedId, diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index a29b7010..642b28ed 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -93,12 +93,12 @@ message RpcDecoratedResultItem { double rankingScore = 11; // The ranking score of this search result item, lower is better int64 bestPositions = 12; RpcResultRankingDetails rankingDetails = 13; // optional, only present if exportDebugData is true in RpcResultRankingParameters + int32 resultsFromDomain = 14; } /** A raw index-service view of a search result */ message RpcRawResultItem { int64 combinedId = 1; // raw ID with bit-encoded ranking information still present - int32 resultsFromDomain = 2; // number of other results from the same domain int64 encodedDocMetadata = 3; // bit encoded document metadata int32 htmlFeatures = 4; // bitmask encoding 
features of the document repeated RpcResultKeywordScore keywordScores = 5; diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 1dc847b8..58a9a4b0 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -20,7 +20,7 @@ import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexSearchBudget; -import nu.marginalia.index.results.IndexResultValuatorService; +import nu.marginalia.index.results.IndexResultRankingService; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.searchset.SearchSetsService; import nu.marginalia.index.searchset.SmallSearchSet; @@ -81,7 +81,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { private final StatefulIndex statefulIndex; private final SearchSetsService searchSetsService; - private final IndexResultValuatorService resultValuator; + private final IndexResultRankingService resultValuator; private final String nodeName; @@ -91,7 +91,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { public IndexGrpcService(ServiceConfiguration serviceConfiguration, StatefulIndex statefulIndex, SearchSetsService searchSetsService, - IndexResultValuatorService resultValuator) + IndexResultRankingService resultValuator) { var nodeId = serviceConfiguration.node(); this.nodeName = Integer.toString(nodeId); @@ -135,7 +135,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var rawItem = RpcRawResultItem.newBuilder(); rawItem.setCombinedId(rawResult.combinedId); - rawItem.setResultsFromDomain(rawResult.resultsFromDomain); rawItem.setHtmlFeatures(rawResult.htmlFeatures); rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); 
rawItem.setHasPriorityTerms(rawResult.hasPrioTerm); @@ -159,6 +158,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .setUrlQuality(result.urlQuality) .setWordsTotal(result.wordsTotal) .setBestPositions(result.bestPositions) + .setResultsFromDomain(result.resultsFromDomain) .setRawItem(rawItem); var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails); diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index cd416ca3..abdbc836 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -3,7 +3,6 @@ package nu.marginalia.index.index; import java.util.List; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.index.FullReverseIndexReader; -import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterAnyOf; diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java similarity index 66% rename from code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java rename to code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java index 88a592bb..9416bf13 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java +++ b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java @@ -1,33 +1,38 @@ -package nu.marginalia.ranking.results.factors; +package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import 
nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.model.idx.WordMetadata; import java.util.BitSet; import java.util.List; -public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { +/** Visitor for calculating the best BM25 score for a graph representing a search query + */ +public class Bm25GraphVisitor implements CqExpression.DoubleVisitor { private static final long AVG_LENGTH = 5000; private final CqDataInt counts; private final CqDataInt frequencies; - private final Bm25Parameters bm25Parameters; + + private final double k1; + private final double b; private final int docCount; private final int length; private final BitSet mask; - public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, - CqDataInt counts, - int length, - ResultRankingContext ctx) { + public Bm25GraphVisitor(Bm25Parameters bm25Parameters, + CqDataInt counts, + int length, + ResultRankingContext ctx) { this.length = length; - this.bm25Parameters = bm25Parameters; + + this.k1 = bm25Parameters.k(); + this.b = bm25Parameters.b(); + this.docCount = ctx.termFreqDocCount(); this.counts = counts; this.frequencies = ctx.fullCounts; @@ -37,9 +42,11 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { @Override public double onAnd(List parts) { double value = 0; + for (var part : parts) { value += part.visit(this); } + return value; } @@ -59,10 +66,9 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { } double count = counts.get(idx); - int freq = frequencies.get(idx); - return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + return invFreq(docCount, freq) * f(count, length); } /** @@ -76,14 +82,12 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { /** * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length 
normalization * @param count number of occurrences in the document * @param length document length */ - private double f(double k, double b, double count, int length) { + private double f(double count, int length) { final double lengthRatio = (double) length / AVG_LENGTH; - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + return (count * (k1 + 1)) / (count + k1 * (1 - b + b * lengthRatio)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java deleted file mode 100644 index 86437f02..00000000 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ /dev/null @@ -1,96 +0,0 @@ -package nu.marginalia.index.results; - -import com.google.inject.Inject; -import gnu.trove.map.hash.TObjectLongHashMap; -import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; -import it.unimi.dsi.fastutil.longs.LongArrayList; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.query.SearchQuery; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.model.SearchTermsUtil; -import nu.marginalia.index.results.model.QuerySearchTerms; -import nu.marginalia.index.results.model.TermCoherenceGroupList; -import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.TermMetadataList; -import nu.marginalia.index.results.model.ids.TermIdList; - -import java.lang.foreign.Arena; -import java.util.ArrayList; - -import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup; - -public class IndexMetadataService { - private final StatefulIndex statefulIndex; - - @Inject - public IndexMetadataService(StatefulIndex index) { - this.statefulIndex = index; - } - - public Long2ObjectArrayMap - getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList) - { - var currentIndex 
= statefulIndex.get(); - - Long2ObjectArrayMap termdocToMeta = - new Long2ObjectArrayMap<>(termIdsList.size()); - - for (long termId : termIdsList.array()) { - termdocToMeta.put(termId, currentIndex.getTermMetadata(arena, termId, combinedIdsAll)); - } - - return termdocToMeta; - } - - public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { - - LongArrayList termIdsList = new LongArrayList(); - LongArrayList termIdsPrio = new LongArrayList(); - - TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); - - for (String word : compiledQuery) { - long id = SearchTermsUtil.getWordId(word); - termIdsList.add(id); - termToId.put(word, id); - } - - for (var term : searchQuery.searchTermsAdvice) { - if (termToId.containsKey(term)) { - continue; - } - - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - termToId.put(term, id); - } - - for (var term : searchQuery.searchTermsPriority) { - if (termToId.containsKey(term)) { - long id = SearchTermsUtil.getWordId(term); - termIdsPrio.add(id); - } - else { - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - termIdsPrio.add(id); - termToId.put(term, id); - } - } - - var idsAll = new TermIdList(termIdsList); - var idsPrio = new TermIdList(termIdsPrio); - - var constraints = new ArrayList(); - for (var coherence : searchQuery.searchTermCoherences) { - constraints.add(new TermCoherenceGroup(coherence, idsAll)); - } - - return new QuerySearchTerms(termToId, - idsAll, - idsPrio, - new TermCoherenceGroupList(constraints) - ); - } - -} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java new file mode 100644 index 00000000..4b455580 --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java @@ -0,0 +1,229 @@ +package nu.marginalia.index.results; + +import com.google.inject.Inject; +import 
com.google.inject.Singleton; +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.map.hash.TObjectLongHashMap; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.SearchParameters; +import nu.marginalia.index.model.SearchTermsUtil; +import nu.marginalia.index.results.model.QuerySearchTerms; +import nu.marginalia.index.results.model.TermCoherenceGroupList; +import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.index.results.model.ids.TermIdList; +import nu.marginalia.index.results.model.ids.TermMetadataList; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.sequence.GammaCodedSequence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.foreign.Arena; +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class IndexResultRankingService { + private static final Logger logger = LoggerFactory.getLogger(IndexResultRankingService.class); + + private final DocumentDbReader documentDbReader; + private final StatefulIndex statefulIndex; + + @Inject + public IndexResultRankingService(DocumentDbReader documentDbReader, + StatefulIndex statefulIndex) + { + this.documentDbReader = documentDbReader; + this.statefulIndex = statefulIndex; + } + + public List rankResults(SearchParameters params, + ResultRankingContext rankingContext, + CombinedDocIdList resultIds) + { + IndexResultScoreCalculator 
resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params); + + List results = new ArrayList<>(resultIds.size()); + + // Get the current index reader, which is the one we'll use for this calculation, + // this may change during the calculation, but we don't want to switch over mid-calculation + final CombinedIndexReader currentIndex = statefulIndex.get(); + + final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query); + final int termCount = searchTerms.termIdsAll.size(); + + // We use an arena for the position data to avoid gc pressure + // from the gamma coded sequences, which can be large and have a lifetime + // that matches the try block here + try (var arena = Arena.ofConfined()) { + + TermMetadataList[] termsForDocs = new TermMetadataList[termCount]; + for (int ti = 0; ti < termCount; ti++) { + termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds); + } + + // Data for the document. We arrange this in arrays outside the calculation function to avoid + // hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache + // thrashing in there; out here we can rely on implicit array ordering to match up the data. 
+ + long[] flags = new long[termCount]; + GammaCodedSequence[] positions = new GammaCodedSequence[termCount]; + + // Iterate over documents by their index in the combinedDocIds, as we need the index for the + // term data arrays as well + + for (int i = 0; i < resultIds.size(); i++) { + + // Prepare term-level data for the document + for (int ti = 0; ti < flags.length; ti++) { + var tfd = termsForDocs[ti]; + + assert tfd != null : "No term data for term " + ti; + + flags[ti] = tfd.flag(i); + positions[ti] = tfd.position(i); + } + + // Ignore documents that don't match the mandatory constraints + if (!searchTerms.coherences.testMandatory(positions)) { + continue; + } + + // Calculate the preliminary score + var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions); + if (score != null) { + results.add(score); + } + } + + return results; + } + } + + + public List selectBestResults(SearchParameters params, + Collection results) throws SQLException { + + var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); + + List resultsList = new ArrayList<>(results.size()); + TLongList idsList = new TLongArrayList(params.limitTotal); + + for (var item : results) { + if (domainCountFilter.test(item)) { + + if (resultsList.size() < params.limitTotal) { + resultsList.add(item); + idsList.add(item.getDocumentId()); + } + // + // else { break; } <-- don't add this even though it looks like it should be present! + // + // It's important that this filter runs across all results, not just the top N, + // so we shouldn't break the loop in a putative else-case here! 
+ // + + } + } + + // Fetch the document details for the selected results in one go, from the local document database + // for this index partition + Map detailsById = new HashMap<>(idsList.size()); + for (var item : documentDbReader.getUrlDetails(idsList)) { + detailsById.put(item.urlId(), item); + } + + List resultItems = new ArrayList<>(resultsList.size()); + + // Decorate the results with the document details + for (var result : resultsList) { + final long id = result.getDocumentId(); + final DocdbUrlDetail docData = detailsById.get(id); + + if (docData == null) { + logger.warn("No document data for id {}", id); + continue; + } + + // Create a decorated search result item from the result and the document data + resultItems.add(new DecoratedSearchResultItem( + result, + docData.url(), + docData.title(), + docData.description(), + docData.urlQuality(), + docData.format(), + docData.features(), + docData.pubYear(), + docData.dataHash(), + docData.wordsTotal(), + 0L, //bestPositions(wordMetas), + result.getScore(), + domainCountFilter.getCount(result), + null + )); + } + + return resultItems; + } + + + public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { + + LongArrayList termIdsList = new LongArrayList(); + LongArrayList termIdsPrio = new LongArrayList(); + + TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); + + for (String word : compiledQuery) { + long id = SearchTermsUtil.getWordId(word); + termIdsList.add(id); + termToId.put(word, id); + } + + for (var term : searchQuery.searchTermsAdvice) { + if (termToId.containsKey(term)) { + continue; + } + + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termToId.put(term, id); + } + + for (var term : searchQuery.searchTermsPriority) { + if (termToId.containsKey(term)) { + long id = SearchTermsUtil.getWordId(term); + termIdsPrio.add(id); + } + else { + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termIdsPrio.add(id); 
+ termToId.put(term, id); + } + } + + var idsAll = new TermIdList(termIdsList); + var idsPrio = new TermIdList(termIdsPrio); + + var constraints = new ArrayList(); + for (var coherence : searchQuery.searchTermCoherences) { + constraints.add(new TermCoherenceGroupList.TermCoherenceGroup(coherence, idsAll)); + } + + return new QuerySearchTerms(termToId, + idsAll, + idsPrio, + new TermCoherenceGroupList(constraints) + ); + } +} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java new file mode 100644 index 00000000..20af5f92 --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -0,0 +1,349 @@ +package nu.marginalia.index.results; + +import nu.marginalia.api.searchquery.model.compiled.*; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.SearchParameters; +import nu.marginalia.index.model.QueryParams; +import nu.marginalia.index.results.model.QuerySearchTerms; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.sequence.SequenceOperations; + +import javax.annotation.Nullable; + +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; + +/** This class is responsible for calculating the score of a search result. 
+ * It holds the data required to perform the scoring, as there are strong + * reasons to cache this data, and performs the calculations */ +public class IndexResultScoreCalculator { + private final CombinedIndexReader index; + private final QueryParams queryParams; + + private final ResultRankingContext rankingContext; + private final CompiledQuery compiledQuery; + + public IndexResultScoreCalculator(StatefulIndex statefulIndex, + ResultRankingContext rankingContext, + SearchParameters params) + { + this.index = statefulIndex.get(); + this.rankingContext = rankingContext; + + this.queryParams = params.queryParams; + this.compiledQuery = params.compiledQuery; + } + + private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); + + @Nullable + public SearchResultItem calculateScore(long combinedId, + QuerySearchTerms searchTerms, + long[] wordFlags, + GammaCodedSequence[] positions) + { + + CompiledQuery positionsQuery = compiledQuery.root.newQuery(positions); + + int[] counts = new int[compiledQuery.size()]; + + for (int i = 0; i < counts.length; i++) { + if (positions[i] != null) { + counts[i] = positions[i].valueCount(); + } + } + CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts); + CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags); + + // If the document is not relevant to the query, abort early to reduce allocations and + // avoid unnecessary calculations + if (testRelevance(wordFlagsQuery, positionsCountQuery)) { + return null; + } + + long docId = UrlIdCodec.removeRank(combinedId); + long docMetadata = index.getDocumentMetadata(docId); + int htmlFeatures = index.getHtmlFeatures(docId); + int docSize = index.getDocumentSize(docId); + + int bestCoherence = searchTerms.coherences.testOptional(positions); + + double score = calculateSearchResultValue( + wordFlagsQuery, + positionsCountQuery, +
positionsQuery, + docMetadata, + htmlFeatures, + docSize, + bestCoherence, + rankingContext); + + SearchResultItem searchResult = new SearchResultItem(docId, + docMetadata, + htmlFeatures); + + if (hasPrioTerm(searchTerms, positions)) { + score = 0.75 * score; + } + + searchResult.setScore(score); + + return searchResult; + } + + private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) { + boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent); + int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask)); + int positionsCount = intMaxMinAggregate(countsQuery, p -> p); + + if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) { + return true; + } + if (flagsCount == 0 && !allSynthetic && positionsCount == 0) { + return true; + } + + return false; + } + + private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) { + var allTerms = searchTerms.termIdsAll; + var prioTerms = searchTerms.termIdsPrio; + + for (int i = 0; i < allTerms.size(); i++) { + if (positions[i] != null && prioTerms.contains(allTerms.at(i))) { + return true; + } + } + + return false; + } + + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, + QueryStrategy queryStrategy) + { + if (queryStrategy == QueryStrategy.AUTO || + queryStrategy == QueryStrategy.SENTENCE || + queryStrategy == QueryStrategy.TOPIC) { + return true; + } + + return booleanAggregate(queryGraphScores, + docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); + } + + private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { + if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { + return WordFlags.Site.isPresent(wordMeta); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { + return WordFlags.Subjects.isPresent(wordMeta); + } + else if (queryStrategy == 
QueryStrategy.REQUIRE_FIELD_TITLE) { + return WordFlags.Title.isPresent(wordMeta); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { + return WordFlags.UrlPath.isPresent(wordMeta); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { + return WordFlags.UrlDomain.isPresent(wordMeta); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { + return WordFlags.ExternalLink.isPresent(wordMeta); + } + return true; + } + + public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, + CompiledQueryInt positionsCountQuery, + CompiledQuery positionsQuery, long documentMetadata, + int features, + int length, + int bestCoherence, + ResultRankingContext ctx) + { + if (length < 0) { + length = 5000; + } + + var rankingParams = ctx.params; + + int rank = DocumentMetadata.decodeRank(documentMetadata); + int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); + int quality = DocumentMetadata.decodeQuality(documentMetadata); + int size = DocumentMetadata.decodeSize(documentMetadata); + int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size); + int topology = DocumentMetadata.decodeTopology(documentMetadata); + int year = DocumentMetadata.decodeYear(documentMetadata); + + double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty); + + final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams); + final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus; + final double topologyBonus = Math.log(1 + topology); + final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 
0 : -rankingParams.shortDocumentPenalty; + final double temporalBias; + + if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) { + temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight; + } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) { + temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight; + } else { + temporalBias = 0; + } + + double overallPart = averageSentenceLengthPenalty + + documentLengthPenalty + + qualityPenalty + + rankingBonus + + topologyBonus + + temporalBias + + flagsPenalty + + bestCoherence; + + double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx)); + double tcfFirstPosition = 0.; + + double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx)); + + // Renormalize to 0...15, where 0 is the best possible score; + // this is a historical artifact of the original ranking function + double ret = normalize( + tcfAvgDist + tcfFirstPosition + + bM25 + + Math.max(0, overallPart), + -Math.min(0, overallPart)); + + if (Double.isNaN(ret)) { // This should never happen but if it does, we want to know about it + if (getClass().desiredAssertionStatus()) { + throw new IllegalStateException("NaN in result value calculation"); + } + + return Double.MAX_VALUE; + } + else { + return ret; + } + } + + private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { + if (size < 400) { + if (quality < 5) + return 0; + return -quality * rankingParams.qualityPenalty; + } + else { + return -quality * rankingParams.qualityPenalty * 20; + } + } + + private int flagsPenalty(int featureFlags, long docFlags, int size) { + + // Short-circuit for index-service, which does not have the feature flags + if (featureFlags == 0) + return 0; + + double penalty = 0; + + boolean isForum = 
DocumentFlags.GeneratorForum.isPresent(docFlags); + boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags); + boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags); + + // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site + double largeSiteFactor = 1.; + + if (!isForum && !isWiki && !isDocs && size > 400) { + // Long urls-that-look-like-this tend to be poor search results + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) + penalty += 30.0; + else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit())) + penalty += 30.; + else penalty += 5.; + + largeSiteFactor = 2; + } + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) + penalty += 7.5 * largeSiteFactor; + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) + penalty += 5.0 * largeSiteFactor; + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit())) + penalty += 2.5 * largeSiteFactor; + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) + penalty += 2.5 * largeSiteFactor; + + if (isForum || isWiki) { + penalty = Math.min(0, penalty - 2); + } + + return (int) -penalty; + } + + /** Normalize a value to the range 0...15, where 0 is the best possible score + * + * @param value The value to normalize, must be positive or zero + * @param penalty Any negative component of the value + * */ + public static double normalize(double value, double penalty) { + if (value < 0) + value = 0; + + return Math.sqrt((1.0 + 500. 
+ 10 * penalty) / (1.0 + value)); + } + + + public static double calculateAvgMinDistance(CompiledQuery positions, ResultRankingContext ctx) { + double sum = 0; + int cnt = 0; + + for (int i = 0; i < positions.size(); i++) { + + // Skip terms that are not in the regular mask + if (!ctx.regularMask.get(i)) + continue; + + var posi = positions.at(i); + + // Skip terms that are not in the document + if (posi == null) + continue; + + for (int j = i + 1; j < positions.size(); j++) { + + // Skip terms that are not in the regular mask + if (!ctx.regularMask.get(j)) + continue; + + var posj = positions.at(j); + + // Skip terms that are not in the document + if (posj == null) + continue; + + int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator()); + sum += distance; + cnt++; + } + } + + if (cnt > 0) { + return sum / cnt; + } else { + return 1000.; + } + } + +} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java deleted file mode 100644 index 2facf59f..00000000 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ /dev/null @@ -1,165 +0,0 @@ -package nu.marginalia.index.results; - -import nu.marginalia.api.searchquery.model.compiled.*; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.index.index.CombinedIndexReader; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.model.SearchParameters; -import nu.marginalia.index.model.QueryParams; -import nu.marginalia.index.results.model.QuerySearchTerms; -import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.index.query.limit.QueryStrategy; -import nu.marginalia.ranking.results.ResultValuator; -import nu.marginalia.sequence.GammaCodedSequence; - -import 
javax.annotation.Nullable; - -import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; - -/** This class is responsible for calculating the score of a search result. - * It holds the data required to perform the scoring, as there is strong - * reasons to cache this data, and performs the calculations */ -public class IndexResultValuationContext { - private final CombinedIndexReader index; - private final QueryParams queryParams; - - private final ResultRankingContext rankingContext; - private final ResultValuator searchResultValuator; - private final CompiledQuery compiledQuery; - - public IndexResultValuationContext(ResultValuator searchResultValuator, - StatefulIndex statefulIndex, - ResultRankingContext rankingContext, - SearchParameters params) - { - this.index = statefulIndex.get(); - this.rankingContext = rankingContext; - this.searchResultValuator = searchResultValuator; - - this.queryParams = params.queryParams; - this.compiledQuery = params.compiledQuery; - } - - private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); - - @Nullable - public SearchResultItem calculatePreliminaryScore(long combinedId, - QuerySearchTerms searchTerms, - long[] wordFlags, - GammaCodedSequence[] positions) - { - if (!searchTerms.coherences.testMandatory(positions)) - return null; - - CompiledQuery positionsQuery = compiledQuery.root.newQuery(positions); - CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags); - int[] counts = new int[compiledQuery.size()]; - for (int i = 0; i < counts.length; i++) { - if (positions[i] != null) { - counts[i] = positions[i].valueCount(); - } - } - CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts); - - // If the document is not relevant to the query, abort early to reduce allocations and - // avoid unnecessary calculations - if 
(testRelevance(wordFlagsQuery, positionsCountQuery)) { - return null; - } - - long docId = UrlIdCodec.removeRank(combinedId); - long docMetadata = index.getDocumentMetadata(docId); - int htmlFeatures = index.getHtmlFeatures(docId); - int docSize = index.getDocumentSize(docId); - - int bestCoherence = searchTerms.coherences.testOptional(positions); - - double score = searchResultValuator.calculateSearchResultValue( - wordFlagsQuery, - positionsCountQuery, - positionsQuery, - docMetadata, - htmlFeatures, - docSize, - bestCoherence, - rankingContext, null); - - SearchResultItem searchResult = new SearchResultItem(docId, - docMetadata, - htmlFeatures); - - if (hasPrioTerm(searchTerms, positions)) { - score = 0.75 * score; - } - - searchResult.setScore(score); - - return searchResult; - } - - private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) { - boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent); - int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask)); - int positionsCount = intMaxMinAggregate(countsQuery, p -> p); - - if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) { - return true; - } - if (flagsCount == 0 && !allSynthetic && positionsCount == 0) { - return true; - } - - return false; - } - - private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) { - var allTerms = searchTerms.termIdsAll; - var prioTerms = searchTerms.termIdsPrio; - - for (int i = 0; i < allTerms.size(); i++) { - if (positions[i] != null && prioTerms.contains(allTerms.at(i))) { - return true; - } - } - - return false; - } - - private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, - QueryStrategy queryStrategy) - { - if (queryStrategy == QueryStrategy.AUTO || - queryStrategy == QueryStrategy.SENTENCE || - queryStrategy == QueryStrategy.TOPIC) { - return true; - } - - return 
booleanAggregate(queryGraphScores, - docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); - } - - private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { - if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordFlags.Site.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordFlags.Subjects.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return WordFlags.Title.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordFlags.UrlPath.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordFlags.UrlDomain.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordFlags.ExternalLink.isPresent(wordMeta); - } - return true; - } - -} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java deleted file mode 100644 index fbe99cb1..00000000 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ /dev/null @@ -1,210 +0,0 @@ -package nu.marginalia.index.results; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import it.unimi.dsi.fastutil.longs.LongSet; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import 
nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.model.SearchParameters; -import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.linkdb.docs.DocumentDbReader; -import nu.marginalia.linkdb.model.DocdbUrlDetail; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultValuator; -import nu.marginalia.sequence.GammaCodedSequence; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.lang.foreign.Arena; -import java.sql.SQLException; -import java.util.*; - -@Singleton -public class IndexResultValuatorService { - private static final Logger logger = LoggerFactory.getLogger(IndexResultValuatorService.class); - - private final IndexMetadataService metadataService; - private final DocumentDbReader documentDbReader; - private final ResultValuator resultValuator; - private final StatefulIndex statefulIndex; - - @Inject - public IndexResultValuatorService(IndexMetadataService metadataService, - DocumentDbReader documentDbReader, - ResultValuator resultValuator, - StatefulIndex statefulIndex) - { - this.metadataService = metadataService; - this.documentDbReader = documentDbReader; - this.resultValuator = resultValuator; - this.statefulIndex = statefulIndex; - } - - public List rankResults(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) - { - IndexResultValuationContext evaluator = - new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params); - - List results = new ArrayList<>(resultIds.size()); - - try (var arena = Arena.ofConfined()) { - // Batch-fetch the word metadata for the documents - - var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll); - - // Prepare data for the document. 
We do this outside of the calculation function to avoid - // hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there; - // out here we can rely on implicit array ordering to match up the data. - - var ra = resultIds.array(); - long[] flags = new long[searchTerms.termIdsAll.size()]; - GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()]; - - for (int i = 0; i < ra.length; i++) { - long id = ra[i]; - - // Prepare term-level data for the document - for (int ti = 0; ti < flags.length; ti++) { - long tid = searchTerms.termIdsAll.at(ti); - var tfd = termsForDocs.get(tid); - - assert tfd != null : "No term data for term " + ti; - - flags[ti] = tfd.flag(i); - positions[ti] = tfd.position(i); - } - - // Calculate the preliminary score - - var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions); - if (score != null) { - results.add(score); - } - } - - return results; - } - } - - - public List selectBestResults(SearchParameters params, - Collection results) throws SQLException { - - var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); - - List resultsList = new ArrayList<>(results.size()); - - for (var item : results) { - if (domainCountFilter.test(item)) { - // It's important that this filter runs across all results, not just the top N - if (resultsList.size() < params.limitTotal) { - resultsList.add(item); - } - } - } - - for (var item : resultsList) { - item.resultsFromDomain = domainCountFilter.getCount(item); - } - - return decorateResults(resultsList, params.compiledQuery); - } - - /** Decorate the result items with additional information from the link database - * and calculate an updated ranking with the additional information */ - public List decorateResults(List rawResults, - CompiledQuery compiledQuery) - throws SQLException - { - TLongList idsList = new TLongArrayList(rawResults.size()); - - for (var result : rawResults) - 
idsList.add(result.getDocumentId()); - - Map urlDetailsById = new HashMap<>(rawResults.size()); - - for (var item : documentDbReader.getUrlDetails(idsList)) - urlDetailsById.put(item.urlId(), item); - - List resultItems = new ArrayList<>(rawResults.size()); - for (var result : rawResults) { - var id = result.getDocumentId(); - var docData = urlDetailsById.get(id); - - if (docData == null) { - logger.warn("No document data for id {}", id); - continue; - } - - resultItems.add(createCombinedItem( - result, - docData)); - } - return resultItems; - } - - private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, - DocdbUrlDetail docData) { - - ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor(); - // Consumer detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null; - - return new DecoratedSearchResultItem( - result, - docData.url(), - docData.title(), - docData.description(), - docData.urlQuality(), - docData.format(), - docData.features(), - docData.pubYear(), - docData.dataHash(), - docData.wordsTotal(), - 0L, //bestPositions(wordMetas), - result.getScore(), - detailsExtractor.get() - ); - } - - private static class ResultRankingDetailsExtractor { - private ResultRankingDetails value = null; - - public ResultRankingDetails get() { - return value; - } - public void set(ResultRankingDetails value) { - this.value = value; - } - } - - private long bestPositions(CompiledQueryLong wordMetas) { - LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); - - int bestPc = 0; - long bestPositions = 0; - - var li = positionsSet.longIterator(); - - while (li.hasNext()) { - long pos = li.nextLong(); - int pc = Long.bitCount(pos); - if (pc > bestPc) { - bestPc = pc; - bestPositions = pos; - } - } - - return bestPositions; - } -} diff --git a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java 
b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java index 7845f14f..43f5c575 100644 --- a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java +++ b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java @@ -32,6 +32,7 @@ public final class CombinedDocIdList { public int size() { return data.length; } + public long at(int i) { return data[i]; } public LongStream stream() { return Arrays.stream(data); diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java deleted file mode 100644 index 6ab72eef..00000000 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ /dev/null @@ -1,209 +0,0 @@ -package nu.marginalia.ranking.results; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.ranking.results.factors.*; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import nu.marginalia.sequence.GammaCodedSequence; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; -import java.util.function.Consumer; - -@Singleton -public class ResultValuator { - final static double scalingFactor = 
500.; - - private final TermCoherenceFactor termCoherenceFactor; - - private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); - - @Inject - public ResultValuator(TermCoherenceFactor termCoherenceFactor) { - this.termCoherenceFactor = termCoherenceFactor; - } - - public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, - CompiledQueryInt positionsCountQuery, CompiledQuery positionsQuery, long documentMetadata, - int features, - int length, - int bestCoherence, - ResultRankingContext ctx, - @Nullable Consumer detailsConsumer - ) - { - if (wordFlagsQuery.isEmpty()) - return Double.MAX_VALUE; - - if (length < 0) { - length = 5000; - } - - var rankingParams = ctx.params; - - int rank = DocumentMetadata.decodeRank(documentMetadata); - int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); - int quality = DocumentMetadata.decodeQuality(documentMetadata); - int size = DocumentMetadata.decodeSize(documentMetadata); - int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size); - int topology = DocumentMetadata.decodeTopology(documentMetadata); - int year = DocumentMetadata.decodeYear(documentMetadata); - - double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty); - - final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams); - final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus; - final double topologyBonus = Math.log(1 + topology); - final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 
0 : -rankingParams.shortDocumentPenalty; - final double temporalBias; - - if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) { - temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight; - } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) { - temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight; - } else { - temporalBias = 0; - } - - double overallPart = averageSentenceLengthPenalty - + documentLengthPenalty - + qualityPenalty - + rankingBonus - + topologyBonus - + temporalBias - + flagsPenalty - + bestCoherence; - - // FIXME: need a weighting factor here - double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx)); - double tcfFirstPosition = 0.; - - double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx)); - - double overallPartPositive = Math.max(0, overallPart); - double overallPartNegative = -Math.min(0, overallPart); - - if (null != detailsConsumer) { - var details = new ResultRankingDetails( - new ResultRankingInputs( - rank, - asl, - quality, - size, - topology, - year, - DocumentFlags.decode(documentMetadata).stream().map(Enum::name).toList() - ), - new ResultRankingOutputs( - averageSentenceLengthPenalty, - qualityPenalty, - rankingBonus, - topologyBonus, - documentLengthPenalty, - temporalBias, - flagsPenalty, - overallPart, - bM25, - tcfAvgDist, - tcfFirstPosition) - ); - - detailsConsumer.accept(details); - } - - // Renormalize to 0...15, where 0 is the best possible score; - // this is a historical artifact of the original ranking function - double ret = normalize( - tcfAvgDist + tcfFirstPosition - + bM25 - + overallPartPositive, - overallPartNegative); - - if (Double.isNaN(ret)) { - if (getClass().desiredAssertionStatus()) { - throw new IllegalStateException("NaN in 
result value calculation"); - } - - return Double.MAX_VALUE; - } - else { - return ret; - } - } - - private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { - if (size < 400) { - if (quality < 5) - return 0; - return -quality * rankingParams.qualityPenalty; - } - else { - return -quality * rankingParams.qualityPenalty * 20; - } - } - - private int flagsPenalty(int featureFlags, long docFlags, int size) { - - // Short-circuit for index-service, which does not have the feature flags - if (featureFlags == 0) - return 0; - - double penalty = 0; - - boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags); - boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags); - boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags); - - // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site - double largeSiteFactor = 1.; - - if (!isForum && !isWiki && !isDocs && size > 400) { - // Long urls-that-look-like-this tend to be poor search results - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) - penalty += 30.0; - else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit())) - penalty += 30.; - else penalty += 5.; - - largeSiteFactor = 2; - } - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) - penalty += 7.5 * largeSiteFactor; - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) - penalty += 5.0 * largeSiteFactor; - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit())) - penalty += 2.5 * largeSiteFactor; - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) - penalty += 2.5 * largeSiteFactor; - - if (isForum || isWiki) { - penalty = Math.min(0, penalty - 2); - } - - return (int) -penalty; - } - - public static double normalize(double value, double penalty) { - if 
(value < 0) - value = 0; - - return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java deleted file mode 100644 index 1fb26f6b..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java +++ /dev/null @@ -1,127 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.compiled.CqDataLong; -import nu.marginalia.api.searchquery.model.compiled.CqExpression; -import nu.marginalia.api.searchquery.model.results.Bm25Parameters; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; - -import java.util.List; - -public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { - private static final long AVG_LENGTH = 5000; - - private final CqDataLong wordMetaData; - private final CqDataInt frequencies; - private final Bm25Parameters bm25Parameters; - - private final int docCount; - - public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - ResultRankingContext ctx) { - this.bm25Parameters = bm25Parameters; - this.docCount = ctx.termFreqDocCount(); - this.wordMetaData = wordMetaData; - this.frequencies = ctx.fullCounts; - } - - @Override - public double onAnd(List parts) { - double value = 0; - for (var part : parts) { - value += part.visit(this); - } - return value; - } - - @Override - public double onOr(List parts) { - double value = 0; - for (var part : parts) { - value = Math.max(value, part.visit(this)); - } - return value; - } - - @Override - public double onLeaf(int idx) { - double count = evaluatePriorityScore(wordMetaData.get(idx)); - - int freq = frequencies.get(idx); - - // note we 
override b to zero for priority terms as they are independent of document length - return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - } - - private static double evaluatePriorityScore(long wordMeta) { - int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta)); - - double qcount = 0.; - - if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) { - - qcount += 2.5; - - if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) - qcount += 2.5; - else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.5; - - if ((wordMeta & WordFlags.Site.asBit()) != 0) - qcount += 1.25; - if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 1.25; - } - else { - if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) - qcount += 3; - else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) - qcount += 1; - - if ((wordMeta & WordFlags.Site.asBit()) != 0) - qcount += 0.5; - if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 0.5; - } - - if ((wordMeta & WordFlags.Title.asBit()) != 0) - qcount += 1.5; - - if (pcount > 2) { - if ((wordMeta & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((wordMeta & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; - } - - return qcount; - } - - - /** - * - * @param docCount Number of documents - * @param freq Number of matching documents - */ - private double invFreq(int docCount, int freq) { - return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); - } - - /** - * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length normalization - * @param count number of occurrences in the document - * @param length document length - */ - private double f(double k, double b, double count, int length) { - final double lengthRatio = (double) length / AVG_LENGTH; - - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); - } -} diff --git 
a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java deleted file mode 100644 index 2ebef7cd..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ /dev/null @@ -1,53 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.sequence.GammaCodedSequence; -import nu.marginalia.sequence.SequenceOperations; - -/** Rewards documents where terms appear frequently within the same sentences - */ -public class TermCoherenceFactor { - - public double calculateAvgMinDistance(CompiledQuery positions, ResultRankingContext ctx) { - double sum = 0; - int cnt = 0; - - for (int i = 0; i < positions.size(); i++) { - - // Skip terms that are not in the regular mask - if (!ctx.regularMask.get(i)) - continue; - - var posi = positions.at(i); - - // Skip terms that are not in the document - if (posi == null) - continue; - - for (int j = i + 1; j < positions.size(); j++) { - - // Skip terms that are not in the regular mask - if (!ctx.regularMask.get(j)) - continue; - - var posj = positions.at(j); - - // Skip terms that are not in the document - if (posj == null) - continue; - - int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator()); - sum += distance; - cnt++; - } - } - - if (cnt > 0) { - return sum / cnt; - } else { - return 1000.; - } - } - -} \ No newline at end of file diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index f4740e31..4966e5f0 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ 
b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, false, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, Double.NaN); } } \ No newline at end of file diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 901174f4..c7214060 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -87,7 +87,7 @@ public class SearchQueryIndexService { detail.features, DomainIndexingState.ACTIVE, detail.rankingScore, // termScore - detail.resultsFromDomain(), + detail.resultsFromDomain, getPositionsString(detail), Long.bitCount(detail.bestPositions), detail.rawIndexResult, diff --git a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java index be3fe0b7..76fb62fc 100644 --- a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java +++ b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java @@ -103,6 +103,7 @@ public class SearchServicePaperDoll extends AbstractModule { 400, positions, score, + 4, null) ); }