diff --git a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java
index ba48f3ec..5b6112fe 100644
--- a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java
+++ b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java
@@ -22,6 +22,12 @@ import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
+/** Reads the document database, which is a SQLite database
+ * containing the URLs and metadata of the documents in the
+ * index.
+ *
+ * The database is created by the DocumentDbWriter class.
+ * */
@Singleton
public class DocumentDbReader {
private final Path dbFile;
@@ -52,6 +58,11 @@ public class DocumentDbReader {
}
}
+ /** Switches the input database file to a new file.
+ *
+ * This is used to switch over to a new database file
+ * when the index is re-indexed.
+ * */
public void switchInput(Path newDbFile) throws IOException, SQLException {
if (!Files.isRegularFile(newDbFile)) {
logger.error("Source is not a file, refusing switch-over {}", newDbFile);
@@ -78,35 +89,11 @@ public class DocumentDbReader {
connection = createConnection();
}
- public List getUrlsFromDomain(int domainId) throws SQLException {
- if (connection == null ||
- connection.isClosed())
- {
- throw new RuntimeException("URL query temporarily unavailable due to database switch");
- }
-
- long minId = UrlIdCodec.encodeId(domainId, 0);
- long maxId = UrlIdCodec.encodeId(domainId+1, 0);
-
- List ret = new ArrayList<>();
-
- try (var stmt = connection.prepareStatement("""
- SELECT URL
- FROM DOCUMENT
- WHERE ID >= ? AND ID < ?
- """))
- {
- stmt.setLong(1, minId);
- stmt.setLong(2, maxId);
- var rs = stmt.executeQuery();
- while (rs.next()) {
- ret.add(rs.getString(1));
- }
- }
-
- return ret;
- }
-
+ /** Returns the URL details for the given document ids.
+ *
+ * This is used to get the URL details for the search
+ * results.
+ * */
public List getUrlDetails(TLongList ids) throws SQLException {
List ret = new ArrayList<>(ids.size());
diff --git a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java
index e843e826..faa98bf5 100644
--- a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java
+++ b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java
@@ -9,6 +9,10 @@ import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.List;
+/** Writes the document database, which is a SQLite database
+ * containing the URLs and metadata of the documents in the
+ * index.
+ * */
public class DocumentDbWriter {
private final Connection connection;
diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java
index 5d79cfea..46681de4 100644
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java
@@ -130,6 +130,7 @@ public class QueryProtobufCodec {
results.getWordsTotal(),
results.getBestPositions(),
results.getRankingScore(),
+ results.getResultsFromDomain(),
convertRankingDetails(results.getRankingDetails())
);
}
@@ -187,7 +188,6 @@ public class QueryProtobufCodec {
rawItem.getEncodedDocMetadata(),
rawItem.getHtmlFeatures(),
keywordScores,
- rawItem.getResultsFromDomain(),
rawItem.getHasPriorityTerms(),
Double.NaN // Not set
);
@@ -256,6 +256,7 @@ public class QueryProtobufCodec {
rpcDecoratedResultItem.getWordsTotal(),
rpcDecoratedResultItem.getBestPositions(),
rpcDecoratedResultItem.getRankingScore(),
+ rpcDecoratedResultItem.getResultsFromDomain(),
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
);
}
diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
index 0522e7bc..8a9b690b 100644
--- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java
@@ -34,6 +34,8 @@ public class DecoratedSearchResultItem implements Comparable keywordScores() {
return rawIndexResult.getKeywordScores();
@@ -72,6 +71,7 @@ public class DecoratedSearchResultItem implements Comparable {
/** How did the subqueries match against the document ? */
public final List keywordScores;
- /** How many other potential results existed in the same domain */
- public int resultsFromDomain;
-
public boolean hasPrioTerm;
public SearchResultItem(long combinedId,
diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto
index a29b7010..642b28ed 100644
--- a/code/functions/search-query/api/src/main/protobuf/query-api.proto
+++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto
@@ -93,12 +93,12 @@ message RpcDecoratedResultItem {
double rankingScore = 11; // The ranking score of this search result item, lower is better
int64 bestPositions = 12;
RpcResultRankingDetails rankingDetails = 13; // optional, only present if exportDebugData is true in RpcResultRankingParameters
+ int32 resultsFromDomain = 14;
}
/** A raw index-service view of a search result */
message RpcRawResultItem {
int64 combinedId = 1; // raw ID with bit-encoded ranking information still present
- int32 resultsFromDomain = 2; // number of other results from the same domain
int64 encodedDocMetadata = 3; // bit encoded document metadata
int32 htmlFeatures = 4; // bitmask encoding features of the document
repeated RpcResultKeywordScore keywordScores = 5;
diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java
index 1dc847b8..58a9a4b0 100644
--- a/code/index/java/nu/marginalia/index/IndexGrpcService.java
+++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java
@@ -20,7 +20,7 @@ import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexSearchBudget;
-import nu.marginalia.index.results.IndexResultValuatorService;
+import nu.marginalia.index.results.IndexResultRankingService;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.index.searchset.SmallSearchSet;
@@ -81,7 +81,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
private final StatefulIndex statefulIndex;
private final SearchSetsService searchSetsService;
- private final IndexResultValuatorService resultValuator;
+ private final IndexResultRankingService resultValuator;
private final String nodeName;
@@ -91,7 +91,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
StatefulIndex statefulIndex,
SearchSetsService searchSetsService,
- IndexResultValuatorService resultValuator)
+ IndexResultRankingService resultValuator)
{
var nodeId = serviceConfiguration.node();
this.nodeName = Integer.toString(nodeId);
@@ -135,7 +135,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
var rawItem = RpcRawResultItem.newBuilder();
rawItem.setCombinedId(rawResult.combinedId);
- rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
rawItem.setHasPriorityTerms(rawResult.hasPrioTerm);
@@ -159,6 +158,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
.setUrlQuality(result.urlQuality)
.setWordsTotal(result.wordsTotal)
.setBestPositions(result.bestPositions)
+ .setResultsFromDomain(result.resultsFromDomain)
.setRawItem(rawItem);
var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails);
diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
index cd416ca3..abdbc836 100644
--- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
+++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
@@ -3,7 +3,6 @@ package nu.marginalia.index.index;
import java.util.List;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.index.FullReverseIndexReader;
-import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.query.IndexQuery;
import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterAnyOf;
diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
similarity index 66%
rename from code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java
rename to code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
index 88a592bb..9416bf13 100644
--- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java
+++ b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
@@ -1,33 +1,38 @@
-package nu.marginalia.ranking.results.factors;
+package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
-import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.model.idx.WordMetadata;
import java.util.BitSet;
import java.util.List;
-public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
+/** Visitor for calculating the best BM25 score for a graph representing a search query
+ */
+public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
private static final long AVG_LENGTH = 5000;
private final CqDataInt counts;
private final CqDataInt frequencies;
- private final Bm25Parameters bm25Parameters;
+
+ private final double k1;
+ private final double b;
private final int docCount;
private final int length;
private final BitSet mask;
- public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
- CqDataInt counts,
- int length,
- ResultRankingContext ctx) {
+ public Bm25GraphVisitor(Bm25Parameters bm25Parameters,
+ CqDataInt counts,
+ int length,
+ ResultRankingContext ctx) {
this.length = length;
- this.bm25Parameters = bm25Parameters;
+
+ this.k1 = bm25Parameters.k();
+ this.b = bm25Parameters.b();
+
this.docCount = ctx.termFreqDocCount();
this.counts = counts;
this.frequencies = ctx.fullCounts;
@@ -37,9 +42,11 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
@Override
public double onAnd(List extends CqExpression> parts) {
double value = 0;
+
for (var part : parts) {
value += part.visit(this);
}
+
return value;
}
@@ -59,10 +66,9 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
}
double count = counts.get(idx);
-
int freq = frequencies.get(idx);
- return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length);
+ return invFreq(docCount, freq) * f(count, length);
}
/**
@@ -76,14 +82,12 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
/**
*
- * @param k determines the size of the impact of a single term
- * @param b determines the magnitude of the length normalization
* @param count number of occurrences in the document
* @param length document length
*/
- private double f(double k, double b, double count, int length) {
+ private double f(double count, int length) {
final double lengthRatio = (double) length / AVG_LENGTH;
- return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
+ return (count * (k1 + 1)) / (count + k1 * (1 - b + b * lengthRatio));
}
}
diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
deleted file mode 100644
index 86437f02..00000000
--- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java
+++ /dev/null
@@ -1,96 +0,0 @@
-package nu.marginalia.index.results;
-
-import com.google.inject.Inject;
-import gnu.trove.map.hash.TObjectLongHashMap;
-import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
-import it.unimi.dsi.fastutil.longs.LongArrayList;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.query.SearchQuery;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.model.SearchTermsUtil;
-import nu.marginalia.index.results.model.QuerySearchTerms;
-import nu.marginalia.index.results.model.TermCoherenceGroupList;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.index.results.model.ids.TermMetadataList;
-import nu.marginalia.index.results.model.ids.TermIdList;
-
-import java.lang.foreign.Arena;
-import java.util.ArrayList;
-
-import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
-
-public class IndexMetadataService {
- private final StatefulIndex statefulIndex;
-
- @Inject
- public IndexMetadataService(StatefulIndex index) {
- this.statefulIndex = index;
- }
-
- public Long2ObjectArrayMap
- getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList)
- {
- var currentIndex = statefulIndex.get();
-
- Long2ObjectArrayMap termdocToMeta =
- new Long2ObjectArrayMap<>(termIdsList.size());
-
- for (long termId : termIdsList.array()) {
- termdocToMeta.put(termId, currentIndex.getTermMetadata(arena, termId, combinedIdsAll));
- }
-
- return termdocToMeta;
- }
-
- public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) {
-
- LongArrayList termIdsList = new LongArrayList();
- LongArrayList termIdsPrio = new LongArrayList();
-
- TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
-
- for (String word : compiledQuery) {
- long id = SearchTermsUtil.getWordId(word);
- termIdsList.add(id);
- termToId.put(word, id);
- }
-
- for (var term : searchQuery.searchTermsAdvice) {
- if (termToId.containsKey(term)) {
- continue;
- }
-
- long id = SearchTermsUtil.getWordId(term);
- termIdsList.add(id);
- termToId.put(term, id);
- }
-
- for (var term : searchQuery.searchTermsPriority) {
- if (termToId.containsKey(term)) {
- long id = SearchTermsUtil.getWordId(term);
- termIdsPrio.add(id);
- }
- else {
- long id = SearchTermsUtil.getWordId(term);
- termIdsList.add(id);
- termIdsPrio.add(id);
- termToId.put(term, id);
- }
- }
-
- var idsAll = new TermIdList(termIdsList);
- var idsPrio = new TermIdList(termIdsPrio);
-
- var constraints = new ArrayList();
- for (var coherence : searchQuery.searchTermCoherences) {
- constraints.add(new TermCoherenceGroup(coherence, idsAll));
- }
-
- return new QuerySearchTerms(termToId,
- idsAll,
- idsPrio,
- new TermCoherenceGroupList(constraints)
- );
- }
-
-}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java
new file mode 100644
index 00000000..4b455580
--- /dev/null
+++ b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java
@@ -0,0 +1,229 @@
+package nu.marginalia.index.results;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import gnu.trove.list.TLongList;
+import gnu.trove.list.array.TLongArrayList;
+import gnu.trove.map.hash.TObjectLongHashMap;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
+import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
+import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+import nu.marginalia.api.searchquery.model.results.SearchResultItem;
+import nu.marginalia.index.index.CombinedIndexReader;
+import nu.marginalia.index.index.StatefulIndex;
+import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.model.SearchTermsUtil;
+import nu.marginalia.index.results.model.QuerySearchTerms;
+import nu.marginalia.index.results.model.TermCoherenceGroupList;
+import nu.marginalia.index.results.model.ids.CombinedDocIdList;
+import nu.marginalia.index.results.model.ids.TermIdList;
+import nu.marginalia.index.results.model.ids.TermMetadataList;
+import nu.marginalia.linkdb.docs.DocumentDbReader;
+import nu.marginalia.linkdb.model.DocdbUrlDetail;
+import nu.marginalia.sequence.GammaCodedSequence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.foreign.Arena;
+import java.sql.SQLException;
+import java.util.*;
+
+@Singleton
+public class IndexResultRankingService {
+ private static final Logger logger = LoggerFactory.getLogger(IndexResultRankingService.class);
+
+ private final DocumentDbReader documentDbReader;
+ private final StatefulIndex statefulIndex;
+
+    /** Creates the ranking service.
+     *
+     * @param documentDbReader reader used to look up document details for the selected results
+     * @param statefulIndex    holder from which the current index reader is obtained
+     * */
+    @Inject
+    public IndexResultRankingService(DocumentDbReader documentDbReader, StatefulIndex statefulIndex) {
+        this.statefulIndex = statefulIndex;
+        this.documentDbReader = documentDbReader;
+    }
+
+    /** Calculates a score for each candidate document in {@code resultIds}.
+     *
+     * Term metadata is fetched once per term for the whole batch of documents, and then
+     * regrouped per document before being handed to the score calculator.
+     *
+     * @param params         search parameters; the compiled query and the search query are read from these
+     * @param rankingContext ranking context passed on to the score calculator
+     * @param resultIds      combined document ids of the candidate documents
+     * @return the scored results in the iteration order of {@code resultIds}; documents that fail the
+     *         mandatory coherence constraints, or that the calculator rejects by returning null, are left out
+     * */
+    public List<SearchResultItem> rankResults(SearchParameters params,
+                                              ResultRankingContext rankingContext,
+                                              CombinedDocIdList resultIds)
+    {
+        // NOTE(review): the calculator obtains its own reader through statefulIndex.get() in its
+        // constructor, separately from currentIndex below -- confirm both always see the same index
+        IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params);
+
+        List<SearchResultItem> results = new ArrayList<>(resultIds.size());
+
+        // Get the current index reader, which is the one we'll use for this calculation,
+        // this may change during the calculation, but we don't want to switch over mid-calculation
+        final CombinedIndexReader currentIndex = statefulIndex.get();
+
+        final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query);
+        final int termCount = searchTerms.termIdsAll.size();
+
+        // We use an arena for the position data to avoid gc pressure
+        // from the gamma coded sequences, which can be large and have a lifetime
+        // that matches the try block here
+        try (var arena = Arena.ofConfined()) {
+
+            // One metadata list per term, each covering every document in resultIds
+            TermMetadataList[] termsForDocs = new TermMetadataList[termCount];
+            for (int ti = 0; ti < termCount; ti++) {
+                termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds);
+            }
+
+            // Data for the document.  We arrange this in arrays outside the calculation function to avoid
+            // hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache
+            // thrashing in there; out here we can rely on implicit array ordering to match up the data.
+            // The arrays are reused (overwritten) for each document.
+
+            long[] flags = new long[termCount];
+            GammaCodedSequence[] positions = new GammaCodedSequence[termCount];
+
+            // Iterate over documents by their index in the combinedDocIds, as we need the index for the
+            // term data arrays as well
+
+            for (int i = 0; i < resultIds.size(); i++) {
+
+                // Prepare term-level data for the document
+                for (int ti = 0; ti < flags.length; ti++) {
+                    var tfd = termsForDocs[ti];
+
+                    assert tfd != null : "No term data for term " + ti;
+
+                    flags[ti] = tfd.flag(i);
+                    positions[ti] = tfd.position(i);
+                }
+
+                // Ignore documents that don't match the mandatory constraints
+                if (!searchTerms.coherences.testMandatory(positions)) {
+                    continue;
+                }
+
+                // Calculate the preliminary score; a null return means the document was rejected
+                var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions);
+                if (score != null) {
+                    results.add(score);
+                }
+            }
+
+            return results;
+        }
+    }
+
+
+    /** Picks the results to return and decorates them with document details.
+     *
+     * Results are taken in the iteration order of {@code results}; at most {@code params.limitTotal}
+     * results that pass the per-domain filter are kept.  Kept results for which the document
+     * database has no entry are logged and dropped.
+     *
+     * @param params  search parameters; {@code limitByDomain} and {@code limitTotal} are read from these
+     * @param results the scored results to select from
+     * @return the decorated results, in the order they were selected
+     * @throws SQLException if thrown by the document database lookup
+     * */
+    public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
+                                                             Collection<SearchResultItem> results) throws SQLException {
+
+        var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
+
+        List<SearchResultItem> resultsList = new ArrayList<>(results.size());
+        TLongList idsList = new TLongArrayList(params.limitTotal);
+
+        for (var item : results) {
+            if (domainCountFilter.test(item)) {
+
+                if (resultsList.size() < params.limitTotal) {
+                    resultsList.add(item);
+                    idsList.add(item.getDocumentId());
+                }
+                //
+                // else { break; } <-- don't add this even though it looks like it should be present!
+                //
+                // It's important that this filter runs across all results, not just the top N,
+                // so we shouldn't break the loop in a putative else-case here!
+                //
+
+            }
+        }
+
+        // Fetch the document details for the selected results in one go, from the local document database
+        // for this index partition
+        Map<Long, DocdbUrlDetail> detailsById = new HashMap<>(idsList.size());
+        for (var item : documentDbReader.getUrlDetails(idsList)) {
+            detailsById.put(item.urlId(), item);
+        }
+
+        List<DecoratedSearchResultItem> resultItems = new ArrayList<>(resultsList.size());
+
+        // Decorate the results with the document details
+        for (var result : resultsList) {
+            final long id = result.getDocumentId();
+            final DocdbUrlDetail docData = detailsById.get(id);
+
+            if (docData == null) {
+                logger.warn("No document data for id {}", id);
+                continue;
+            }
+
+            // Create a decorated search result item from the result and the document data
+            resultItems.add(new DecoratedSearchResultItem(
+                    result,
+                    docData.url(),
+                    docData.title(),
+                    docData.description(),
+                    docData.urlQuality(),
+                    docData.format(),
+                    docData.features(),
+                    docData.pubYear(),
+                    docData.dataHash(),
+                    docData.wordsTotal(),
+                    0L, // best positions are not calculated here
+                    result.getScore(),
+                    domainCountFilter.getCount(result),
+                    null // no ranking details are attached here
+            ));
+        }
+
+        return resultItems;
+    }
+
+
+    /** Resolves the words of a query into term ids.
+     *
+     * The list of all term ids is built in this order: every word of the compiled query
+     * (in iteration order, without de-duplication), then each advice term not seen so far,
+     * then each priority term not seen so far.  Every priority term additionally goes into
+     * the priority id list, whether or not it was seen before.
+     *
+     * @param compiledQuery the compiled query, iterated for its words
+     * @param searchQuery   the search query supplying advice terms, priority terms and coherences
+     * @return the word-to-id mapping, both id lists, and one coherence group per entry in
+     *         {@code searchQuery.searchTermCoherences}
+     * */
+    public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
+
+        LongArrayList termIdsList = new LongArrayList();
+        LongArrayList termIdsPrio = new LongArrayList();
+
+        // The map's no-entry value is -1
+        TObjectLongHashMap<String> termToId = new TObjectLongHashMap<>(10, 0.75f, -1);
+
+        for (String word : compiledQuery) {
+            long id = SearchTermsUtil.getWordId(word);
+            termIdsList.add(id);
+            termToId.put(word, id);
+        }
+
+        for (var term : searchQuery.searchTermsAdvice) {
+            if (termToId.containsKey(term)) {
+                continue;
+            }
+
+            long id = SearchTermsUtil.getWordId(term);
+            termIdsList.add(id);
+            termToId.put(term, id);
+        }
+
+        for (var term : searchQuery.searchTermsPriority) {
+            long id = SearchTermsUtil.getWordId(term);
+
+            // Only terms we have not already registered are added to the list of all terms
+            if (!termToId.containsKey(term)) {
+                termIdsList.add(id);
+                termToId.put(term, id);
+            }
+
+            termIdsPrio.add(id);
+        }
+
+        var idsAll = new TermIdList(termIdsList);
+        var idsPrio = new TermIdList(termIdsPrio);
+
+        var constraints = new ArrayList<TermCoherenceGroupList.TermCoherenceGroup>();
+        for (var coherence : searchQuery.searchTermCoherences) {
+            constraints.add(new TermCoherenceGroupList.TermCoherenceGroup(coherence, idsAll));
+        }
+
+        return new QuerySearchTerms(termToId,
+                idsAll,
+                idsPrio,
+                new TermCoherenceGroupList(constraints)
+        );
+    }
+}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
new file mode 100644
index 00000000..20af5f92
--- /dev/null
+++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java
@@ -0,0 +1,349 @@
+package nu.marginalia.index.results;
+
+import nu.marginalia.api.searchquery.model.compiled.*;
+import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.api.searchquery.model.results.SearchResultItem;
+import nu.marginalia.index.index.CombinedIndexReader;
+import nu.marginalia.index.index.StatefulIndex;
+import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.model.QueryParams;
+import nu.marginalia.index.results.model.QuerySearchTerms;
+import nu.marginalia.model.crawl.HtmlFeature;
+import nu.marginalia.model.crawl.PubDate;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.index.query.limit.QueryStrategy;
+import nu.marginalia.sequence.GammaCodedSequence;
+import nu.marginalia.sequence.SequenceOperations;
+
+import javax.annotation.Nullable;
+
+import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
+
+/** This class is responsible for calculating the score of a search result.
+ * It holds the data required to perform the scoring, as there are strong
+ * reasons to cache this data, and performs the calculations */
+public class IndexResultScoreCalculator {
+ private final CombinedIndexReader index;
+ private final QueryParams queryParams;
+
+ private final ResultRankingContext rankingContext;
+ private final CompiledQuery compiledQuery;
+
+    /** Captures the state needed for scoring results.
+     *
+     * @param statefulIndex  source of the index reader; {@code get()} is called once, here
+     * @param rankingContext context passed on to the result value calculation
+     * @param params         search parameters, from which the query params and compiled query are taken
+     * */
+    public IndexResultScoreCalculator(StatefulIndex statefulIndex,
+                                      ResultRankingContext rankingContext,
+                                      SearchParameters params) {
+        this.index = statefulIndex.get();
+
+        this.compiledQuery = params.compiledQuery;
+        this.queryParams = params.queryParams;
+
+        this.rankingContext = rankingContext;
+    }
+
+ private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
+
+    /** Calculates the score of a single document.
+     *
+     * @param combinedId  combined document id; the rank part is removed before the index lookups
+     * @param searchTerms term ids and coherence constraints of the query
+     * @param wordFlags   word flags per term, for this document
+     * @param positions   position sequence per term, for this document; null entries are
+     *                    treated as the term having no positions
+     * @return the scored result, or null if the document is rejected as irrelevant to the query
+     * */
+    @Nullable
+    public SearchResultItem calculateScore(long combinedId,
+                                           QuerySearchTerms searchTerms,
+                                           long[] wordFlags,
+                                           GammaCodedSequence[] positions)
+    {
+
+        CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
+
+        // Number of positions per query term; stays 0 where there is no position data
+        int[] counts = new int[compiledQuery.size()];
+
+        for (int i = 0; i < counts.length; i++) {
+            if (positions[i] != null) {
+                counts[i] = positions[i].valueCount();
+            }
+        }
+        CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
+        CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
+
+        // If the document is not relevant to the query, abort early to reduce allocations and
+        // avoid unnecessary calculations.  Note that testRelevance() returns true when
+        // the document should be discarded.
+        if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
+            return null;
+        }
+
+        long docId = UrlIdCodec.removeRank(combinedId);
+        long docMetadata = index.getDocumentMetadata(docId);
+        int htmlFeatures = index.getHtmlFeatures(docId);
+        int docSize = index.getDocumentSize(docId);
+
+        int bestCoherence = searchTerms.coherences.testOptional(positions);
+
+        double score = calculateSearchResultValue(
+                wordFlagsQuery,
+                positionsCountQuery,
+                positionsQuery,
+                docMetadata,
+                htmlFeatures,
+                docSize,
+                bestCoherence,
+                rankingContext);
+
+        SearchResultItem searchResult = new SearchResultItem(docId,
+                docMetadata,
+                htmlFeatures);
+
+        // Documents containing a priority term have their score scaled down by a quarter
+        if (hasPrioTerm(searchTerms, positions)) {
+            score = 0.75 * score;
+        }
+
+        searchResult.setScore(score);
+
+        return searchResult;
+    }
+
+    /** Decides whether a document should be discarded as irrelevant.
+     *
+     * Despite the name, a return value of true means the document is NOT relevant:
+     * either the query strategy requirements are not met, or there are no flags
+     * matching the filter mask, not all terms are synthetic, and there are no positions.
+     * */
+    private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
+        final boolean everyTermSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
+        final int maskedFlagBits = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
+        final int positionTotal = intMaxMinAggregate(countsQuery, p -> p);
+
+        final boolean strategySatisfied = meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy());
+
+        return !strategySatisfied
+                || (maskedFlagBits == 0 && !everyTermSynthetic && positionTotal == 0);
+    }
+
+    /** Returns true if any term that has position data for the document
+     * is also one of the query's priority terms. */
+    private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
+        final var everyTerm = searchTerms.termIdsAll;
+        final var priorityTerms = searchTerms.termIdsPrio;
+
+        for (int termIdx = 0; termIdx < everyTerm.size(); termIdx++) {
+            // No position data for this term, nothing to check
+            if (positions[termIdx] == null) {
+                continue;
+            }
+
+            if (priorityTerms.contains(everyTerm.at(termIdx))) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /** Tests whether the query graph satisfies the field requirement of the query strategy.
+     *
+     * AUTO, SENTENCE and TOPIC always pass; for any other strategy the per-term check
+     * is combined across the query graph with {@code booleanAggregate}.
+     *
+     * @param queryGraphScores word flags arranged by the compiled query
+     * @param queryStrategy    the strategy to test against
+     * */
+    private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
+                                                   QueryStrategy queryStrategy)
+    {
+        if (queryStrategy == QueryStrategy.AUTO ||
+                queryStrategy == QueryStrategy.SENTENCE ||
+                queryStrategy == QueryStrategy.TOPIC) {
+            return true;
+        }
+
+        // Test against the strategy that was passed in, rather than reading it from queryParams again
+        return booleanAggregate(queryGraphScores,
+                flags -> meetsQueryStrategyRequirements(flags, queryStrategy));
+    }
+
+    /** Tests a single term's word flags against the field required by the query strategy.
+     *
+     * Strategies that are not one of the REQUIRE_FIELD_* values handled below always pass.
+     * */
+    private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE)
+            return WordFlags.Site.isPresent(wordMeta);
+
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT)
+            return WordFlags.Subjects.isPresent(wordMeta);
+
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE)
+            return WordFlags.Title.isPresent(wordMeta);
+
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL)
+            return WordFlags.UrlPath.isPresent(wordMeta);
+
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN)
+            return WordFlags.UrlDomain.isPresent(wordMeta);
+
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK)
+            return WordFlags.ExternalLink.isPresent(wordMeta);
+
+        // No field requirement
+        return true;
+    }
+
+    /** Calculates the ranking value of a document; lower values are better.
+     *
+     * @param wordFlagsQuery      word flags arranged by the compiled query
+     * @param positionsCountQuery position counts arranged by the compiled query
+     * @param positionsQuery      position sequences arranged by the compiled query
+     * @param documentMetadata    encoded document metadata
+     * @param features            bitmask of HTML features
+     * @param length              document length; a negative value is replaced by 5000
+     * @param bestCoherence       added as-is to the document-level part of the score
+     * @param ctx                 ranking context, supplying the ranking parameters
+     * @return the normalized value, or {@code Double.MAX_VALUE} if the calculation produced NaN
+     * @throws IllegalStateException if the calculation produced NaN and assertions are enabled for this class
+     * */
+    public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
+                                             CompiledQueryInt positionsCountQuery,
+                                             CompiledQuery<GammaCodedSequence> positionsQuery,
+                                             long documentMetadata,
+                                             int features,
+                                             int length,
+                                             int bestCoherence,
+                                             ResultRankingContext ctx)
+    {
+        if (length < 0) {
+            length = 5000;
+        }
+
+        var rankingParams = ctx.params;
+
+        int rank = DocumentMetadata.decodeRank(documentMetadata);
+        int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
+        int quality = DocumentMetadata.decodeQuality(documentMetadata);
+        int size = DocumentMetadata.decodeSize(documentMetadata);
+        // The low 8 bits of the metadata are passed on as the document flags
+        int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
+        int topology = DocumentMetadata.decodeTopology(documentMetadata);
+        int year = DocumentMetadata.decodeYear(documentMetadata);
+
+        double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
+
+        final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
+        final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
+        final double topologyBonus = Math.log(1 + topology);
+        final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
+        final double temporalBias;
+
+        if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) {
+            temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight;
+        } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) {
+            temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight;
+        } else {
+            temporalBias = 0;
+        }
+
+        // Document-level bonuses (positive) and penalties (negative)
+        double overallPart = averageSentenceLengthPenalty
+                + documentLengthPenalty
+                + qualityPenalty
+                + rankingBonus
+                + topologyBonus
+                + temporalBias
+                + flagsPenalty
+                + bestCoherence;
+
+        double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
+        double tcfFirstPosition = 0.;
+
+        double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25GraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx));
+
+        // Renormalize to 0...15, where 0 is the best possible score;
+        // this is a historical artifact of the original ranking function.
+        // A positive overallPart counts towards the value, a negative one towards the penalty.
+        double ret = normalize(
+                tcfAvgDist + tcfFirstPosition
+                        + bM25
+                        + Math.max(0, overallPart),
+                -Math.min(0, overallPart));
+
+        if (Double.isNaN(ret)) { // This should never happen but if it does, we want to know about it
+            if (getClass().desiredAssertionStatus()) {
+                throw new IllegalStateException("NaN in result value calculation");
+            }
+
+            return Double.MAX_VALUE;
+        }
+        else {
+            return ret;
+        }
+    }
+
+    /** Calculates the (non-positive) quality penalty.
+     *
+     * With a size of 400 or more the base penalty is multiplied by 20; below that
+     * the base penalty applies, except that a quality under 5 is not penalized at all.
+     * */
+    private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
+        var basePenalty = -quality * rankingParams.qualityPenalty;
+
+        if (size >= 400) {
+            return basePenalty * 20;
+        }
+
+        return (quality < 5) ? 0 : basePenalty;
+    }
+
+    /** Calculates a penalty from the document's HTML feature flags.
+     *
+     * @param featureFlags bitmask of HtmlFeature bits; 0 short-circuits to no penalty
+     * @param docFlags     document flags, tested for the forum, wiki and docs generator flags
+     * @param size         size value of the document's site; above 400 the penalties are harsher
+     * @return zero or a negative number, the penalty truncated to an int and negated
+     * */
+    private int flagsPenalty(int featureFlags, long docFlags, int size) {
+
+        // Short-circuit for index-service, which does not have the feature flags
+        if (featureFlags == 0)
+            return 0;
+
+        double penalty = 0;
+
+        boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
+        boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
+        boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);
+
+        // Penalize large sites harder for these features, as they are a strong signal of a low quality site
+        double largeSiteFactor = 1.;
+
+        if (!isForum && !isWiki && !isDocs && size > 400) {
+            // Long urls-that-look-like-this tend to be poor search results
+            if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
+                penalty += 30.0;
+            else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
+                penalty += 30.;
+            else penalty += 5.;
+
+            largeSiteFactor = 2;
+        }
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
+            penalty += 7.5 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
+            penalty += 5.0 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (isForum || isWiki) {
+            // Forums and wikis get a discount of 2, floored at zero.  This must be max(), not min():
+            // min(0, penalty - 2) would erase the penalty entirely whenever it is 2 or more, and
+            // turn it into a bonus otherwise.
+            penalty = Math.max(0, penalty - 2);
+        }
+
+        return (int) -penalty;
+    }
+
+    /** Maps a value and a penalty onto a score where lower is better, calculated as
+     * {@code sqrt((501 + 10 * penalty) / (1 + value))}.
+     *
+     * The nominal range is 0...15 with 0 being the best possible score, but the result
+     * is not clamped: a value of 0 with no penalty already gives sqrt(501).
+     *
+     * @param value The value to normalize; negative values are treated as zero
+     * @param penalty Any negative component of the value, given as a positive magnitude
+     * */
+    public static double normalize(double value, double penalty) {
+        final double nonNegativeValue = (value < 0) ? 0 : value;
+
+        final double numerator = 1.0 + 500. + 10 * penalty;
+        final double denominator = 1.0 + nonNegativeValue;
+
+        return Math.sqrt(numerator / denominator);
+    }
+
+
+    /** Calculates the average of the minimum distances between the positions of each
+     * pair of terms that are both in the regular mask and both have position data.
+     *
+     * @param positions position sequences arranged by the compiled query; null entries are skipped
+     * @param ctx       ranking context supplying the regular mask
+     * @return the average minimum distance over all qualifying pairs, or 1000 if there are none
+     * */
+    public static double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
+        double sum = 0;
+        int cnt = 0;
+
+        for (int i = 0; i < positions.size(); i++) {
+
+            // Skip terms that are not in the regular mask
+            if (!ctx.regularMask.get(i))
+                continue;
+
+            var posi = positions.at(i);
+
+            // Skip terms that are not in the document
+            if (posi == null)
+                continue;
+
+            // Only look at later terms, so that each pair is counted once
+            for (int j = i + 1; j < positions.size(); j++) {
+
+                // Skip terms that are not in the regular mask
+                if (!ctx.regularMask.get(j))
+                    continue;
+
+                var posj = positions.at(j);
+
+                // Skip terms that are not in the document
+                if (posj == null)
+                    continue;
+
+                int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator());
+                sum += distance;
+                cnt++;
+            }
+        }
+
+        if (cnt > 0) {
+            return sum / cnt;
+        } else {
+            // No pairs to compare, fall back to a large distance
+            return 1000.;
+        }
+    }
+
+}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
deleted file mode 100644
index 2facf59f..00000000
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
+++ /dev/null
@@ -1,165 +0,0 @@
-package nu.marginalia.index.results;
-
-import nu.marginalia.api.searchquery.model.compiled.*;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.api.searchquery.model.results.SearchResultItem;
-import nu.marginalia.index.index.CombinedIndexReader;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.model.SearchParameters;
-import nu.marginalia.index.model.QueryParams;
-import nu.marginalia.index.results.model.QuerySearchTerms;
-import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.index.query.limit.QueryStrategy;
-import nu.marginalia.ranking.results.ResultValuator;
-import nu.marginalia.sequence.GammaCodedSequence;
-
-import javax.annotation.Nullable;
-
-import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
-
-/** This class is responsible for calculating the score of a search result.
- * It holds the data required to perform the scoring, as there is strong
- * reasons to cache this data, and performs the calculations */
-public class IndexResultValuationContext {
- private final CombinedIndexReader index;
- private final QueryParams queryParams;
-
- private final ResultRankingContext rankingContext;
- private final ResultValuator searchResultValuator;
- private final CompiledQuery compiledQuery;
-
- public IndexResultValuationContext(ResultValuator searchResultValuator,
- StatefulIndex statefulIndex,
- ResultRankingContext rankingContext,
- SearchParameters params)
- {
- this.index = statefulIndex.get();
- this.rankingContext = rankingContext;
- this.searchResultValuator = searchResultValuator;
-
- this.queryParams = params.queryParams;
- this.compiledQuery = params.compiledQuery;
- }
-
- private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
-
- @Nullable
- public SearchResultItem calculatePreliminaryScore(long combinedId,
- QuerySearchTerms searchTerms,
- long[] wordFlags,
- GammaCodedSequence[] positions)
- {
- if (!searchTerms.coherences.testMandatory(positions))
- return null;
-
- CompiledQuery positionsQuery = compiledQuery.root.newQuery(positions);
- CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
- int[] counts = new int[compiledQuery.size()];
- for (int i = 0; i < counts.length; i++) {
- if (positions[i] != null) {
- counts[i] = positions[i].valueCount();
- }
- }
- CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
-
- // If the document is not relevant to the query, abort early to reduce allocations and
- // avoid unnecessary calculations
- if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
- return null;
- }
-
- long docId = UrlIdCodec.removeRank(combinedId);
- long docMetadata = index.getDocumentMetadata(docId);
- int htmlFeatures = index.getHtmlFeatures(docId);
- int docSize = index.getDocumentSize(docId);
-
- int bestCoherence = searchTerms.coherences.testOptional(positions);
-
- double score = searchResultValuator.calculateSearchResultValue(
- wordFlagsQuery,
- positionsCountQuery,
- positionsQuery,
- docMetadata,
- htmlFeatures,
- docSize,
- bestCoherence,
- rankingContext, null);
-
- SearchResultItem searchResult = new SearchResultItem(docId,
- docMetadata,
- htmlFeatures);
-
- if (hasPrioTerm(searchTerms, positions)) {
- score = 0.75 * score;
- }
-
- searchResult.setScore(score);
-
- return searchResult;
- }
-
- private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
- boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
- int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
- int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
-
- if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
- return true;
- }
- if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
- return true;
- }
-
- return false;
- }
-
- private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
- var allTerms = searchTerms.termIdsAll;
- var prioTerms = searchTerms.termIdsPrio;
-
- for (int i = 0; i < allTerms.size(); i++) {
- if (positions[i] != null && prioTerms.contains(allTerms.at(i))) {
- return true;
- }
- }
-
- return false;
- }
-
- private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
- QueryStrategy queryStrategy)
- {
- if (queryStrategy == QueryStrategy.AUTO ||
- queryStrategy == QueryStrategy.SENTENCE ||
- queryStrategy == QueryStrategy.TOPIC) {
- return true;
- }
-
- return booleanAggregate(queryGraphScores,
- docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
- }
-
- private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
- if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
- return WordFlags.Site.isPresent(wordMeta);
- }
- else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
- return WordFlags.Subjects.isPresent(wordMeta);
- }
- else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
- return WordFlags.Title.isPresent(wordMeta);
- }
- else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
- return WordFlags.UrlPath.isPresent(wordMeta);
- }
- else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
- return WordFlags.UrlDomain.isPresent(wordMeta);
- }
- else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
- return WordFlags.ExternalLink.isPresent(wordMeta);
- }
- return true;
- }
-
-}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java
deleted file mode 100644
index fbe99cb1..00000000
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java
+++ /dev/null
@@ -1,210 +0,0 @@
-package nu.marginalia.index.results;
-
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
-import gnu.trove.list.TLongList;
-import gnu.trove.list.array.TLongArrayList;
-import it.unimi.dsi.fastutil.longs.LongSet;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
-import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
-import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.api.searchquery.model.results.SearchResultItem;
-import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.model.SearchParameters;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.linkdb.docs.DocumentDbReader;
-import nu.marginalia.linkdb.model.DocdbUrlDetail;
-import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.ranking.results.ResultValuator;
-import nu.marginalia.sequence.GammaCodedSequence;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.lang.foreign.Arena;
-import java.sql.SQLException;
-import java.util.*;
-
-@Singleton
-public class IndexResultValuatorService {
- private static final Logger logger = LoggerFactory.getLogger(IndexResultValuatorService.class);
-
- private final IndexMetadataService metadataService;
- private final DocumentDbReader documentDbReader;
- private final ResultValuator resultValuator;
- private final StatefulIndex statefulIndex;
-
- @Inject
- public IndexResultValuatorService(IndexMetadataService metadataService,
- DocumentDbReader documentDbReader,
- ResultValuator resultValuator,
- StatefulIndex statefulIndex)
- {
- this.metadataService = metadataService;
- this.documentDbReader = documentDbReader;
- this.resultValuator = resultValuator;
- this.statefulIndex = statefulIndex;
- }
-
- public List rankResults(SearchParameters params,
- ResultRankingContext rankingContext,
- CombinedDocIdList resultIds)
- {
- IndexResultValuationContext evaluator =
- new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params);
-
- List results = new ArrayList<>(resultIds.size());
-
- try (var arena = Arena.ofConfined()) {
- // Batch-fetch the word metadata for the documents
-
- var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
- var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll);
-
- // Prepare data for the document. We do this outside of the calculation function to avoid
- // hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there;
- // out here we can rely on implicit array ordering to match up the data.
-
- var ra = resultIds.array();
- long[] flags = new long[searchTerms.termIdsAll.size()];
- GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()];
-
- for (int i = 0; i < ra.length; i++) {
- long id = ra[i];
-
- // Prepare term-level data for the document
- for (int ti = 0; ti < flags.length; ti++) {
- long tid = searchTerms.termIdsAll.at(ti);
- var tfd = termsForDocs.get(tid);
-
- assert tfd != null : "No term data for term " + ti;
-
- flags[ti] = tfd.flag(i);
- positions[ti] = tfd.position(i);
- }
-
- // Calculate the preliminary score
-
- var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions);
- if (score != null) {
- results.add(score);
- }
- }
-
- return results;
- }
- }
-
-
- public List selectBestResults(SearchParameters params,
- Collection results) throws SQLException {
-
- var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
-
- List resultsList = new ArrayList<>(results.size());
-
- for (var item : results) {
- if (domainCountFilter.test(item)) {
- // It's important that this filter runs across all results, not just the top N
- if (resultsList.size() < params.limitTotal) {
- resultsList.add(item);
- }
- }
- }
-
- for (var item : resultsList) {
- item.resultsFromDomain = domainCountFilter.getCount(item);
- }
-
- return decorateResults(resultsList, params.compiledQuery);
- }
-
- /** Decorate the result items with additional information from the link database
- * and calculate an updated ranking with the additional information */
- public List decorateResults(List rawResults,
- CompiledQuery compiledQuery)
- throws SQLException
- {
- TLongList idsList = new TLongArrayList(rawResults.size());
-
- for (var result : rawResults)
- idsList.add(result.getDocumentId());
-
- Map urlDetailsById = new HashMap<>(rawResults.size());
-
- for (var item : documentDbReader.getUrlDetails(idsList))
- urlDetailsById.put(item.urlId(), item);
-
- List resultItems = new ArrayList<>(rawResults.size());
- for (var result : rawResults) {
- var id = result.getDocumentId();
- var docData = urlDetailsById.get(id);
-
- if (docData == null) {
- logger.warn("No document data for id {}", id);
- continue;
- }
-
- resultItems.add(createCombinedItem(
- result,
- docData));
- }
- return resultItems;
- }
-
- private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
- DocdbUrlDetail docData) {
-
- ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor();
- // Consumer detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
-
- return new DecoratedSearchResultItem(
- result,
- docData.url(),
- docData.title(),
- docData.description(),
- docData.urlQuality(),
- docData.format(),
- docData.features(),
- docData.pubYear(),
- docData.dataHash(),
- docData.wordsTotal(),
- 0L, //bestPositions(wordMetas),
- result.getScore(),
- detailsExtractor.get()
- );
- }
-
- private static class ResultRankingDetailsExtractor {
- private ResultRankingDetails value = null;
-
- public ResultRankingDetails get() {
- return value;
- }
- public void set(ResultRankingDetails value) {
- this.value = value;
- }
- }
-
- private long bestPositions(CompiledQueryLong wordMetas) {
- LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions);
-
- int bestPc = 0;
- long bestPositions = 0;
-
- var li = positionsSet.longIterator();
-
- while (li.hasNext()) {
- long pos = li.nextLong();
- int pc = Long.bitCount(pos);
- if (pc > bestPc) {
- bestPc = pc;
- bestPositions = pos;
- }
- }
-
- return bestPositions;
- }
-}
diff --git a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java
index 7845f14f..43f5c575 100644
--- a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java
+++ b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java
@@ -32,6 +32,7 @@ public final class CombinedDocIdList {
public int size() {
return data.length;
}
+ /** Returns the element at index {@code i} of the backing array; no bounds check beyond the array access itself. */ public long at(int i) { return data[i]; }
public LongStream stream() {
return Arrays.stream(data);
diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
deleted file mode 100644
index 6ab72eef..00000000
--- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java
+++ /dev/null
@@ -1,209 +0,0 @@
-package nu.marginalia.ranking.results;
-
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
-import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
-import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
-import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs;
-import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs;
-import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.crawl.PubDate;
-import nu.marginalia.model.idx.DocumentFlags;
-import nu.marginalia.model.idx.DocumentMetadata;
-import nu.marginalia.ranking.results.factors.*;
-
-import com.google.inject.Inject;
-import com.google.inject.Singleton;
-import nu.marginalia.sequence.GammaCodedSequence;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.annotation.Nullable;
-import java.util.function.Consumer;
-
-@Singleton
-public class ResultValuator {
- final static double scalingFactor = 500.;
-
- private final TermCoherenceFactor termCoherenceFactor;
-
- private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class);
-
- @Inject
- public ResultValuator(TermCoherenceFactor termCoherenceFactor) {
- this.termCoherenceFactor = termCoherenceFactor;
- }
-
- public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
- CompiledQueryInt positionsCountQuery, CompiledQuery positionsQuery, long documentMetadata,
- int features,
- int length,
- int bestCoherence,
- ResultRankingContext ctx,
- @Nullable Consumer detailsConsumer
- )
- {
- if (wordFlagsQuery.isEmpty())
- return Double.MAX_VALUE;
-
- if (length < 0) {
- length = 5000;
- }
-
- var rankingParams = ctx.params;
-
- int rank = DocumentMetadata.decodeRank(documentMetadata);
- int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata);
- int quality = DocumentMetadata.decodeQuality(documentMetadata);
- int size = DocumentMetadata.decodeSize(documentMetadata);
- int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size);
- int topology = DocumentMetadata.decodeTopology(documentMetadata);
- int year = DocumentMetadata.decodeYear(documentMetadata);
-
- double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty);
-
- final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams);
- final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus;
- final double topologyBonus = Math.log(1 + topology);
- final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty;
- final double temporalBias;
-
- if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) {
- temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight;
- } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) {
- temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight;
- } else {
- temporalBias = 0;
- }
-
- double overallPart = averageSentenceLengthPenalty
- + documentLengthPenalty
- + qualityPenalty
- + rankingBonus
- + topologyBonus
- + temporalBias
- + flagsPenalty
- + bestCoherence;
-
- // FIXME: need a weighting factor here
- double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx));
- double tcfFirstPosition = 0.;
-
- double bM25 = rankingParams.bm25Weight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.bm25Params, positionsCountQuery.data, length, ctx));
-
- double overallPartPositive = Math.max(0, overallPart);
- double overallPartNegative = -Math.min(0, overallPart);
-
- if (null != detailsConsumer) {
- var details = new ResultRankingDetails(
- new ResultRankingInputs(
- rank,
- asl,
- quality,
- size,
- topology,
- year,
- DocumentFlags.decode(documentMetadata).stream().map(Enum::name).toList()
- ),
- new ResultRankingOutputs(
- averageSentenceLengthPenalty,
- qualityPenalty,
- rankingBonus,
- topologyBonus,
- documentLengthPenalty,
- temporalBias,
- flagsPenalty,
- overallPart,
- bM25,
- tcfAvgDist,
- tcfFirstPosition)
- );
-
- detailsConsumer.accept(details);
- }
-
- // Renormalize to 0...15, where 0 is the best possible score;
- // this is a historical artifact of the original ranking function
- double ret = normalize(
- tcfAvgDist + tcfFirstPosition
- + bM25
- + overallPartPositive,
- overallPartNegative);
-
- if (Double.isNaN(ret)) {
- if (getClass().desiredAssertionStatus()) {
- throw new IllegalStateException("NaN in result value calculation");
- }
-
- return Double.MAX_VALUE;
- }
- else {
- return ret;
- }
- }
-
- private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) {
- if (size < 400) {
- if (quality < 5)
- return 0;
- return -quality * rankingParams.qualityPenalty;
- }
- else {
- return -quality * rankingParams.qualityPenalty * 20;
- }
- }
-
- private int flagsPenalty(int featureFlags, long docFlags, int size) {
-
- // Short-circuit for index-service, which does not have the feature flags
- if (featureFlags == 0)
- return 0;
-
- double penalty = 0;
-
- boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags);
- boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags);
- boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags);
-
- // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site
- double largeSiteFactor = 1.;
-
- if (!isForum && !isWiki && !isDocs && size > 400) {
- // Long urls-that-look-like-this tend to be poor search results
- if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit()))
- penalty += 30.0;
- else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit()))
- penalty += 30.;
- else penalty += 5.;
-
- largeSiteFactor = 2;
- }
-
- if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit()))
- penalty += 7.5 * largeSiteFactor;
-
- if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit()))
- penalty += 5.0 * largeSiteFactor;
-
- if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
- penalty += 2.5 * largeSiteFactor;
-
- if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
- penalty += 2.5 * largeSiteFactor;
-
- if (isForum || isWiki) {
- penalty = Math.min(0, penalty - 2);
- }
-
- return (int) -penalty;
- }
-
- public static double normalize(double value, double penalty) {
- if (value < 0)
- value = 0;
-
- return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value));
- }
-}
diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java
deleted file mode 100644
index 1fb26f6b..00000000
--- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java
+++ /dev/null
@@ -1,127 +0,0 @@
-package nu.marginalia.ranking.results.factors;
-
-import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
-import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
-import nu.marginalia.api.searchquery.model.compiled.CqExpression;
-import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.model.idx.WordMetadata;
-
-import java.util.List;
-
-public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor {
- private static final long AVG_LENGTH = 5000;
-
- private final CqDataLong wordMetaData;
- private final CqDataInt frequencies;
- private final Bm25Parameters bm25Parameters;
-
- private final int docCount;
-
- public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters,
- CqDataLong wordMetaData,
- ResultRankingContext ctx) {
- this.bm25Parameters = bm25Parameters;
- this.docCount = ctx.termFreqDocCount();
- this.wordMetaData = wordMetaData;
- this.frequencies = ctx.fullCounts;
- }
-
- @Override
- public double onAnd(List extends CqExpression> parts) {
- double value = 0;
- for (var part : parts) {
- value += part.visit(this);
- }
- return value;
- }
-
- @Override
- public double onOr(List extends CqExpression> parts) {
- double value = 0;
- for (var part : parts) {
- value = Math.max(value, part.visit(this));
- }
- return value;
- }
-
- @Override
- public double onLeaf(int idx) {
- double count = evaluatePriorityScore(wordMetaData.get(idx));
-
- int freq = frequencies.get(idx);
-
- // note we override b to zero for priority terms as they are independent of document length
- return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0);
- }
-
- private static double evaluatePriorityScore(long wordMeta) {
- int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta));
-
- double qcount = 0.;
-
- if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) {
-
- qcount += 2.5;
-
- if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
- qcount += 2.5;
- else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
- qcount += 1.5;
-
- if ((wordMeta & WordFlags.Site.asBit()) != 0)
- qcount += 1.25;
- if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
- qcount += 1.25;
- }
- else {
- if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0)
- qcount += 3;
- else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0)
- qcount += 1;
-
- if ((wordMeta & WordFlags.Site.asBit()) != 0)
- qcount += 0.5;
- if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0)
- qcount += 0.5;
- }
-
- if ((wordMeta & WordFlags.Title.asBit()) != 0)
- qcount += 1.5;
-
- if (pcount > 2) {
- if ((wordMeta & WordFlags.Subjects.asBit()) != 0)
- qcount += 1.25;
- if ((wordMeta & WordFlags.NamesWords.asBit()) != 0)
- qcount += 0.25;
- if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0)
- qcount += 0.5;
- }
-
- return qcount;
- }
-
-
- /**
- *
- * @param docCount Number of documents
- * @param freq Number of matching documents
- */
- private double invFreq(int docCount, int freq) {
- return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
- }
-
- /**
- *
- * @param k determines the size of the impact of a single term
- * @param b determines the magnitude of the length normalization
- * @param count number of occurrences in the document
- * @param length document length
- */
- private double f(double k, double b, double count, int length) {
- final double lengthRatio = (double) length / AVG_LENGTH;
-
- return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
- }
-}
diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java
deleted file mode 100644
index 2ebef7cd..00000000
--- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java
+++ /dev/null
@@ -1,53 +0,0 @@
-package nu.marginalia.ranking.results.factors;
-
-import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.sequence.GammaCodedSequence;
-import nu.marginalia.sequence.SequenceOperations;
-
-/** Rewards documents where terms appear frequently within the same sentences
- */
-public class TermCoherenceFactor {
-
- public double calculateAvgMinDistance(CompiledQuery positions, ResultRankingContext ctx) {
- double sum = 0;
- int cnt = 0;
-
- for (int i = 0; i < positions.size(); i++) {
-
- // Skip terms that are not in the regular mask
- if (!ctx.regularMask.get(i))
- continue;
-
- var posi = positions.at(i);
-
- // Skip terms that are not in the document
- if (posi == null)
- continue;
-
- for (int j = i + 1; j < positions.size(); j++) {
-
- // Skip terms that are not in the regular mask
- if (!ctx.regularMask.get(j))
- continue;
-
- var posj = positions.at(j);
-
- // Skip terms that are not in the document
- if (posj == null)
- continue;
-
- int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator());
- sum += distance;
- cnt++;
- }
- }
-
- if (cnt > 0) {
- return sum / cnt;
- } else {
- return 1000.;
- }
- }
-
-}
\ No newline at end of file
diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java
index f4740e31..4966e5f0 100644
--- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java
+++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java
@@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
}
SearchResultItem forId(int domain, int ordinal) {
- return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, false, Double.NaN);
+ return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(),false, Double.NaN);
}
}
\ No newline at end of file
diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
index 901174f4..c7214060 100644
--- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
+++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java
@@ -87,7 +87,7 @@ public class SearchQueryIndexService {
detail.features,
DomainIndexingState.ACTIVE,
detail.rankingScore, // termScore
- detail.resultsFromDomain(),
+ detail.resultsFromDomain,
getPositionsString(detail),
Long.bitCount(detail.bestPositions),
detail.rawIndexResult,
diff --git a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java
index be3fe0b7..76fb62fc 100644
--- a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java
+++ b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java
@@ -103,6 +103,7 @@ public class SearchServicePaperDoll extends AbstractModule {
400,
positions,
score,
+ 4,
null)
);
}