(index) Clean up data model

The change set cleans up the data model for the term-level data.  This used to contain a bunch of fields with document-level metadata.  This data-duplication means a larger memory footprint and worse memory locality.

The ranking code is also modified to not accept SearchResultKeywordScores, but rather CompiledQueryLong and CqDataInts containing only the term metadata and the frequency information needed for ranking.  This is again an effort to improve memory locality.
This commit is contained in:
Viktor Lofgren 2024-04-15 16:04:07 +02:00
parent 52f0c0d336
commit b6d365bacd
31 changed files with 520 additions and 285 deletions

View File

@ -50,6 +50,10 @@ public enum WordFlags {
return (asBit() & value) > 0;
}
/** Tests whether this flag is missing from an encoded bitmask.
 *
 * @param value the encoded flags value to inspect
 * @return true if none of the bits of {@code asBit()} are set in {@code value}
 */
public boolean isAbsent(long value) {
    return (value & asBit()) == 0;
}
public static EnumSet<WordFlags> decode(long encodedValue) {
EnumSet<WordFlags> ret = EnumSet.noneOf(WordFlags.class);
@ -61,4 +65,5 @@ public enum WordFlags {
return ret;
}
}

View File

@ -134,6 +134,8 @@ public class QueryProtobufCodec {
return new SearchResultItem(
rawItem.getCombinedId(),
rawItem.getEncodedDocMetadata(),
rawItem.getHtmlFeatures(),
keywordScores,
rawItem.getResultsFromDomain(),
Double.NaN // Not set
@ -144,9 +146,7 @@ public class QueryProtobufCodec {
return new SearchResultKeywordScore(
keywordScores.getKeyword(),
-1, // termId is internal to index service
keywordScores.getEncodedWordMetadata(),
keywordScores.getEncodedDocMetadata(),
keywordScores.getHtmlFeatures()
keywordScores.getEncodedWordMetadata()
);
}

View File

@ -46,6 +46,10 @@ public class CompiledQuery<T> implements Iterable<T> {
return new CompiledQueryLong(root, data.mapToLong(mapper));
}
/** Applies an int-valued mapper to every data element of this query.
 * <p>
 * Note that the result is a {@link CompiledQueryLong} sharing this query's
 * expression root; it is built from whatever {@code data.mapToInt(mapper)} returns.
 *
 * @param mapper function producing an int for each element
 * @return a CompiledQueryLong over the same root with the mapped data
 */
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) {
    var mappedData = data.mapToInt(mapper);
    return new CompiledQueryLong(root, mappedData);
}
public CqExpression root() {
return root;
}

View File

@ -0,0 +1,44 @@
package nu.marginalia.api.searchquery.model.compiled;
import java.util.stream.IntStream;
/** A compiled index service query whose per-term data is a set of int values.
 * <p>
 * Pairs a {@link CqExpression} root with a {@link CqDataInt}; all accessors
 * below delegate to the data object.
 */
public class CompiledQueryInt {
    private final CqExpression root;
    private final CqDataInt data;

    /** Creates a query from an expression root and its associated int data.
     * Neither argument is copied or validated here. */
    public CompiledQueryInt(CqExpression root, CqDataInt data) {
        this.root = root;
        this.data = data;
    }

    /** @return the root expression of the compiled query */
    public CqExpression root() {
        return root;
    }

    /** @return a stream over the data values */
    public IntStream stream() {
        return data.stream();
    }

    /** @return a stream of the valid data indices, from 0 (inclusive) to the data size (exclusive) */
    public IntStream indices() {
        final int end = data.size();
        return IntStream.range(0, end);
    }

    /** Returns the data value at the given index.
     * <p>
     * The stored value is an int and is widened to long on return.
     * NOTE(review): the long return type looks unintentional given the int backing data — confirm before changing.
     */
    public long at(int index) {
        final int value = data.get(index);
        return value;
    }

    /** @return the result of {@code CqDataInt.copyData()} for the backing data */
    public int[] copyData() {
        return data.copyData();
    }

    /** @return true if the data contains no values */
    public boolean isEmpty() {
        return data.size() == 0;
    }

    /** @return the number of data values */
    public int size() {
        return data.size();
    }
}

View File

@ -9,8 +9,8 @@ import java.util.stream.LongStream;
/** A compiled index service query */
public class CompiledQueryLong implements Iterable<Long> {
private final CqExpression root;
private final CqDataLong data;
public final CqExpression root;
public final CqDataLong data;
public CompiledQueryLong(CqExpression root, CqDataLong data) {
this.root = root;
@ -47,4 +47,8 @@ public class CompiledQueryLong implements Iterable<Long> {
public boolean isEmpty() {
return data.size() == 0;
}
public int size() {
return data.size();
}
}

View File

@ -3,7 +3,7 @@ package nu.marginalia.api.searchquery.model.compiled;
import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.function.Function;
import java.util.function.ToDoubleFunction;
import java.util.function.ToIntFunction;
import java.util.function.ToLongFunction;
import java.util.stream.Stream;
@ -33,6 +33,15 @@ public class CqData<T> {
return new CqDataLong(newData);
}
/** Applies an int-valued mapper to every element and collects the results.
 * <p>
 * The mapped ints are widened and stored in a long array, so the result
 * is a {@link CqDataLong} of the same length as this data set.
 *
 * @param mapper function producing an int for each element
 * @return a CqDataLong holding the mapped values, in the same order
 */
public CqDataLong mapToInt(ToIntFunction<T> mapper) {
    long[] mapped = new long[data.length];
    // Fills indices in ascending order, one mapper call per element
    Arrays.setAll(mapped, idx -> mapper.applyAsInt(data[idx]));
    return new CqDataLong(mapped);
}
public T get(int i) {
return data[i];
}

View File

@ -0,0 +1,31 @@
package nu.marginalia.api.searchquery.model.compiled;
import java.util.Arrays;
import java.util.stream.IntStream;
/** Int-valued data set associated with a compiled query.
 * <p>
 * Wraps an int array; the array passed to the constructor is stored as-is
 * (not copied), so later changes to it by the caller are visible here.
 */
public class CqDataInt {
    private final int[] data;

    /** Wraps the provided array without copying it. */
    public CqDataInt(int[] data) {
        this.data = data;
    }

    /** @return the value at position {@code i} */
    public int get(int i) {
        return data[i];
    }

    /** @return the value at the position given by the word's {@code idx()} */
    public int get(CqExpression.Word w) {
        final int position = w.idx();
        return data[position];
    }

    /** @return a stream over the values, in array order */
    public IntStream stream() {
        return IntStream.of(data);
    }

    /** @return the number of values */
    public int size() {
        return data.length;
    }

    /** @return a fresh copy of the backing array; modifying it does not affect this object */
    public int[] copyData() {
        return data.clone();
    }
}

View File

@ -17,6 +17,9 @@ public class CompiledQueryAggregates {
static public <T> boolean booleanAggregate(CompiledQuery<T> query, Predicate<T> predicate) {
return query.root.visit(new CqBooleanAggregate(query, predicate));
}
static public boolean booleanAggregate(CompiledQueryLong query, LongPredicate predicate) {
return query.root.visit(new CqBooleanAggregate(query, predicate));
}
/** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR,
@ -25,13 +28,20 @@ public class CompiledQueryAggregates {
public static <T> long longBitmaskAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
return query.root.visit(new CqLongBitmaskOperator(query, operator));
}
public static long longBitmaskAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
return query.root.visit(new CqLongBitmaskOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, and then return the highest sum of values possible
* through each branch in the compiled query.
*
@ -49,4 +59,9 @@ public class CompiledQueryAggregates {
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator.
 * <p>
 * Overload for {@link CompiledQueryLong}: the operator is applied to the long stored for each
 * leaf, and the query's root expression is visited with a {@link CqPositionsOperator}.
 *
 * @param query    the compiled query whose long data is aggregated
 * @param operator maps each leaf's long value to the value to be combined
 * @return the set of values produced by the visitor
 */
public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
    // No type parameter: nothing in this signature is generic, so the former <T> was unused.
    return query.root().visit(new CqPositionsOperator(query, operator));
}
}

View File

@ -1,10 +1,12 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntPredicate;
import java.util.function.LongPredicate;
import java.util.function.Predicate;
public class CqBooleanAggregate implements CqExpression.BoolVisitor {
@ -15,6 +17,10 @@ public class CqBooleanAggregate implements CqExpression.BoolVisitor {
this.predicate = idx -> objPred.test(query.at(idx));
}
public CqBooleanAggregate(CompiledQueryLong query, LongPredicate longPredicate) {
this.predicate = idx -> longPredicate.test(query.at(idx));
}
@Override
public boolean onAnd(List<? extends CqExpression> parts) {
for (var part : parts) {

View File

@ -1,10 +1,12 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToDoubleFunction;
import java.util.function.LongToDoubleFunction;
import java.util.function.ToDoubleFunction;
public class CqDoubleSumOperator implements CqExpression.DoubleVisitor {
@ -15,6 +17,10 @@ public class CqDoubleSumOperator implements CqExpression.DoubleVisitor {
this.operator = idx -> operator.applyAsDouble(query.at(idx));
}
public CqDoubleSumOperator(IntToDoubleFunction operator) {
this.operator = operator;
}
@Override
public double onAnd(List<? extends CqExpression> parts) {
double value = 0;

View File

@ -1,10 +1,12 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntUnaryOperator;
import java.util.function.LongToIntFunction;
import java.util.function.ToIntFunction;
public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
@ -16,6 +18,10 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
@Override
public int onAnd(List<? extends CqExpression> parts) {
int value = parts.getFirst().visit(this);

View File

@ -1,10 +1,12 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.LongUnaryOperator;
import java.util.function.ToLongFunction;
public class CqLongBitmaskOperator implements CqExpression.LongVisitor {
@ -14,6 +16,9 @@ public class CqLongBitmaskOperator implements CqExpression.LongVisitor {
public <T> CqLongBitmaskOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
this.operator = idx-> operator.applyAsLong(query.at(idx));
}
public CqLongBitmaskOperator(CompiledQueryLong query, LongUnaryOperator operator) {
this.operator = idx-> operator.applyAsLong(query.at(idx));
}
@Override
public long onAnd(List<? extends CqExpression> parts) {

View File

@ -4,10 +4,12 @@ import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.LongUnaryOperator;
import java.util.function.ToLongFunction;
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
@ -17,6 +19,10 @@ public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet>
this.operator = idx -> operator.applyAsLong(query.at(idx));
}
public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
this.operator = idx -> operator.applyAsLong(query.at(idx));
}
@Override
public LongSet onAnd(List<? extends CqExpression> parts) {
LongSet ret = new LongArraySet();

View File

@ -1,38 +1,34 @@
package nu.marginalia.api.searchquery.model.results;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import lombok.ToString;
import java.util.Map;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
@ToString
public class ResultRankingContext {
private final int docCount;
public final ResultRankingParameters params;
private final Object2IntOpenHashMap<String> fullCounts = new Object2IntOpenHashMap<>(10, 0.5f);
private final Object2IntOpenHashMap<String> priorityCounts = new Object2IntOpenHashMap<>(10, 0.5f);
/** CqDataInt associated with frequency information of the terms in the query
* in the full index. The dataset is indexed by the compiled query. */
public final CqDataInt fullCounts;
/** CqDataInt associated with frequency information of the terms in the query
* in the priority index. The dataset is indexed by the compiled query. */
public final CqDataInt priorityCounts;
public ResultRankingContext(int docCount,
ResultRankingParameters params,
Map<String, Integer> fullCounts,
Map<String, Integer> prioCounts
) {
CqDataInt fullCounts,
CqDataInt prioCounts)
{
this.docCount = docCount;
this.params = params;
this.fullCounts.putAll(fullCounts);
this.priorityCounts.putAll(prioCounts);
this.fullCounts = fullCounts;
this.priorityCounts = prioCounts;
}
public int termFreqDocCount() {
return docCount;
}
public int frequency(String keyword) {
return fullCounts.getOrDefault(keyword, 1);
}
public int priorityFrequency(String keyword) {
return priorityCounts.getOrDefault(keyword, 1);
}
}

View File

@ -15,15 +15,24 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
* probably not what you want, use getDocumentId() instead */
public final long combinedId;
/** Encoded document metadata */
public final long encodedDocMetadata;
/** Encoded html features of document */
public final int htmlFeatures;
/** How did the subqueries match against the document ? */
public final List<SearchResultKeywordScore> keywordScores;
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public SearchResultItem(long combinedId) {
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) {
this.combinedId = combinedId;
this.encodedDocMetadata = encodedDocMetadata;
this.keywordScores = new ArrayList<>();
this.htmlFeatures = htmlFeatures;
}

View File

@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery.model.results;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.idx.DocumentMetadata;
import java.util.Objects;
@ -10,34 +9,20 @@ public final class SearchResultKeywordScore {
public final long termId;
public final String keyword;
private final long encodedWordMetadata;
private final long encodedDocMetadata;
private final int htmlFeatures;
public SearchResultKeywordScore(String keyword,
long termId,
long encodedWordMetadata,
long encodedDocMetadata,
int htmlFeatures) {
long encodedWordMetadata) {
this.termId = termId;
this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata;
this.encodedDocMetadata = encodedDocMetadata;
this.htmlFeatures = htmlFeatures;
}
public boolean hasTermFlag(WordFlags flag) {
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
}
public int positionCount() {
return Long.bitCount(positions());
}
@Deprecated // FIXME 2024-04-06
public int subquery() {
return -1;
}
public long positions() {
return WordMetadata.decodePositions(encodedWordMetadata);
}
@ -46,44 +31,28 @@ public final class SearchResultKeywordScore {
return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic);
}
public boolean isKeywordRegular() {
return !keyword.contains(":")
&& !hasTermFlag(WordFlags.Synthetic);
}
public long encodedWordMetadata() {
return encodedWordMetadata;
}
public long encodedDocMetadata() {
return encodedDocMetadata;
}
public int htmlFeatures() {
return htmlFeatures;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (SearchResultKeywordScore) obj;
return Objects.equals(this.keyword, that.keyword) &&
this.encodedWordMetadata == that.encodedWordMetadata &&
this.encodedDocMetadata == that.encodedDocMetadata;
return Objects.equals(this.termId, that.termId);
}
@Override
public int hashCode() {
return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata);
return Objects.hash(termId);
}
@Override
public String toString() {
return "SearchResultKeywordScore[" +
"keyword=" + keyword + ", " +
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']';
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']';
}
}

View File

@ -98,16 +98,16 @@ message RpcDecoratedResultItem {
message RpcRawResultItem {
int64 combinedId = 1; // raw ID with bit-encoded ranking information still present
int32 resultsFromDomain = 2; // number of other results from the same domain
repeated RpcResultKeywordScore keywordScores = 3;
int64 encodedDocMetadata = 3; // bit encoded document metadata
int32 htmlFeatures = 4; // bitmask encoding features of the document
repeated RpcResultKeywordScore keywordScores = 5;
}
/* Information about how well a keyword matches a query */
message RpcResultKeywordScore {
string keyword = 1; // the keyword
int64 encodedWordMetadata = 2; // bit encoded word metadata
int64 encodedDocMetadata = 3; // bit encoded document metadata
bool hasPriorityTerms = 4; // true if this word is important to the document
int32 htmlFeatures = 5; // bit encoded document features
bool hasPriorityTerms = 3; // true if this word is important to the document
}
/* Query execution parameters */

View File

@ -11,6 +11,7 @@ import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.*;
import nu.marginalia.array.buffer.LongQueryBuffer;
@ -135,14 +136,14 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
var rawItem = RpcRawResultItem.newBuilder();
rawItem.setCombinedId(rawResult.combinedId);
rawItem.setResultsFromDomain(rawResult.resultsFromDomain);
rawItem.setHtmlFeatures(rawResult.htmlFeatures);
rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata);
for (var score : rawResult.keywordScores) {
rawItem.addKeywordScores(
RpcResultKeywordScore.newBuilder()
.setEncodedDocMetadata(score.encodedDocMetadata())
.setEncodedWordMetadata(score.encodedWordMetadata())
.setKeyword(score.keyword)
.setHtmlFeatures(score.htmlFeatures())
);
}
@ -203,9 +204,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
return new SearchResultSet(List.of());
}
ResultRankingContext rankingContext = createRankingContext(params.rankingParams,
params.compiledQuery,
params.compiledQueryIds);
ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.compiledQueryIds);
var queryExecution = new QueryExecution(rankingContext, params.fetchSize);
@ -414,22 +413,22 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
}
private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams,
CompiledQuery<String> query,
CompiledQueryLong compiledQueryIds)
{
Map<String, Long> termToId = new HashMap<>(query.size());
query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id)));
final Map<String, Integer> termFrequencies = new HashMap<>(termToId.size());
final Map<String, Integer> prioFrequencies = new HashMap<>(termToId.size());
int[] full = new int[compiledQueryIds.size()];
int[] prio = new int[compiledQueryIds.size()];
termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id)));
termToId.forEach((key, id) -> prioFrequencies.put(key, index.getTermFrequencyPrio(id)));
for (int idx = 0; idx < compiledQueryIds.size(); idx++) {
long id = compiledQueryIds.at(idx);
full[idx] = index.getTermFrequency(id);
prio[idx] = index.getTermFrequencyPrio(id);
}
return new ResultRankingContext(index.getTotalDocCount(),
rankingParams,
termFrequencies,
prioFrequencies);
new CqDataInt(full),
new CqDataInt(prio));
}
}

View File

@ -1,7 +1,6 @@
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.*;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@ -70,39 +69,42 @@ public class IndexResultValuationContext {
long docMetadata = statefulIndex.getDocumentMetadata(docId);
int htmlFeatures = statefulIndex.getHtmlFeatures(docId);
SearchResultItem searchResult = new SearchResultItem(docId);
SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures);
long[] wordMetas = new long[compiledQuery.size()];
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
for (int i = 0; i < wordMetas.length; i++) {
final long termId = compiledQueryIds.at(i);
final String term = compiledQuery.at(i);
wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId);
scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]);
}
SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx ->
new SearchResultKeywordScore(
compiledQuery.at(idx),
compiledQueryIds.at(idx),
termMetadataForCombinedDocumentIds.getTermMetadata(
compiledQueryIds.at(idx), combinedId
),
docMetadata,
htmlFeatures)
)
.toArray(SearchResultKeywordScore[]::new);
// DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
// to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
// a very flimsy assumption.
searchResult.keywordScores.addAll(List.of(scores));
CompiledQuery<SearchResultKeywordScore> queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores);
CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic));
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask));
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount);
boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent);
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask));
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta)));
if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) {
if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) {
return null;
}
if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
return null;
double score = searchResultValuator.calculateSearchResultValue(queryGraphScores,
double score = searchResultValuator.calculateSearchResultValue(
wordMetasQuery,
docMetadata,
htmlFeatures,
5000, // use a dummy value here as it's not present in the index
rankingContext);
@ -111,7 +113,7 @@ public class IndexResultValuationContext {
return searchResult;
}
private boolean meetsQueryStrategyRequirements(CompiledQuery<SearchResultKeywordScore> queryGraphScores,
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
QueryStrategy queryStrategy)
{
if (queryStrategy == QueryStrategy.AUTO ||
@ -124,24 +126,24 @@ public class IndexResultValuationContext {
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
}
private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) {
private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) {
if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Site.asBit());
return WordFlags.Site.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Subjects.asBit());
return WordFlags.Subjects.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Title.asBit());
return WordFlags.Title.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) {
return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlPath.asBit());
return WordFlags.UrlPath.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) {
return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit());
return WordFlags.UrlDomain.isPresent(wordMeta);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) {
return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit());
return WordFlags.ExternalLink.isPresent(wordMeta);
}
return true;
}

View File

@ -6,16 +6,19 @@ import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultValuator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -126,22 +129,31 @@ public class IndexResultValuatorService {
continue;
}
// Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation
// Reconstruct the compiledquery for re-valuation
//
// CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same
// order as the data for the CompiledQuery<String>.
CompiledQuery<SearchResultKeywordScore> resultQuery =
new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new));
long[] wordMetas = new long[compiledQuery.size()];
for (int i = 0; i < compiledQuery.size(); i++) {
var score = result.keywordScores.get(i);
wordMetas[i] = score.encodedWordMetadata();
}
resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext));
CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
resultItems.add(createCombinedItem(
result,
docData,
metaQuery,
rankingContext));
}
return resultItems;
}
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
DocdbUrlDetail docData,
CompiledQuery<SearchResultKeywordScore> resultQuery,
CompiledQueryLong wordMetas,
ResultRankingContext rankingContext) {
return new DecoratedSearchResultItem(
result,
@ -154,13 +166,19 @@ public class IndexResultValuatorService {
docData.pubYear(),
docData.dataHash(),
docData.wordsTotal(),
bestPositions(resultQuery),
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
bestPositions(wordMetas),
resultValuator.calculateSearchResultValue(wordMetas,
result.encodedDocMetadata,
result.htmlFeatures,
docData.wordsTotal(),
rankingContext)
);
}
private long bestPositions(CompiledQuery<SearchResultKeywordScore> resultQuery) {
LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions);
private long bestPositions(CompiledQueryLong wordMetas) {
LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions);
int bestPc = 0;
long bestPositions = 0;

View File

@ -1,9 +1,8 @@
package nu.marginalia.ranking.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentFlags;
@ -15,36 +14,32 @@ import com.google.inject.Singleton;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
@Singleton
public class ResultValuator {
final static double scalingFactor = 500.;
private final Bm25Factor bm25Factor;
private final TermCoherenceFactor termCoherenceFactor;
private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class);
@Inject
public ResultValuator(Bm25Factor bm25Factor,
TermCoherenceFactor termCoherenceFactor) {
this.bm25Factor = bm25Factor;
public ResultValuator(TermCoherenceFactor termCoherenceFactor) {
this.termCoherenceFactor = termCoherenceFactor;
}
public double calculateSearchResultValue(CompiledQuery<SearchResultKeywordScore> scores,
public double calculateSearchResultValue(CompiledQueryLong wordMeta,
long documentMetadata,
int features,
int length,
ResultRankingContext ctx)
{
if (scores.size() == 0)
if (wordMeta.isEmpty())
return Double.MAX_VALUE;
if (length < 0)
length = 5000;
long documentMetadata = scores.at(0).encodedDocMetadata();
int features = scores.at(0).htmlFeatures();
if (length < 0) {
length = 5000;
}
var rankingParams = ctx.params;
int rank = DocumentMetadata.decodeRank(documentMetadata);
@ -79,9 +74,10 @@ public class ResultValuator {
+ temporalBias
+ flagsPenalty;
double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores);
double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx);
double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx);
double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta);
double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, wordMeta.data, length, ctx));
double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx));
double overallPartPositive = Math.max(0, overallPart);
double overallPartNegative = -Math.min(0, overallPart);

View File

@ -1,113 +0,0 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.WordFlags;
public class Bm25Factor {
    /** Reference document length that the actual length is divided by in {@link #f}. */
    private static final int AVG_LENGTH = 5000;

    /** Estimates <a href="https://en.wikipedia.org/wiki/Okapi_BM25">BM-25</a> for a document,
     * summed over the compiled query via {@link CompiledQueryAggregates#doubleSumAggregate}.
     * <p>
     * Each term contributes its inverse-frequency weight multiplied by the saturating
     * term weight derived from its position count and the document length.
     *
     * @param bm25Parameters supplies the k and b tuning parameters
     * @param scores         per-term keyword scores for the document
     * @param length         document length
     * @param ctx            ranking context providing document count and term frequencies
     *
     * @see Bm25Parameters
     */
    public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, int length, ResultRankingContext ctx) {
        final int docCount = ctx.termFreqDocCount();

        return CompiledQueryAggregates.doubleSumAggregate(scores, term -> {
            double occurrences = term.positionCount();
            int termFrequency = ctx.frequency(term.keyword);

            double idf = invFreq(docCount, termFrequency);
            double tf = f(bm25Parameters.k(), bm25Parameters.b(), occurrences, length);

            return idf * tf;
        });
    }

    /** Variant of the BM-25 calculation where the per-term count is not the number of
     * positions in the document, but a weighted tally of relevance signals for the term.
     *
     * @param bm25Parameters supplies the k tuning parameter (b is not used here)
     * @param scores         per-term keyword scores for the document
     * @param ctx            ranking context providing document count and priority frequencies
     */
    public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery<SearchResultKeywordScore> scores, ResultRankingContext ctx) {
        final int docCount = ctx.termFreqDocCount();

        return CompiledQueryAggregates.doubleSumAggregate(scores, term -> {
            double signalScore = evaluatePriorityScore(term);
            int termFrequency = ctx.priorityFrequency(term.keyword);

            // b is forced to zero (and length to zero) since priority terms
            // are independent of document length
            double idf = invFreq(docCount, termFrequency);
            double tf = f(bm25Parameters.k(), 0, signalScore, 0);

            return idf * tf;
        });
    }

    /** Tallies a weighted score from the word flags set for the keyword.
     * URL/site related flags are weighted differently depending on whether
     * the ExternalLink flag is also set; the Subjects, NamesWords and TfIdfHigh
     * flags only count when the keyword has more than two positions.
     */
    private static double evaluatePriorityScore(SearchResultKeywordScore keyword) {
        final int positions = keyword.positionCount();
        final long meta = keyword.encodedWordMetadata();

        final boolean external = hasFlag(meta, WordFlags.ExternalLink);

        double score = external ? 2.5 : 0.;

        // UrlDomain takes precedence over UrlPath; at most one of them is counted
        if (hasFlag(meta, WordFlags.UrlDomain)) {
            score += external ? 2.5 : 3;
        }
        else if (hasFlag(meta, WordFlags.UrlPath)) {
            score += external ? 1.5 : 1;
        }

        if (hasFlag(meta, WordFlags.Site)) {
            score += external ? 1.25 : 0.5;
        }
        if (hasFlag(meta, WordFlags.SiteAdjacent)) {
            score += external ? 1.25 : 0.5;
        }

        if (hasFlag(meta, WordFlags.Title)) {
            score += 1.5;
        }

        if (positions > 2) {
            if (hasFlag(meta, WordFlags.Subjects)) {
                score += 1.25;
            }
            if (hasFlag(meta, WordFlags.NamesWords)) {
                score += 0.25;
            }
            if (hasFlag(meta, WordFlags.TfIdfHigh)) {
                score += 0.5;
            }
        }

        return score;
    }

    /** Tests whether the bit for the given flag is set in the encoded word metadata. */
    private static boolean hasFlag(long encodedWordMetadata, WordFlags flag) {
        return (encodedWordMetadata & flag.asBit()) != 0;
    }

    /** Inverse document frequency weight.
     *
     * @param docCount Number of documents
     * @param freq     Number of matching documents
     */
    private double invFreq(int docCount, int freq) {
        final double nonMatching = docCount - freq + 0.5;
        final double matching = freq + 0.5;

        return Math.log(1.0 + nonMatching / matching);
    }

    /** Saturating term weight.
     *
     * @param k      determines the size of the impact of a single term
     * @param b      determines the magnitude of the length normalization
     * @param count  number of occurrences in the document
     * @param length document length
     */
    private double f(double k, double b, double count, int length) {
        final double lengthRatio = (double) length / AVG_LENGTH;

        final double numerator = count * (k + 1);
        final double denominator = count + k * (1 - b + b * lengthRatio);

        return numerator / denominator;
    }
}

View File

@ -0,0 +1,81 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.model.idx.WordMetadata;
import java.util.List;
/** Visitor that evaluates a BM-25 style score over a compiled query expression.
 * <p>
 * AND nodes add up the scores of their parts, OR nodes take the largest score
 * among their parts (never less than zero), and leaf nodes score a single term
 * from the number of position bits set in its word metadata and its frequency
 * in {@code ctx.fullCounts}.
 */
public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
    /** Reference document length that the actual length is divided by in {@link #f}. */
    private static final long AVG_LENGTH = 5000;

    private final CqDataLong wordMetaData;
    private final CqDataInt frequencies;
    private final Bm25Parameters bm25Parameters;

    private final int docCount;
    private final int length;

    /**
     * @param bm25Parameters supplies the k and b tuning parameters
     * @param wordMetaData   encoded word metadata, looked up by leaf index
     * @param length         document length
     * @param ctx            ranking context supplying the document count and term frequencies
     */
    public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
                                CqDataLong wordMetaData,
                                int length,
                                ResultRankingContext ctx) {
        this.bm25Parameters = bm25Parameters;
        this.wordMetaData = wordMetaData;
        this.length = length;

        this.frequencies = ctx.fullCounts;
        this.docCount = ctx.termFreqDocCount();
    }

    /** Sums the scores of all parts. */
    @Override
    public double onAnd(List<? extends CqExpression> parts) {
        double total = 0;

        for (CqExpression child : parts) {
            total += child.visit(this);
        }

        return total;
    }

    /** Picks the highest score among the parts; the result is never below zero. */
    @Override
    public double onOr(List<? extends CqExpression> parts) {
        double best = 0;

        for (CqExpression child : parts) {
            best = Math.max(best, child.visit(this));
        }

        return best;
    }

    /** Scores the term at the given index in the query data. */
    @Override
    public double onLeaf(int idx) {
        final long positions = WordMetadata.decodePositions(wordMetaData.get(idx));
        final double count = Long.bitCount(positions);
        final int freq = frequencies.get(idx);

        final double idf = invFreq(docCount, freq);
        final double tf = f(bm25Parameters.k(), bm25Parameters.b(), count, length);

        return idf * tf;
    }

    /** Inverse document frequency weight.
     *
     * @param docCount Number of documents
     * @param freq     Number of matching documents
     */
    private double invFreq(int docCount, int freq) {
        final double nonMatching = docCount - freq + 0.5;
        final double matching = freq + 0.5;

        return Math.log(1.0 + nonMatching / matching);
    }

    /** Saturating term weight.
     *
     * @param k      determines the size of the impact of a single term
     * @param b      determines the magnitude of the length normalization
     * @param count  number of occurrences in the document
     * @param length document length
     */
    private double f(double k, double b, double count, int length) {
        final double lengthRatio = (double) length / AVG_LENGTH;

        final double numerator = count * (k + 1);
        final double denominator = count + k * (1 - b + b * lengthRatio);

        return numerator / denominator;
    }
}

View File

@ -0,0 +1,127 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import java.util.List;
/** Visitor that evaluates a BM-25 style score over a compiled query expression,
 * where the per-term count is a weighted tally of the word flags set for the term
 * rather than the number of positions in the document.
 * <p>
 * AND nodes add up the scores of their parts, OR nodes take the largest score
 * among their parts (never less than zero).
 */
public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor {
    /** Reference document length used by {@link #f}; has no effect on the
     * result here since {@link #onLeaf} passes b = 0 and length = 0. */
    private static final long AVG_LENGTH = 5000;

    private final CqDataLong wordMetaData;
    private final CqDataInt frequencies;
    private final Bm25Parameters bm25Parameters;

    private final int docCount;

    /**
     * @param bm25Parameters supplies the k tuning parameter (b is not used)
     * @param wordMetaData   encoded word metadata, looked up by leaf index
     * @param ctx            ranking context supplying the document count and term frequencies
     */
    public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters,
                                CqDataLong wordMetaData,
                                ResultRankingContext ctx) {
        this.bm25Parameters = bm25Parameters;
        this.wordMetaData = wordMetaData;

        // NOTE(review): this reads the same fullCounts data as Bm25FullGraphVisitor.
        // If the ranking context carries separate priority-index frequencies,
        // those may have been intended here -- confirm against ResultRankingContext.
        this.frequencies = ctx.fullCounts;
        this.docCount = ctx.termFreqDocCount();
    }

    /** Sums the scores of all parts. */
    @Override
    public double onAnd(List<? extends CqExpression> parts) {
        double total = 0;

        for (CqExpression child : parts) {
            total += child.visit(this);
        }

        return total;
    }

    /** Picks the highest score among the parts; the result is never below zero. */
    @Override
    public double onOr(List<? extends CqExpression> parts) {
        double best = 0;

        for (CqExpression child : parts) {
            best = Math.max(best, child.visit(this));
        }

        return best;
    }

    /** Scores the term at the given index in the query data. */
    @Override
    public double onLeaf(int idx) {
        final double count = evaluatePriorityScore(wordMetaData.get(idx));
        final int freq = frequencies.get(idx);

        // b is forced to zero (and length to zero) since priority terms
        // are independent of document length
        final double idf = invFreq(docCount, freq);
        final double tf = f(bm25Parameters.k(), 0, count, 0);

        return idf * tf;
    }

    /** Tallies a weighted score from the word flags set in the encoded metadata.
     * URL/site related flags are weighted differently depending on whether
     * the ExternalLink flag is also set; the Subjects, NamesWords and TfIdfHigh
     * flags only count when more than two position bits are set.
     */
    private static double evaluatePriorityScore(long wordMeta) {
        final int positions = Long.bitCount(WordMetadata.decodePositions(wordMeta));
        final boolean external = hasFlag(wordMeta, WordFlags.ExternalLink);

        double score = external ? 2.5 : 0.;

        // UrlDomain takes precedence over UrlPath; at most one of them is counted
        if (hasFlag(wordMeta, WordFlags.UrlDomain)) {
            score += external ? 2.5 : 3;
        }
        else if (hasFlag(wordMeta, WordFlags.UrlPath)) {
            score += external ? 1.5 : 1;
        }

        if (hasFlag(wordMeta, WordFlags.Site)) {
            score += external ? 1.25 : 0.5;
        }
        if (hasFlag(wordMeta, WordFlags.SiteAdjacent)) {
            score += external ? 1.25 : 0.5;
        }

        if (hasFlag(wordMeta, WordFlags.Title)) {
            score += 1.5;
        }

        if (positions > 2) {
            if (hasFlag(wordMeta, WordFlags.Subjects)) {
                score += 1.25;
            }
            if (hasFlag(wordMeta, WordFlags.NamesWords)) {
                score += 0.25;
            }
            if (hasFlag(wordMeta, WordFlags.TfIdfHigh)) {
                score += 0.5;
            }
        }

        return score;
    }

    /** Tests whether the bit for the given flag is set in the encoded word metadata. */
    private static boolean hasFlag(long wordMeta, WordFlags flag) {
        return (wordMeta & flag.asBit()) != 0;
    }

    /** Inverse document frequency weight.
     *
     * @param docCount Number of documents
     * @param freq     Number of matching documents
     */
    private double invFreq(int docCount, int freq) {
        final double nonMatching = docCount - freq + 0.5;
        final double matching = freq + 0.5;

        return Math.log(1.0 + nonMatching / matching);
    }

    /** Saturating term weight.
     *
     * @param k      determines the size of the impact of a single term
     * @param b      determines the magnitude of the length normalization
     * @param count  number of occurrences in the document
     * @param length document length
     */
    private double f(double k, double b, double count, int length) {
        final double lengthRatio = (double) length / AVG_LENGTH;

        final double numerator = count * (k + 1);
        final double denominator = count + k * (1 - b + b * lengthRatio);

        return numerator / denominator;
    }
}

View File

@ -1,16 +1,16 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.WordMetadata;
/** Rewards documents where terms appear frequently within the same sentences
*/
public class TermCoherenceFactor {
public double calculate(CompiledQuery<SearchResultKeywordScore> scores) {
long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK);
public double calculate(CompiledQueryLong wordMetadataQuery) {
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
score -> score >>> WordMetadata.POSITIONS_SHIFT);
return bitsSetFactor(mask);
}

View File

@ -215,9 +215,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
Set<Integer> years = new HashSet<>();
for (var res : rsp.results) {
for (var score : res.rawIndexResult.getKeywordScores()) {
years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata()));
}
years.add(DocumentMetadata.decodeYear(res.rawIndexResult.encodedDocMetadata));
}
assertEquals(Set.of(1998), years);

View File

@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest {
}
SearchResultItem forId(int domain, int ordinal) {
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN);
return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN);
}
}

View File

@ -1,6 +1,8 @@
package nu.marginalia.ranking.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
@ -31,30 +33,27 @@ class ResultValuatorTest {
when(dict.docCount()).thenReturn(100_000);
valuator = new ResultValuator(
new Bm25Factor(),
new TermCoherenceFactor()
);
}
CompiledQuery<SearchResultKeywordScore> titleOnlyLowCountSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);
CompiledQuery<SearchResultKeywordScore> highCountNoTitleSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);
CompiledQuery<SearchResultKeywordScore> highCountSubjectSet = CompiledQuery.just(
CqDataInt frequencyData = new CqDataInt(new int[] { 10 });
CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)),
docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)),
0)
);
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)))
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);
CompiledQueryLong highCountNoTitleSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)))
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;
CompiledQueryLong highCountSubjectSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)))
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;
@Test
@ -63,12 +62,16 @@ class ResultValuatorTest {
when(dict.getTermFreq("bob")).thenReturn(10);
ResultRankingContext context = new ResultRankingContext(100000,
ResultRankingParameters.sensibleDefaults(),
Map.of("bob", 10), Collections.emptyMap());
frequencyData,
frequencyData);
double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context);
double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context);
double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, 10_000, context);
double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, 10_000, context);
long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class));
int features = 0;
double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context);
double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context);
double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context);
double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context);
System.out.println(titleOnlyLowCount);
System.out.println(titleLongOnlyLowCount);

View File

@ -18,14 +18,23 @@ class TermCoherenceFactorTest {
@Test
public void testAllBitsSet() {
var allPositionsSet = createSet(
WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK
~0L,
~0L
);
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
long mask = CompiledQueryAggregates.longBitmaskAggregate(
allPositionsSet,
SearchResultKeywordScore::positions
);
assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
assertEquals(1.0, termCoherenceFactor.calculate(allPositionsSet));
assertEquals(1.0,
termCoherenceFactor.calculate(
allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)
)
);
}
@Test
@ -38,7 +47,7 @@ class TermCoherenceFactorTest {
assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
assertEquals(0, termCoherenceFactor.calculate(allPositionsSet));
assertEquals(0, termCoherenceFactor.calculate(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)));
}
@Test @SuppressWarnings("unchecked")
@ -90,7 +99,7 @@ class TermCoherenceFactorTest {
for (int i = 0; i < positionMasks.length; i++) {
keywords.add(new SearchResultKeywordScore("", 0,
new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0));
new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode()));
}
return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));

View File

@ -15,7 +15,7 @@ class NgramLexiconTest {
}
void addNgram(String... ngram) {
lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram));
lexicon.incOrderedTitle(HasherGroup.ordered().rollingHash(ngram));
}
@Test

View File

@ -38,7 +38,7 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
for (var keywordScore : urlDetail.resultItem.keywordScores) {
if (keywordScore.isKeywordSpecial())
continue;
if (keywordScore.positionCount() == 0)
if (keywordScore.positions() == 0)
continue;
if (keywordScore.hasTermFlag(WordFlags.Title))