(index) Bugs and error fixes, chasing and fixing mystery results that did not contain all relevant keywords

This commit is contained in:
Viktor Lofgren 2024-08-09 16:38:21 +02:00
parent df89661ed2
commit 016a4c62e1
7 changed files with 67 additions and 56 deletions

View File

@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled;
import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
import java.util.function.*;
import java.util.function.Function;
import java.util.function.ToIntFunction;
import java.util.function.ToLongFunction;
import java.util.stream.IntStream;
import java.util.stream.Stream;
@ -46,8 +48,8 @@ public class CompiledQuery<T> implements Iterable<T> {
return new CompiledQueryLong(root, data.mapToLong(mapper));
}
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) {
return new CompiledQueryLong(root, data.mapToInt(mapper));
public CompiledQueryInt mapToInt(ToIntFunction<T> mapper) {
return new CompiledQueryInt(root, data.mapToInt(mapper));
}
public CqExpression root() {

View File

@ -33,13 +33,13 @@ public class CqData<T> {
return new CqDataLong(newData);
}
public CqDataLong mapToInt(ToIntFunction<T> mapper) {
long[] newData = new long[data.length];
public CqDataInt mapToInt(ToIntFunction<T> mapper) {
int[] newData = new int[data.length];
for (int i = 0; i < data.length; i++) {
newData[i] = mapper.applyAsInt((T) data[i]);
newData[i] = mapper.applyAsInt(data[i]);
}
return new CqDataLong(newData);
return new CqDataInt(newData);
}
public T get(int i) {

View File

@ -32,11 +32,13 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
public SearchResultItem(long combinedId,
long encodedDocMetadata,
int htmlFeatures) {
int htmlFeatures,
double score) {
this.combinedId = combinedId;
this.encodedDocMetadata = encodedDocMetadata;
this.keywordScores = new ArrayList<>();
this.htmlFeatures = htmlFeatures;
this.scoreValue = score;
}

View File

@ -1,10 +1,11 @@
package nu.marginalia.api.searchquery.model.compiled;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import org.junit.jupiter.api.Test;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class CompiledQueryParserTest {
@ -22,6 +23,21 @@ class CompiledQueryParserTest {
assertEquals(w(q, "foo"), q.root);
}
/**
 * Parses a query with two alternatives separated by {@code |} and checks that
 * {@code intMaxMinAggregate} evaluates to 0 for a mapper that only scores
 * "brief", "tube" and "of".
 *
 * NOTE(review): presumably the aggregate takes the minimum over terms that must
 * co-occur and the maximum over alternatives, so a single unscored term in every
 * alternative would pull the result down to 0 -- confirm against
 * CompiledQueryAggregates.
 */
@Test
public void testCohen() {
CompiledQuery<String> q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )");
// Any term not listed below (e.g. "elaboration") is scored 0 by the default branch.
int val = CompiledQueryAggregates.intMaxMinAggregate(q, s ->
switch (s) {
case "brief" -> 3;
case "tube" -> 2;
case "of" -> 1;
default -> 0;
});
assertEquals(0, val);
// Prints the parsed query's elements to stdout; this runs after the assertion and is not checked.
System.out.println(q.stream().toList());
}
@Test
public void testAndTwoWords() {
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.results;
import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -54,8 +53,6 @@ public class IndexResultScoreCalculator {
this.compiledQuery = params.compiledQuery;
}
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
@Nullable
public SearchResultItem calculateScore(Arena arena,
@Nullable DebugRankingFactors rankingFactors,
@ -67,19 +64,19 @@ public class IndexResultScoreCalculator {
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
int[] counts = new int[compiledQuery.size()];
for (int i = 0; i < counts.length; i++) {
if (positions[i] != null) {
counts[i] = positions[i].valueCount();
}
}
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
// If the document is not relevant to the query, abort early to reduce allocations and
// avoid unnecessary calculations
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return null;
}
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff));
int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount());
if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) {
return null;
}
@ -102,28 +99,7 @@ public class IndexResultScoreCalculator {
searchTerms.coherences,
rankingContext);
SearchResultItem searchResult = new SearchResultItem(combinedId,
docMetadata,
htmlFeatures);
searchResult.setScore(score);
return searchResult;
}
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return true;
}
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
return true;
}
return false;
return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
}
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
@ -320,6 +296,11 @@ public class IndexResultScoreCalculator {
weightedCounts[i] += 0.2f;
else if (spans.nav.containsPosition(pos))
weightedCounts[i] += 0.1f;
else
weightedCounts[i] += 1.0f;
if (spans.externalLinkText.containsPosition(pos))
weightedCounts[i] += 1.0f;
}
if (titleMatch) {
@ -375,14 +356,19 @@ public class IndexResultScoreCalculator {
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
rankingFactors.addDocumentFactor("unordered.title", Integer.toString(unorderedMatchInTitleCount));
rankingFactors.addDocumentFactor("unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
long termId = searchTerms.termIdsAll.at(i);
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
byte flags = (byte) wordFlagsQuery.at(i);
var flags = wordFlagsQuery.at(i);
rankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags));
for (var flag : WordFlags.values()) {
if (flag.isPresent(flags)) {
if (flag.isPresent((byte) flags)) {
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
}
}
@ -409,9 +395,6 @@ public class IndexResultScoreCalculator {
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
}
rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount));
rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
if (positions[i] != null) {
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());

View File

@ -3,7 +3,6 @@ package nu.marginalia.index.results.model.ids;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.stream.LongStream;
public final class TermIdList {
@ -11,7 +10,6 @@ public final class TermIdList {
public TermIdList(long[] array) {
this.array = array;
Arrays.sort(this.array);
}
public TermIdList(LongArrayList list) {
@ -35,12 +33,22 @@ public final class TermIdList {
}
public boolean contains(long id) {
// Implicitly sorted
return Arrays.binarySearch(array, id) >= 0;
// array is typically small and unsorted, so linear search is fine
for (int i = 0; i < array.length; i++) {
if (array[i] == id) {
return true;
}
}
return false;
}
public int indexOf(long id) {
return Arrays.binarySearch(array, id);
for (int i = 0; i < array.length; i++) {
if (array[i] == id) {
return i;
}
}
return -1;
}
@Override

View File

@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
long positions)
{
results.add(new DecoratedSearchResultItem(
new SearchResultItem(url.hashCode(), 2, 3),
new SearchResultItem(url.hashCode(), 2, 3, score),
new EdgeUrl(url),
title,
description,