(index) Bug fixes: tracking down and fixing mystery results that did not contain all relevant keywords

This commit is contained in:
Viktor Lofgren 2024-08-09 16:38:21 +02:00
parent df89661ed2
commit 016a4c62e1
7 changed files with 67 additions and 56 deletions

View File

@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import java.util.Iterator; import java.util.Iterator;
import java.util.function.*; import java.util.function.Function;
import java.util.function.ToIntFunction;
import java.util.function.ToLongFunction;
import java.util.stream.IntStream; import java.util.stream.IntStream;
import java.util.stream.Stream; import java.util.stream.Stream;
@ -46,8 +48,8 @@ public class CompiledQuery<T> implements Iterable<T> {
return new CompiledQueryLong(root, data.mapToLong(mapper)); return new CompiledQueryLong(root, data.mapToLong(mapper));
} }
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) { public CompiledQueryInt mapToInt(ToIntFunction<T> mapper) {
return new CompiledQueryLong(root, data.mapToInt(mapper)); return new CompiledQueryInt(root, data.mapToInt(mapper));
} }
public CqExpression root() { public CqExpression root() {

View File

@ -33,13 +33,13 @@ public class CqData<T> {
return new CqDataLong(newData); return new CqDataLong(newData);
} }
public CqDataLong mapToInt(ToIntFunction<T> mapper) { public CqDataInt mapToInt(ToIntFunction<T> mapper) {
long[] newData = new long[data.length]; int[] newData = new int[data.length];
for (int i = 0; i < data.length; i++) { for (int i = 0; i < data.length; i++) {
newData[i] = mapper.applyAsInt((T) data[i]); newData[i] = mapper.applyAsInt(data[i]);
} }
return new CqDataLong(newData); return new CqDataInt(newData);
} }
public T get(int i) { public T get(int i) {

View File

@ -32,11 +32,13 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
public SearchResultItem(long combinedId, public SearchResultItem(long combinedId,
long encodedDocMetadata, long encodedDocMetadata,
int htmlFeatures) { int htmlFeatures,
double score) {
this.combinedId = combinedId; this.combinedId = combinedId;
this.encodedDocMetadata = encodedDocMetadata; this.encodedDocMetadata = encodedDocMetadata;
this.keywordScores = new ArrayList<>(); this.keywordScores = new ArrayList<>();
this.htmlFeatures = htmlFeatures; this.htmlFeatures = htmlFeatures;
this.scoreValue = score;
} }

View File

@ -1,10 +1,11 @@
package nu.marginalia.api.searchquery.model.compiled; package nu.marginalia.api.searchquery.model.compiled;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.util.List; import java.util.List;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals;
class CompiledQueryParserTest { class CompiledQueryParserTest {
@ -22,6 +23,21 @@ class CompiledQueryParserTest {
assertEquals(w(q, "foo"), q.root); assertEquals(w(q, "foo"), q.root);
} }
/**
 * Parses a two-alternative query and checks the int max-min aggregate over it.
 *
 * The mapper assigns non-zero values only to "brief", "tube" and "of"; every
 * other term (e.g. "elaboration" and the joined token in the second
 * alternative) maps to 0, and the aggregate is expected to be 0.
 * NOTE(review): presumably the aggregate takes the minimum within each
 * alternative and the maximum across alternatives -- confirm against
 * CompiledQueryAggregates.
 */
@Test
public void testCohen() {
    CompiledQuery<String> query = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )");

    int aggregate = CompiledQueryAggregates.intMaxMinAggregate(query, term -> {
        switch (term) {
            case "brief":
                return 3;
            case "tube":
                return 2;
            case "of":
                return 1;
            default:
                return 0;
        }
    });

    assertEquals(0, aggregate);

    // Dump the parsed terms to stdout for manual inspection
    System.out.println(query.stream().toList());
}
@Test @Test
public void testAndTwoWords() { public void testAndTwoWords() {
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar"); CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.results;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -54,8 +53,6 @@ public class IndexResultScoreCalculator {
this.compiledQuery = params.compiledQuery; this.compiledQuery = params.compiledQuery;
} }
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
@Nullable @Nullable
public SearchResultItem calculateScore(Arena arena, public SearchResultItem calculateScore(Arena arena,
@Nullable DebugRankingFactors rankingFactors, @Nullable DebugRankingFactors rankingFactors,
@ -67,19 +64,19 @@ public class IndexResultScoreCalculator {
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions); CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
int[] counts = new int[compiledQuery.size()];
for (int i = 0; i < counts.length; i++) {
if (positions[i] != null) {
counts[i] = positions[i].valueCount();
}
}
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
// If the document is not relevant to the query, abort early to reduce allocations and // If the document is not relevant to the query, abort early to reduce allocations and
// avoid unnecessary calculations // avoid unnecessary calculations
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return null;
}
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff));
int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount());
if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) {
return null; return null;
} }
@ -102,28 +99,7 @@ public class IndexResultScoreCalculator {
searchTerms.coherences, searchTerms.coherences,
rankingContext); rankingContext);
SearchResultItem searchResult = new SearchResultItem(combinedId, return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
docMetadata,
htmlFeatures);
searchResult.setScore(score);
return searchResult;
}
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return true;
}
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
return true;
}
return false;
} }
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
@ -320,6 +296,11 @@ public class IndexResultScoreCalculator {
weightedCounts[i] += 0.2f; weightedCounts[i] += 0.2f;
else if (spans.nav.containsPosition(pos)) else if (spans.nav.containsPosition(pos))
weightedCounts[i] += 0.1f; weightedCounts[i] += 0.1f;
else
weightedCounts[i] += 1.0f;
if (spans.externalLinkText.containsPosition(pos))
weightedCounts[i] += 1.0f;
} }
if (titleMatch) { if (titleMatch) {
@ -375,14 +356,19 @@ public class IndexResultScoreCalculator {
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25)); rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags)); rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
rankingFactors.addDocumentFactor("unordered.title", Integer.toString(unorderedMatchInTitleCount));
rankingFactors.addDocumentFactor("unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) { for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
long termId = searchTerms.termIdsAll.at(i); long termId = searchTerms.termIdsAll.at(i);
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i])); rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
byte flags = (byte) wordFlagsQuery.at(i); var flags = wordFlagsQuery.at(i);
rankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags));
for (var flag : WordFlags.values()) { for (var flag : WordFlags.values()) {
if (flag.isPresent(flags)) { if (flag.isPresent((byte) flags)) {
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true"); rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
} }
} }
@ -409,9 +395,6 @@ public class IndexResultScoreCalculator {
rankingFactors.addTermFactor(termId, "verbatim.title", "true"); rankingFactors.addTermFactor(termId, "verbatim.title", "true");
} }
rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount));
rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
if (positions[i] != null) { if (positions[i] != null) {
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator()); rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator()); rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());

View File

@ -3,7 +3,6 @@ package nu.marginalia.index.results.model.ids;
import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Objects;
import java.util.stream.LongStream; import java.util.stream.LongStream;
public final class TermIdList { public final class TermIdList {
@ -11,7 +10,6 @@ public final class TermIdList {
public TermIdList(long[] array) { public TermIdList(long[] array) {
this.array = array; this.array = array;
Arrays.sort(this.array);
} }
public TermIdList(LongArrayList list) { public TermIdList(LongArrayList list) {
@ -35,12 +33,22 @@ public final class TermIdList {
} }
public boolean contains(long id) { public boolean contains(long id) {
// Implicitly sorted // array is typically small and unsorted, so linear search is fine
return Arrays.binarySearch(array, id) >= 0; for (int i = 0; i < array.length; i++) {
if (array[i] == id) {
return true;
}
}
return false;
} }
public int indexOf(long id) { public int indexOf(long id) {
return Arrays.binarySearch(array, id); for (int i = 0; i < array.length; i++) {
if (array[i] == id) {
return i;
}
}
return -1;
} }
@Override @Override

View File

@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
long positions) long positions)
{ {
results.add(new DecoratedSearchResultItem( results.add(new DecoratedSearchResultItem(
new SearchResultItem(url.hashCode(), 2, 3), new SearchResultItem(url.hashCode(), 2, 3, score),
new EdgeUrl(url), new EdgeUrl(url),
title, title,
description, description,