mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(index) Bugs and error fixes, chasing and fixing mystery results that did not contain all relevant keywords
This commit is contained in:
parent
df89661ed2
commit
016a4c62e1
@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.function.*;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.ToIntFunction;
|
||||
import java.util.function.ToLongFunction;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@ -46,8 +48,8 @@ public class CompiledQuery<T> implements Iterable<T> {
|
||||
return new CompiledQueryLong(root, data.mapToLong(mapper));
|
||||
}
|
||||
|
||||
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) {
|
||||
return new CompiledQueryLong(root, data.mapToInt(mapper));
|
||||
public CompiledQueryInt mapToInt(ToIntFunction<T> mapper) {
|
||||
return new CompiledQueryInt(root, data.mapToInt(mapper));
|
||||
}
|
||||
|
||||
public CqExpression root() {
|
||||
|
@ -33,13 +33,13 @@ public class CqData<T> {
|
||||
return new CqDataLong(newData);
|
||||
}
|
||||
|
||||
public CqDataLong mapToInt(ToIntFunction<T> mapper) {
|
||||
long[] newData = new long[data.length];
|
||||
public CqDataInt mapToInt(ToIntFunction<T> mapper) {
|
||||
int[] newData = new int[data.length];
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
newData[i] = mapper.applyAsInt((T) data[i]);
|
||||
newData[i] = mapper.applyAsInt(data[i]);
|
||||
}
|
||||
|
||||
return new CqDataLong(newData);
|
||||
return new CqDataInt(newData);
|
||||
}
|
||||
|
||||
public T get(int i) {
|
||||
|
@ -32,11 +32,13 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
|
||||
public SearchResultItem(long combinedId,
|
||||
long encodedDocMetadata,
|
||||
int htmlFeatures) {
|
||||
int htmlFeatures,
|
||||
double score) {
|
||||
this.combinedId = combinedId;
|
||||
this.encodedDocMetadata = encodedDocMetadata;
|
||||
this.keywordScores = new ArrayList<>();
|
||||
this.htmlFeatures = htmlFeatures;
|
||||
this.scoreValue = score;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,10 +1,11 @@
|
||||
package nu.marginalia.api.searchquery.model.compiled;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class CompiledQueryParserTest {
|
||||
|
||||
@ -22,6 +23,21 @@ class CompiledQueryParserTest {
|
||||
assertEquals(w(q, "foo"), q.root);
|
||||
}
|
||||
|
||||
// Checks CompiledQueryAggregates.intMaxMinAggregate on a query made of two
// alternatives separated by '|': a run of four plain words, and one joined token.
@Test
|
||||
public void testCohen() {
|
||||
CompiledQuery<String> q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )");
|
||||
// Score three known words; any other term string (e.g. "elaboration" or the
// joined "brief_elaboration_of_a_tube") falls through to the default of 0.
int val = CompiledQueryAggregates.intMaxMinAggregate(q, s ->
|
||||
switch (s) {
|
||||
case "brief" -> 3;
|
||||
case "tube" -> 2;
|
||||
case "of" -> 1;
|
||||
default -> 0;
|
||||
});
|
||||
// NOTE(review): presumably the aggregate takes the minimum within each alternative
// and the maximum across alternatives, so a 0-scored term in both alternatives
// gives 0 overall -- confirm against CompiledQueryAggregates.
assertEquals(0, val);
|
||||
|
||||
// Prints the query's elements for manual inspection; nothing is asserted on this output.
System.out.println(q.stream().toList());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAndTwoWords() {
|
||||
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.index.results;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
@ -54,8 +53,6 @@ public class IndexResultScoreCalculator {
|
||||
this.compiledQuery = params.compiledQuery;
|
||||
}
|
||||
|
||||
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
||||
|
||||
@Nullable
|
||||
public SearchResultItem calculateScore(Arena arena,
|
||||
@Nullable DebugRankingFactors rankingFactors,
|
||||
@ -67,19 +64,19 @@ public class IndexResultScoreCalculator {
|
||||
|
||||
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
||||
|
||||
int[] counts = new int[compiledQuery.size()];
|
||||
|
||||
for (int i = 0; i < counts.length; i++) {
|
||||
if (positions[i] != null) {
|
||||
counts[i] = positions[i].valueCount();
|
||||
}
|
||||
}
|
||||
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
|
||||
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
|
||||
|
||||
// If the document is not relevant to the query, abort early to reduce allocations and
|
||||
// avoid unnecessary calculations
|
||||
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
|
||||
|
||||
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
|
||||
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
|
||||
int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff));
|
||||
int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount());
|
||||
|
||||
if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@ -102,28 +99,7 @@ public class IndexResultScoreCalculator {
|
||||
searchTerms.coherences,
|
||||
rankingContext);
|
||||
|
||||
SearchResultItem searchResult = new SearchResultItem(combinedId,
|
||||
docMetadata,
|
||||
htmlFeatures);
|
||||
|
||||
searchResult.setScore(score);
|
||||
|
||||
return searchResult;
|
||||
}
|
||||
|
||||
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
|
||||
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
|
||||
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
|
||||
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
|
||||
|
||||
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
|
||||
return true;
|
||||
}
|
||||
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
|
||||
}
|
||||
|
||||
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
||||
@ -320,6 +296,11 @@ public class IndexResultScoreCalculator {
|
||||
weightedCounts[i] += 0.2f;
|
||||
else if (spans.nav.containsPosition(pos))
|
||||
weightedCounts[i] += 0.1f;
|
||||
else
|
||||
weightedCounts[i] += 1.0f;
|
||||
|
||||
if (spans.externalLinkText.containsPosition(pos))
|
||||
weightedCounts[i] += 1.0f;
|
||||
}
|
||||
|
||||
if (titleMatch) {
|
||||
@ -375,14 +356,19 @@ public class IndexResultScoreCalculator {
|
||||
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
|
||||
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
|
||||
|
||||
rankingFactors.addDocumentFactor("unordered.title", Integer.toString(unorderedMatchInTitleCount));
|
||||
rankingFactors.addDocumentFactor("unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
|
||||
|
||||
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
|
||||
long termId = searchTerms.termIdsAll.at(i);
|
||||
|
||||
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
|
||||
byte flags = (byte) wordFlagsQuery.at(i);
|
||||
var flags = wordFlagsQuery.at(i);
|
||||
|
||||
rankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags));
|
||||
|
||||
for (var flag : WordFlags.values()) {
|
||||
if (flag.isPresent(flags)) {
|
||||
if (flag.isPresent((byte) flags)) {
|
||||
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
|
||||
}
|
||||
}
|
||||
@ -409,9 +395,6 @@ public class IndexResultScoreCalculator {
|
||||
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
|
||||
}
|
||||
|
||||
rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount));
|
||||
rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
|
||||
|
||||
if (positions[i] != null) {
|
||||
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
|
||||
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());
|
||||
|
@ -3,7 +3,6 @@ package nu.marginalia.index.results.model.ids;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public final class TermIdList {
|
||||
@ -11,7 +10,6 @@ public final class TermIdList {
|
||||
|
||||
public TermIdList(long[] array) {
|
||||
this.array = array;
|
||||
Arrays.sort(this.array);
|
||||
}
|
||||
|
||||
public TermIdList(LongArrayList list) {
|
||||
@ -35,12 +33,22 @@ public final class TermIdList {
|
||||
}
|
||||
|
||||
public boolean contains(long id) {
|
||||
// Implicitly sorted
|
||||
return Arrays.binarySearch(array, id) >= 0;
|
||||
// array is typically small and unsorted, so linear search is fine
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
if (array[i] == id) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int indexOf(long id) {
|
||||
return Arrays.binarySearch(array, id);
|
||||
for (int i = 0; i < array.length; i++) {
|
||||
if (array[i] == id) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
|
||||
long positions)
|
||||
{
|
||||
results.add(new DecoratedSearchResultItem(
|
||||
new SearchResultItem(url.hashCode(), 2, 3),
|
||||
new SearchResultItem(url.hashCode(), 2, 3, score),
|
||||
new EdgeUrl(url),
|
||||
title,
|
||||
description,
|
||||
|
Loading…
Reference in New Issue
Block a user