mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(index) Bugs and error fixes, chasing and fixing mystery results that did not contain all relevant keywords
This commit is contained in:
parent
df89661ed2
commit
016a4c62e1
@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled;
|
|||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.function.*;
|
import java.util.function.Function;
|
||||||
|
import java.util.function.ToIntFunction;
|
||||||
|
import java.util.function.ToLongFunction;
|
||||||
import java.util.stream.IntStream;
|
import java.util.stream.IntStream;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
@ -46,8 +48,8 @@ public class CompiledQuery<T> implements Iterable<T> {
|
|||||||
return new CompiledQueryLong(root, data.mapToLong(mapper));
|
return new CompiledQueryLong(root, data.mapToLong(mapper));
|
||||||
}
|
}
|
||||||
|
|
||||||
public CompiledQueryLong mapToInt(ToIntFunction<T> mapper) {
|
public CompiledQueryInt mapToInt(ToIntFunction<T> mapper) {
|
||||||
return new CompiledQueryLong(root, data.mapToInt(mapper));
|
return new CompiledQueryInt(root, data.mapToInt(mapper));
|
||||||
}
|
}
|
||||||
|
|
||||||
public CqExpression root() {
|
public CqExpression root() {
|
||||||
|
@ -33,13 +33,13 @@ public class CqData<T> {
|
|||||||
return new CqDataLong(newData);
|
return new CqDataLong(newData);
|
||||||
}
|
}
|
||||||
|
|
||||||
public CqDataLong mapToInt(ToIntFunction<T> mapper) {
|
public CqDataInt mapToInt(ToIntFunction<T> mapper) {
|
||||||
long[] newData = new long[data.length];
|
int[] newData = new int[data.length];
|
||||||
for (int i = 0; i < data.length; i++) {
|
for (int i = 0; i < data.length; i++) {
|
||||||
newData[i] = mapper.applyAsInt((T) data[i]);
|
newData[i] = mapper.applyAsInt(data[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return new CqDataLong(newData);
|
return new CqDataInt(newData);
|
||||||
}
|
}
|
||||||
|
|
||||||
public T get(int i) {
|
public T get(int i) {
|
||||||
|
@ -32,11 +32,13 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
|||||||
|
|
||||||
public SearchResultItem(long combinedId,
|
public SearchResultItem(long combinedId,
|
||||||
long encodedDocMetadata,
|
long encodedDocMetadata,
|
||||||
int htmlFeatures) {
|
int htmlFeatures,
|
||||||
|
double score) {
|
||||||
this.combinedId = combinedId;
|
this.combinedId = combinedId;
|
||||||
this.encodedDocMetadata = encodedDocMetadata;
|
this.encodedDocMetadata = encodedDocMetadata;
|
||||||
this.keywordScores = new ArrayList<>();
|
this.keywordScores = new ArrayList<>();
|
||||||
this.htmlFeatures = htmlFeatures;
|
this.htmlFeatures = htmlFeatures;
|
||||||
|
this.scoreValue = score;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
package nu.marginalia.api.searchquery.model.compiled;
|
package nu.marginalia.api.searchquery.model.compiled;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
class CompiledQueryParserTest {
|
class CompiledQueryParserTest {
|
||||||
|
|
||||||
@ -22,6 +23,21 @@ class CompiledQueryParserTest {
|
|||||||
assertEquals(w(q, "foo"), q.root);
|
assertEquals(w(q, "foo"), q.root);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCohen() {
|
||||||
|
CompiledQuery<String> q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )");
|
||||||
|
int val = CompiledQueryAggregates.intMaxMinAggregate(q, s ->
|
||||||
|
switch (s) {
|
||||||
|
case "brief" -> 3;
|
||||||
|
case "tube" -> 2;
|
||||||
|
case "of" -> 1;
|
||||||
|
default -> 0;
|
||||||
|
});
|
||||||
|
assertEquals(0, val);
|
||||||
|
|
||||||
|
System.out.println(q.stream().toList());
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAndTwoWords() {
|
public void testAndTwoWords() {
|
||||||
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
|
CompiledQuery<String> q = CompiledQueryParser.parse("foo bar");
|
||||||
|
@ -2,7 +2,6 @@ package nu.marginalia.index.results;
|
|||||||
|
|
||||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
@ -54,8 +53,6 @@ public class IndexResultScoreCalculator {
|
|||||||
this.compiledQuery = params.compiledQuery;
|
this.compiledQuery = params.compiledQuery;
|
||||||
}
|
}
|
||||||
|
|
||||||
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public SearchResultItem calculateScore(Arena arena,
|
public SearchResultItem calculateScore(Arena arena,
|
||||||
@Nullable DebugRankingFactors rankingFactors,
|
@Nullable DebugRankingFactors rankingFactors,
|
||||||
@ -67,19 +64,19 @@ public class IndexResultScoreCalculator {
|
|||||||
|
|
||||||
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
CompiledQuery<CodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
||||||
|
|
||||||
int[] counts = new int[compiledQuery.size()];
|
|
||||||
|
|
||||||
for (int i = 0; i < counts.length; i++) {
|
|
||||||
if (positions[i] != null) {
|
|
||||||
counts[i] = positions[i].valueCount();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
|
|
||||||
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
|
|
||||||
|
|
||||||
// If the document is not relevant to the query, abort early to reduce allocations and
|
// If the document is not relevant to the query, abort early to reduce allocations and
|
||||||
// avoid unnecessary calculations
|
// avoid unnecessary calculations
|
||||||
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
|
|
||||||
|
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
|
||||||
|
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
|
||||||
|
int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff));
|
||||||
|
int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount());
|
||||||
|
|
||||||
|
if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,28 +99,7 @@ public class IndexResultScoreCalculator {
|
|||||||
searchTerms.coherences,
|
searchTerms.coherences,
|
||||||
rankingContext);
|
rankingContext);
|
||||||
|
|
||||||
SearchResultItem searchResult = new SearchResultItem(combinedId,
|
return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
|
||||||
docMetadata,
|
|
||||||
htmlFeatures);
|
|
||||||
|
|
||||||
searchResult.setScore(score);
|
|
||||||
|
|
||||||
return searchResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
|
|
||||||
boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags));
|
|
||||||
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
|
|
||||||
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
|
|
||||||
|
|
||||||
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
||||||
@ -320,6 +296,11 @@ public class IndexResultScoreCalculator {
|
|||||||
weightedCounts[i] += 0.2f;
|
weightedCounts[i] += 0.2f;
|
||||||
else if (spans.nav.containsPosition(pos))
|
else if (spans.nav.containsPosition(pos))
|
||||||
weightedCounts[i] += 0.1f;
|
weightedCounts[i] += 0.1f;
|
||||||
|
else
|
||||||
|
weightedCounts[i] += 1.0f;
|
||||||
|
|
||||||
|
if (spans.externalLinkText.containsPosition(pos))
|
||||||
|
weightedCounts[i] += 1.0f;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (titleMatch) {
|
if (titleMatch) {
|
||||||
@ -375,14 +356,19 @@ public class IndexResultScoreCalculator {
|
|||||||
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
|
rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25));
|
||||||
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
|
rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags));
|
||||||
|
|
||||||
|
rankingFactors.addDocumentFactor("unordered.title", Integer.toString(unorderedMatchInTitleCount));
|
||||||
|
rankingFactors.addDocumentFactor("unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
|
||||||
|
|
||||||
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
|
for (int i = 0; i < searchTerms.termIdsAll.size(); i++) {
|
||||||
long termId = searchTerms.termIdsAll.at(i);
|
long termId = searchTerms.termIdsAll.at(i);
|
||||||
|
|
||||||
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
|
rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i]));
|
||||||
byte flags = (byte) wordFlagsQuery.at(i);
|
var flags = wordFlagsQuery.at(i);
|
||||||
|
|
||||||
|
rankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags));
|
||||||
|
|
||||||
for (var flag : WordFlags.values()) {
|
for (var flag : WordFlags.values()) {
|
||||||
if (flag.isPresent(flags)) {
|
if (flag.isPresent((byte) flags)) {
|
||||||
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
|
rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -409,9 +395,6 @@ public class IndexResultScoreCalculator {
|
|||||||
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
|
rankingFactors.addTermFactor(termId, "verbatim.title", "true");
|
||||||
}
|
}
|
||||||
|
|
||||||
rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount));
|
|
||||||
rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount));
|
|
||||||
|
|
||||||
if (positions[i] != null) {
|
if (positions[i] != null) {
|
||||||
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
|
rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator());
|
||||||
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());
|
rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator());
|
||||||
|
@ -3,7 +3,6 @@ package nu.marginalia.index.results.model.ids;
|
|||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.stream.LongStream;
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
public final class TermIdList {
|
public final class TermIdList {
|
||||||
@ -11,7 +10,6 @@ public final class TermIdList {
|
|||||||
|
|
||||||
public TermIdList(long[] array) {
|
public TermIdList(long[] array) {
|
||||||
this.array = array;
|
this.array = array;
|
||||||
Arrays.sort(this.array);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public TermIdList(LongArrayList list) {
|
public TermIdList(LongArrayList list) {
|
||||||
@ -35,12 +33,22 @@ public final class TermIdList {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean contains(long id) {
|
public boolean contains(long id) {
|
||||||
// Implicitly sorted
|
// array is typically small and unsorted, so linear search is fine
|
||||||
return Arrays.binarySearch(array, id) >= 0;
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] == id) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int indexOf(long id) {
|
public int indexOf(long id) {
|
||||||
return Arrays.binarySearch(array, id);
|
for (int i = 0; i < array.length; i++) {
|
||||||
|
if (array[i] == id) {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
|
|||||||
long positions)
|
long positions)
|
||||||
{
|
{
|
||||||
results.add(new DecoratedSearchResultItem(
|
results.add(new DecoratedSearchResultItem(
|
||||||
new SearchResultItem(url.hashCode(), 2, 3),
|
new SearchResultItem(url.hashCode(), 2, 3, score),
|
||||||
new EdgeUrl(url),
|
new EdgeUrl(url),
|
||||||
title,
|
title,
|
||||||
description,
|
description,
|
||||||
|
Loading…
Reference in New Issue
Block a user