diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java index 356a1d86..775d63fb 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled; import org.jetbrains.annotations.NotNull; import java.util.Iterator; -import java.util.function.*; +import java.util.function.Function; +import java.util.function.ToIntFunction; +import java.util.function.ToLongFunction; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -46,8 +48,8 @@ public class CompiledQuery implements Iterable { return new CompiledQueryLong(root, data.mapToLong(mapper)); } - public CompiledQueryLong mapToInt(ToIntFunction mapper) { - return new CompiledQueryLong(root, data.mapToInt(mapper)); + public CompiledQueryInt mapToInt(ToIntFunction mapper) { + return new CompiledQueryInt(root, data.mapToInt(mapper)); } public CqExpression root() { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java index 145f3f0f..63f7301b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -33,13 +33,13 @@ public class CqData { return new CqDataLong(newData); } - public CqDataLong mapToInt(ToIntFunction mapper) { - long[] newData = new long[data.length]; + public CqDataInt mapToInt(ToIntFunction mapper) { + int[] newData = new int[data.length]; for (int i = 0; i < data.length; i++) { - newData[i] = mapper.applyAsInt((T) data[i]); + newData[i] = mapper.applyAsInt(data[i]); } - return new CqDataLong(newData); + return new CqDataInt(newData); } public T get(int i) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index c9599b2e..6a70625c 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -32,11 +32,13 @@ public class SearchResultItem implements Comparable { public SearchResultItem(long combinedId, long encodedDocMetadata, - int htmlFeatures) { + int htmlFeatures, + double score) { this.combinedId = combinedId; this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); this.htmlFeatures = htmlFeatures; + this.scoreValue = score; } diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java index 47983820..e7b1ce5d 100644 --- a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java @@ -1,10 +1,11 @@ package nu.marginalia.api.searchquery.model.compiled; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import org.junit.jupiter.api.Test; import java.util.List; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; class CompiledQueryParserTest { @@ -22,6 +23,21 @@ class CompiledQueryParserTest { assertEquals(w(q, "foo"), q.root); } + @Test + public void testCohen() { + CompiledQuery q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )"); + int val = CompiledQueryAggregates.intMaxMinAggregate(q, s -> + switch (s) { + case "brief" -> 3; + case "tube" -> 2; + case "of" -> 1; + default -> 0; + }); + assertEquals(0, val); + + System.out.println(q.stream().toList()); + } + @Test public void testAndTwoWords() { CompiledQuery q = CompiledQueryParser.parse("foo bar"); diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index e1dd41cf..0705433c 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -2,7 +2,6 @@ package nu.marginalia.index.results; import it.unimi.dsi.fastutil.ints.IntIterator; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; @@ -54,8 +53,6 @@ public class IndexResultScoreCalculator { this.compiledQuery = params.compiledQuery; } - private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); - @Nullable public SearchResultItem calculateScore(Arena arena, @Nullable DebugRankingFactors rankingFactors, @@ -67,19 +64,19 @@ public class IndexResultScoreCalculator { CompiledQuery positionsQuery = compiledQuery.root.newQuery(positions); - int[] counts = new int[compiledQuery.size()]; - - for (int i = 0; i < counts.length; i++) { - if (positions[i] != null) { - counts[i] = positions[i].valueCount(); - } - } - CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts); - CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags); - // If the document is not relevant to the query, abort early to reduce allocations and // avoid unnecessary calculations - if (testRelevance(wordFlagsQuery, positionsCountQuery)) { + + CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags); + if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) { + return null; + } + + boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags)); + int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff)); + int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 0 : pos.valueCount()); + + if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) { return null; } @@ -102,28 +99,7 @@ public class IndexResultScoreCalculator { searchTerms.coherences, rankingContext); - SearchResultItem searchResult = new SearchResultItem(combinedId, - docMetadata, - htmlFeatures); - - searchResult.setScore(score); - - return searchResult; - } - - private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) { - boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags)); - int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask)); - int positionsCount = intMaxMinAggregate(countsQuery, p -> p); - - if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) { - return true; - } - if (flagsCount == 0 && !allSynthetic && positionsCount == 0) { - return true; - } - - return false; + return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score); } private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, @@ -320,6 +296,11 @@ public class IndexResultScoreCalculator { weightedCounts[i] += 0.2f; else if (spans.nav.containsPosition(pos)) weightedCounts[i] += 0.1f; + else + weightedCounts[i] += 1.0f; + + if (spans.externalLinkText.containsPosition(pos)) + weightedCounts[i] += 1.0f; } if (titleMatch) { @@ -375,14 +356,19 @@ public class IndexResultScoreCalculator { rankingFactors.addDocumentFactor("bm25.main", Double.toString(bM25)); rankingFactors.addDocumentFactor("bm25.flags", Double.toString(bFlags)); + rankingFactors.addDocumentFactor("unordered.title", Integer.toString(unorderedMatchInTitleCount)); + rankingFactors.addDocumentFactor("unordered.heading", Integer.toString(unorderedMatchInHeadingCount)); + for (int i = 0; i < searchTerms.termIdsAll.size(); i++) { long termId = searchTerms.termIdsAll.at(i); rankingFactors.addTermFactor(termId, "factor.weightedCount", Double.toString(weightedCounts[i])); - byte flags = (byte) wordFlagsQuery.at(i); + var flags = wordFlagsQuery.at(i); + + rankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags)); for (var flag : WordFlags.values()) { - if (flag.isPresent(flags)) { + if (flag.isPresent((byte) flags)) { rankingFactors.addTermFactor(termId, "flags." + flag.name(), "true"); } } @@ -409,9 +395,6 @@ public class IndexResultScoreCalculator { rankingFactors.addTermFactor(termId, "verbatim.title", "true"); } - rankingFactors.addTermFactor(termId, "unordered.title", Integer.toString(unorderedMatchInTitleCount)); - rankingFactors.addTermFactor(termId, "unordered.heading", Integer.toString(unorderedMatchInHeadingCount)); - if (positions[i] != null) { rankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator()); rankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.iterator(), positions[i].iterator()).iterator()); diff --git a/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java b/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java index 2afba3a6..9737761c 100644 --- a/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java +++ b/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java @@ -3,7 +3,6 @@ package nu.marginalia.index.results.model.ids; import it.unimi.dsi.fastutil.longs.LongArrayList; import java.util.Arrays; -import java.util.Objects; import java.util.stream.LongStream; public final class TermIdList { @@ -11,7 +10,6 @@ public final class TermIdList { public TermIdList(long[] array) { this.array = array; - Arrays.sort(this.array); } public TermIdList(LongArrayList list) { @@ -35,12 +33,22 @@ public final class TermIdList { } public boolean contains(long id) { - // Implicitly sorted - return Arrays.binarySearch(array, id) >= 0; + // array is typically small and unsorted, so linear search is fine + for (int i = 0; i < array.length; i++) { + if (array[i] == id) { + return true; + } + } + return false; } public int indexOf(long id) { - return Arrays.binarySearch(array, id); + for (int i = 0; i < array.length; i++) { + if (array[i] == id) { + return i; + } + } + return -1; } @Override diff --git a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java index 76fb62fc..7ef84262 100644 --- a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java +++ b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java @@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule { long positions) { results.add(new DecoratedSearchResultItem( - new SearchResultItem(url.hashCode(), 2, 3), + new SearchResultItem(url.hashCode(), 2, 3, score), new EdgeUrl(url), title, description,