(search) Fix outdated assumptions about the results

We no longer break the query into "sets" of search terms and need to adapt the code to not use this assumption.

For the API service, we'll simulate the old behavior to keep the API stable.

For the search service, we'll introduce a new way of calculating positions through tree aggregation.
This commit is contained in:
Viktor Lofgren 2024-04-07 11:24:30 +02:00
parent dbdcf459a7
commit fcdc843c15
11 changed files with 165 additions and 73 deletions

View File

@ -121,6 +121,7 @@ public class QueryProtobufCodec {
results.getPubYear(), // ??,
results.getDataHash(),
results.getWordsTotal(),
results.getBestPositions(),
results.getRankingScore()
);
}
@ -202,6 +203,7 @@ public class QueryProtobufCodec {
rpcDecoratedResultItem.getPubYear(),
rpcDecoratedResultItem.getDataHash(),
rpcDecoratedResultItem.getWordsTotal(),
rpcDecoratedResultItem.getBestPositions(),
rpcDecoratedResultItem.getRankingScore()
);
}

View File

@ -44,4 +44,9 @@ public class CompiledQueryAggregates {
public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
    // Run the paths operator over the whole query tree, starting at the root ...
    var pathsVisitor = new CqQueryPathsOperator(query);
    var paths = query.root().visit(pathsVisitor);

    // ... and hand the caller its own list holding the result.
    return new ArrayList<>(paths);
}
/** Aggregate the long values that {@code operator} produces for the terms of the query.
 * <p>
 * The query tree is walked with a {@link CqPositionsOperator}: values that belong to the
 * same AND-branch are combined with bitwise AND, while the alternatives of an OR-branch
 * are kept side by side.  The returned set holds every combined value obtained this way.
 *
 * @param query    the compiled query to aggregate over
 * @param operator extracts the long value of a single query term
 * @return the set of all combined values
 */
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
    var positionsVisitor = new CqPositionsOperator(query, operator);
    return query.root().visit(positionsVisitor);
}
}

View File

@ -0,0 +1,79 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.ToLongFunction;
/** Visitor that aggregates per-term long values (e.g. position bit masks) over a compiled
 * query tree.
 * <p>
 * A leaf yields the single value of its term, an OR node yields the union of the values of
 * its children, and an AND node yields the bitwise AND of every combination of one value
 * from each of its children.
 */
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
    /** Upper bound for the pre-sizing hint handed to {@link #newSet(int)}.  The sets grow on
     * demand, so the cap only limits up-front allocation, never the number of results. */
    private static final int MAX_SIZE_HINT = 1 << 12;

    /** Maps a term index in the query to the long value of that term */
    private final IntToLongFunction operator;

    /**
     * @param query    the compiled query whose terms are looked up by index
     * @param operator extracts the long value from a single term of the query
     */
    public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    /** Fold the children together pairwise, AND-ing every combination of their values. */
    @Override
    public LongSet onAnd(List<? extends CqExpression> parts) {
        LongSet ret = new LongArraySet();

        for (var part : parts) {
            ret = combineSets(ret, part.visit(this));
        }

        return ret;
    }

    /** Return the set of {@code x & y} for every x in a and every y in b.
     * <p>
     * An empty set acts as the identity element: combining with it returns the other set
     * unchanged, which is what lets {@link #onAnd} start its fold from an empty set.
     */
    private LongSet combineSets(LongSet a, LongSet b) {
        if (a.isEmpty())
            return b;
        if (b.isEmpty())
            return a;

        // The product is only an upper bound on the result size (many combinations AND down
        // to the same value).  Compute it as a long so it cannot overflow into a negative
        // capacity, and cap it so two large inputs do not trigger a huge allocation.
        long maxResults = (long) a.size() * b.size();
        LongSet ret = newSet((int) Math.min(maxResults, MAX_SIZE_HINT));

        var ai = a.longIterator();
        while (ai.hasNext()) {
            long aval = ai.nextLong();

            var bi = b.longIterator();
            while (bi.hasNext()) {
                ret.add(aval & bi.nextLong());
            }
        }

        return ret;
    }

    /** Collect the values of all children into a single set. */
    @Override
    public LongSet onOr(List<? extends CqExpression> parts) {
        LongSet ret = newSet(parts.size());

        for (var part : parts) {
            ret.addAll(part.visit(this));
        }

        return ret;
    }

    /** Return a set holding only the value of the term at the given index. */
    @Override
    public LongSet onLeaf(int idx) {
        var set = newSet(1);
        set.add(operator.applyAsLong(idx));
        return set;
    }

    /** Allocate a new set suitable for a collection with the provided cardinality */
    private LongSet newSet(int cardinality) {
        // Small sets use the array-backed implementation, larger ones the hash-based one
        if (cardinality < 8)
            return new LongArraySet(cardinality);
        else
            return new LongOpenHashSet(cardinality);
    }
}

View File

@ -30,6 +30,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public final Integer pubYear;
public final long dataHash;
public final int wordsTotal;
public final long bestPositions;
public final double rankingScore;
public long documentId() {
@ -65,6 +66,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
Integer pubYear,
long dataHash,
int wordsTotal,
long bestPositions,
double rankingScore)
{
this.rawIndexResult = rawIndexResult;
@ -77,6 +79,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
this.pubYear = pubYear;
this.dataHash = dataHash;
this.wordsTotal = wordsTotal;
this.bestPositions = bestPositions;
this.rankingScore = rankingScore;
}

View File

@ -7,8 +7,6 @@ import nu.marginalia.model.idx.DocumentMetadata;
import java.util.Objects;
public final class SearchResultKeywordScore {
@Deprecated
public final int subquery;
public final long termId;
public final String keyword;
private final long encodedWordMetadata;
@ -22,7 +20,6 @@ public final class SearchResultKeywordScore {
long encodedDocMetadata,
int htmlFeatures) {
this.termId = termId;
this.subquery = -1; // FIXME, deprecated
this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata;
this.encodedDocMetadata = encodedDocMetadata;
@ -37,8 +34,9 @@ public final class SearchResultKeywordScore {
return Long.bitCount(positions());
}
@Deprecated // FIXME 2024-04-06
public int subquery() {
return subquery;
return -1;
}
public long positions() {
return WordMetadata.decodePositions(encodedWordMetadata);
@ -70,21 +68,19 @@ public final class SearchResultKeywordScore {
if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (SearchResultKeywordScore) obj;
return this.subquery == that.subquery &&
Objects.equals(this.keyword, that.keyword) &&
return Objects.equals(this.keyword, that.keyword) &&
this.encodedWordMetadata == that.encodedWordMetadata &&
this.encodedDocMetadata == that.encodedDocMetadata;
}
@Override
public int hashCode() {
return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata);
return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata);
}
@Override
public String toString() {
return "SearchResultKeywordScore[" +
"set=" + subquery + ", " +
"keyword=" + keyword + ", " +
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']';

View File

@ -91,6 +91,7 @@ message RpcDecoratedResultItem {
int64 dataHash = 9;
int32 wordsTotal = 10;
double rankingScore = 11; // The ranking score of this search result item, lower is better
int64 bestPositions = 12;
}
/** A raw index-service view of a search result */

View File

@ -155,6 +155,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
.setTitle(result.title)
.setUrl(result.url.toString())
.setWordsTotal(result.wordsTotal)
.setBestPositions(result.bestPositions)
.setRawItem(rawItem);
if (result.pubYear != null) {

View File

@ -4,7 +4,9 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@ -152,8 +154,27 @@ public class IndexResultValuatorService {
docData.pubYear(),
docData.dataHash(),
docData.wordsTotal(),
bestPositions(resultQuery),
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
);
}
/** Pick the aggregated position mask that has the largest number of bits set.
 * <p>
 * If several masks share the highest bit count, the first one encountered while iterating
 * the aggregate wins.  Returns 0 when no mask has any bit set.
 */
private long bestPositions(CompiledQuery<SearchResultKeywordScore> resultQuery) {
    LongSet candidates = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions);

    long winner = 0;
    int winnerBits = 0;

    for (var iter = candidates.longIterator(); iter.hasNext(); ) {
        long candidate = iter.nextLong();
        int candidateBits = Long.bitCount(candidate);

        // Only a strictly higher bit count replaces the current winner
        if (candidateBits <= winnerBits)
            continue;

        winner = candidate;
        winnerBits = candidateBits;
    }

    return winner;
}
}

View File

@ -70,22 +70,24 @@ public class ApiSearchOperator {
ApiSearchResult convert(DecoratedSearchResultItem url) {
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
// This list-of-lists construction exists only to avoid breaking the API;
// from now on the outer list will contain at most one inner list.
if (url.rawIndexResult != null) {
var bySet = url.rawIndexResult.keywordScores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
for (var entry : url.rawIndexResult.keywordScores) {
var metadata = new WordMetadata(entry.encodedWordMetadata());
outer:
for (var entries : bySet.values()) {
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
for (var entry : entries) {
var metadata = new WordMetadata(entry.encodedWordMetadata());
if (metadata.isEmpty())
continue outer;
// Skip terms that don't appear anywhere
if (metadata.isEmpty())
continue;
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
}
details.add(lst);
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
}
details.add(lst);
}
return new ApiSearchResult(

View File

@ -6,7 +6,6 @@ import nu.marginalia.model.idx.WordFlags;
import org.jetbrains.annotations.NotNull;
import java.util.*;
import java.util.stream.Collectors;
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
* and the rest are additional results, for summary display. */
@ -19,44 +18,46 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
* @param details A collection of UrlDetails, which must not be empty.
*/
public ClusteredUrlDetails(Collection<UrlDetails> details) {
var queue = new PriorityQueue<>(details);
var items = new ArrayList<>(details);
if (queue.isEmpty())
items.sort(Comparator.naturalOrder());
if (items.isEmpty())
throw new IllegalArgumentException("Empty list of details");
this.first = queue.poll();
this.first = items.removeFirst();
this.rest = items;
if (queue.isEmpty()) {
this.rest = Collections.emptyList();
}
else {
double bestScore = first.termScore;
double scoreLimit = Math.min(4.0, bestScore * 1.25);
double bestScore = first.termScore;
double scoreLimit = Math.min(4.0, bestScore * 1.25);
this.rest = queue
.stream()
.filter(this::isEligbleForInclusion)
.takeWhile(next -> next.termScore <= scoreLimit)
.toList();
}
// Prune the secondary results.  removeIf() discards every element for which the
// predicate returns true, so "return true" below means "drop this result".
this.rest.removeIf(urlDetail -> {
    // Lower term scores are better: anything scoring worse than the limit derived
    // from the best result must be dropped, not kept.
    if (urlDetail.termScore > scoreLimit)
        return true;

    for (var keywordScore : urlDetail.resultItem.keywordScores) {
        // Skip special keywords and terms with no recorded positions
        if (keywordScore.isKeywordSpecial())
            continue;
        if (keywordScore.positionCount() == 0)
            continue;

        // Keep the result as soon as one term carries any of these flags
        if (keywordScore.hasTermFlag(WordFlags.Title))
            return false;
        if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
            return false;
        if (keywordScore.hasTermFlag(WordFlags.UrlDomain))
            return false;
        if (keywordScore.hasTermFlag(WordFlags.UrlPath))
            return false;
        if (keywordScore.hasTermFlag(WordFlags.Subjects))
            return false;
    }

    // No considered term carried any of the flags above
    return true;
});
}
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
return urlDetails.resultItem.keywordScores.stream()
.filter(score -> !score.keyword.contains(":"))
.collect(Collectors.toMap(
score -> -1, // FIXME
score -> score.hasTermFlag(WordFlags.Title)
| score.hasTermFlag(WordFlags.ExternalLink)
| score.hasTermFlag(WordFlags.UrlDomain)
| score.hasTermFlag(WordFlags.UrlPath)
| score.hasTermFlag(WordFlags.Subjects)
,
(a, b) -> a && b
))
.containsValue(Boolean.TRUE);
}
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
this.first = onlyFirst;

View File

@ -88,7 +88,7 @@ public class SearchQueryIndexService {
DomainIndexingState.ACTIVE,
detail.rankingScore, // termScore
detail.resultsFromDomain(),
getPositionsString(detail.rawIndexResult),
getPositionsString(detail),
detail.rawIndexResult,
detail.rawIndexResult.keywordScores
));
@ -97,27 +97,8 @@ public class SearchQueryIndexService {
return ret;
}
private String getPositionsString(SearchResultItem resultItem) {
Int2LongArrayMap positionsPerSet = new Int2LongArrayMap(8);
for (var score : resultItem.keywordScores) {
if (!score.isKeywordRegular()) {
continue;
}
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
}
long bits = positionsPerSet.values().longStream().reduce(this::or).orElse(0);
return BrailleBlockPunchCards.printBits(bits, 56);
private String getPositionsString(DecoratedSearchResultItem resultItem) {
    // The position mask is already available on the result item, so all that is
    // left is to print it (56 is presumably the number of bits to print -- confirm).
    final long positionMask = resultItem.bestPositions;

    return BrailleBlockPunchCards.printBits(positionMask, 56);
}
private long and(long a, long b) {
return a & b;
}
private long or(long a, long b) {
return a | b;
}
}