(search) Fix outdated assumptions about the results

We no longer break the query into "sets" of search terms and need to adapt the code to not use this assumption.

For the API service, we'll simulate the old behavior to keep the API stable.

For the search service, we'll introduce a new way of calculating positions through tree aggregation.
This commit is contained in:
Viktor Lofgren 2024-04-07 11:24:30 +02:00
parent 6cba6aef3b
commit 4fb86ac692
11 changed files with 165 additions and 73 deletions

View File

@ -121,6 +121,7 @@ public class QueryProtobufCodec {
results.getPubYear(), // ??, results.getPubYear(), // ??,
results.getDataHash(), results.getDataHash(),
results.getWordsTotal(), results.getWordsTotal(),
results.getBestPositions(),
results.getRankingScore() results.getRankingScore()
); );
} }
@ -202,6 +203,7 @@ public class QueryProtobufCodec {
rpcDecoratedResultItem.getPubYear(), rpcDecoratedResultItem.getPubYear(),
rpcDecoratedResultItem.getDataHash(), rpcDecoratedResultItem.getDataHash(),
rpcDecoratedResultItem.getWordsTotal(), rpcDecoratedResultItem.getWordsTotal(),
rpcDecoratedResultItem.getBestPositions(),
rpcDecoratedResultItem.getRankingScore() rpcDecoratedResultItem.getRankingScore()
); );
} }

View File

@ -44,4 +44,9 @@ public class CompiledQueryAggregates {
public static List<LongSet> queriesAggregate(CompiledQueryLong query) { public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
} }
/** Collects every value obtainable by combining, with bitwise AND, the longs that
 * {@code operator} yields for the terms of {@code query}. The traversal of the
 * query tree is delegated to {@link CqPositionsOperator}.
 *
 * @param query    the compiled query whose expression tree is visited
 * @param operator maps each query element to the long value to be aggregated
 * @return the set of aggregated values produced by the visitor
 */
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
    var positionsVisitor = new CqPositionsOperator(query, operator);
    return query.root().visit(positionsVisitor);
}
} }

View File

@ -0,0 +1,79 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.ToLongFunction;
/** Expression visitor that computes the set of long values a compiled query can
 * produce when each leaf is mapped to a long by a caller-supplied function.
 * <p>
 * AND nodes combine their operands by taking the bitwise AND of every pair of
 * values drawn from the operands; OR nodes take the union of their operands'
 * value sets; a leaf contributes the single value computed for its term.
 */
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
    /** Upper bound for the capacity hint used when pre-sizing result sets.
     * The sets grow on demand, so this only limits up-front allocation. */
    private static final int MAX_PRESIZE = 1 << 16;

    /** Maps a leaf index to its long value via the query and the supplied function. */
    private final IntToLongFunction operator;

    /**
     * @param query    the compiled query; leaf indices are resolved through {@code query.at(idx)}
     * @param operator function producing the long value for a query element
     */
    public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    /** Folds the value sets of all parts together with {@link #combineSets}. */
    @Override
    public LongSet onAnd(List<? extends CqExpression> parts) {
        LongSet ret = new LongArraySet();

        for (var part : parts) {
            ret = combineSets(ret, part.visit(this));
        }

        return ret;
    }

    /** Returns the set of {@code x & y} for every x in {@code a} and y in {@code b}.
     * An empty operand acts as the identity: the other operand is returned as-is.
     */
    private LongSet combineSets(LongSet a, LongSet b) {
        if (a.isEmpty())
            return b;
        if (b.isEmpty())
            return a;

        // Compute the pair count in long arithmetic: the int product can overflow
        // into a negative (or misleadingly small) capacity for large operands.
        long pairCount = (long) a.size() * b.size();
        LongSet ret = newSet((int) Math.min(pairCount, MAX_PRESIZE));

        var ai = a.longIterator();
        while (ai.hasNext()) {
            long aval = ai.nextLong();

            var bi = b.longIterator();
            while (bi.hasNext()) {
                ret.add(aval & bi.nextLong());
            }
        }

        return ret;
    }

    /** Returns the union of the value sets of all parts. */
    @Override
    public LongSet onOr(List<? extends CqExpression> parts) {
        LongSet ret = newSet(parts.size());

        for (var part : parts) {
            ret.addAll(part.visit(this));
        }

        return ret;
    }

    /** Returns a single-element set holding the value computed for the leaf at {@code idx}. */
    @Override
    public LongSet onLeaf(int idx) {
        var set = newSet(1);
        set.add(operator.applyAsLong(idx));
        return set;
    }

    /** Allocate a new set suitable for a collection with the provided cardinality:
     * an array-backed set below 8 elements, a hash set otherwise. */
    private LongSet newSet(int cardinality) {
        if (cardinality < 8)
            return new LongArraySet(cardinality);
        else
            return new LongOpenHashSet(cardinality);
    }
}

View File

@ -30,6 +30,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public final Integer pubYear; public final Integer pubYear;
public final long dataHash; public final long dataHash;
public final int wordsTotal; public final int wordsTotal;
public final long bestPositions;
public final double rankingScore; public final double rankingScore;
public long documentId() { public long documentId() {
@ -65,6 +66,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
Integer pubYear, Integer pubYear,
long dataHash, long dataHash,
int wordsTotal, int wordsTotal,
long bestPositions,
double rankingScore) double rankingScore)
{ {
this.rawIndexResult = rawIndexResult; this.rawIndexResult = rawIndexResult;
@ -77,6 +79,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
this.pubYear = pubYear; this.pubYear = pubYear;
this.dataHash = dataHash; this.dataHash = dataHash;
this.wordsTotal = wordsTotal; this.wordsTotal = wordsTotal;
this.bestPositions = bestPositions;
this.rankingScore = rankingScore; this.rankingScore = rankingScore;
} }

View File

@ -7,8 +7,6 @@ import nu.marginalia.model.idx.DocumentMetadata;
import java.util.Objects; import java.util.Objects;
public final class SearchResultKeywordScore { public final class SearchResultKeywordScore {
@Deprecated
public final int subquery;
public final long termId; public final long termId;
public final String keyword; public final String keyword;
private final long encodedWordMetadata; private final long encodedWordMetadata;
@ -22,7 +20,6 @@ public final class SearchResultKeywordScore {
long encodedDocMetadata, long encodedDocMetadata,
int htmlFeatures) { int htmlFeatures) {
this.termId = termId; this.termId = termId;
this.subquery = -1; // FIXME, deprecated
this.keyword = keyword; this.keyword = keyword;
this.encodedWordMetadata = encodedWordMetadata; this.encodedWordMetadata = encodedWordMetadata;
this.encodedDocMetadata = encodedDocMetadata; this.encodedDocMetadata = encodedDocMetadata;
@ -37,8 +34,9 @@ public final class SearchResultKeywordScore {
return Long.bitCount(positions()); return Long.bitCount(positions());
} }
@Deprecated // FIXME 2024-04-06
public int subquery() { public int subquery() {
return subquery; return -1;
} }
public long positions() { public long positions() {
return WordMetadata.decodePositions(encodedWordMetadata); return WordMetadata.decodePositions(encodedWordMetadata);
@ -70,21 +68,19 @@ public final class SearchResultKeywordScore {
if (obj == this) return true; if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false; if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (SearchResultKeywordScore) obj; var that = (SearchResultKeywordScore) obj;
return this.subquery == that.subquery && return Objects.equals(this.keyword, that.keyword) &&
Objects.equals(this.keyword, that.keyword) &&
this.encodedWordMetadata == that.encodedWordMetadata && this.encodedWordMetadata == that.encodedWordMetadata &&
this.encodedDocMetadata == that.encodedDocMetadata; this.encodedDocMetadata == that.encodedDocMetadata;
} }
@Override @Override
public int hashCode() { public int hashCode() {
return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata); return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata);
} }
@Override @Override
public String toString() { public String toString() {
return "SearchResultKeywordScore[" + return "SearchResultKeywordScore[" +
"set=" + subquery + ", " +
"keyword=" + keyword + ", " + "keyword=" + keyword + ", " +
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']'; "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']';

View File

@ -91,6 +91,7 @@ message RpcDecoratedResultItem {
int64 dataHash = 9; int64 dataHash = 9;
int32 wordsTotal = 10; int32 wordsTotal = 10;
double rankingScore = 11; // The ranking score of this search result item, lower is better double rankingScore = 11; // The ranking score of this search result item, lower is better
int64 bestPositions = 12;
} }
/** A raw index-service view of a search result */ /** A raw index-service view of a search result */

View File

@ -155,6 +155,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
.setTitle(result.title) .setTitle(result.title)
.setUrl(result.url.toString()) .setUrl(result.url.toString())
.setWordsTotal(result.wordsTotal) .setWordsTotal(result.wordsTotal)
.setBestPositions(result.bestPositions)
.setRawItem(rawItem); .setRawItem(rawItem);
if (result.pubYear != null) { if (result.pubYear != null) {

View File

@ -4,7 +4,9 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import gnu.trove.list.TLongList; import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList; import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@ -152,8 +154,27 @@ public class IndexResultValuatorService {
docData.pubYear(), docData.pubYear(),
docData.dataHash(), docData.dataHash(),
docData.wordsTotal(), docData.wordsTotal(),
bestPositions(resultQuery),
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
); );
}
private long bestPositions(CompiledQuery<SearchResultKeywordScore> resultQuery) {
LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions);
int bestPc = 0;
long bestPositions = 0;
var li = positionsSet.longIterator();
while (li.hasNext()) {
long pos = li.nextLong();
int pc = Long.bitCount(pos);
if (pc > bestPc) {
bestPc = pc;
bestPositions = pos;
}
}
return bestPositions;
} }
} }

View File

@ -70,23 +70,25 @@ public class ApiSearchOperator {
ApiSearchResult convert(DecoratedSearchResultItem url) { ApiSearchResult convert(DecoratedSearchResultItem url) {
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>(); List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
if (url.rawIndexResult != null) {
var bySet = url.rawIndexResult.keywordScores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
outer: // This list-of-list construction is to avoid breaking the API,
for (var entries : bySet.values()) { // we'll always have just a single outer list from now on...
if (url.rawIndexResult != null) {
List<ApiSearchResultQueryDetails> lst = new ArrayList<>(); List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
for (var entry : entries) { for (var entry : url.rawIndexResult.keywordScores) {
var metadata = new WordMetadata(entry.encodedWordMetadata()); var metadata = new WordMetadata(entry.encodedWordMetadata());
// Skip terms that don't appear anywhere
if (metadata.isEmpty()) if (metadata.isEmpty())
continue outer; continue;
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet()); Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags)); lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
} }
details.add(lst); details.add(lst);
} }
}
return new ApiSearchResult( return new ApiSearchResult(
url.url.toString(), url.url.toString(),

View File

@ -6,7 +6,6 @@ import nu.marginalia.model.idx.WordFlags;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result /** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
* and the rest are additional results, for summary display. */ * and the rest are additional results, for summary display. */
@ -19,44 +18,46 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
* @param details A collection of UrlDetails, which must not be empty. * @param details A collection of UrlDetails, which must not be empty.
*/ */
public ClusteredUrlDetails(Collection<UrlDetails> details) {
    var items = new ArrayList<>(details);

    if (items.isEmpty())
        throw new IllegalArgumentException("Empty list of details");

    // Natural ordering puts the main result first
    // (presumably ordered by score -- confirm against UrlDetails.compareTo)
    items.sort(Comparator.naturalOrder());

    this.first = items.removeFirst();
    this.rest = items;

    double bestScore = first.termScore;
    double scoreLimit = Math.min(4.0, bestScore * 1.25);

    // removeIf drops an entry when the predicate returns true
    this.rest.removeIf(urlDetail -> {
        // Additional results scoring above the limit are discarded; this is the
        // same cut-off the previous takeWhile(termScore <= scoreLimit) enforced.
        if (urlDetail.termScore > scoreLimit)
            return true;

        // Keep the result if any regular keyword that is present in the document
        // matches in a prominent place
        for (var keywordScore : urlDetail.resultItem.keywordScores) {
            if (keywordScore.isKeywordSpecial())
                continue;
            if (keywordScore.positionCount() == 0)
                continue;

            if (keywordScore.hasTermFlag(WordFlags.Title))
                return false;
            if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
                return false;
            if (keywordScore.hasTermFlag(WordFlags.UrlDomain))
                return false;
            if (keywordScore.hasTermFlag(WordFlags.UrlPath))
                return false;
            if (keywordScore.hasTermFlag(WordFlags.Subjects))
                return false;
        }

        // No prominent keyword match: not worth showing as an additional result
        return true;
    });
}
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
return urlDetails.resultItem.keywordScores.stream()
.filter(score -> !score.keyword.contains(":"))
.collect(Collectors.toMap(
score -> -1, // FIXME
score -> score.hasTermFlag(WordFlags.Title)
| score.hasTermFlag(WordFlags.ExternalLink)
| score.hasTermFlag(WordFlags.UrlDomain)
| score.hasTermFlag(WordFlags.UrlPath)
| score.hasTermFlag(WordFlags.Subjects)
,
(a, b) -> a && b
))
.containsValue(Boolean.TRUE);
}
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) { public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
this.first = onlyFirst; this.first = onlyFirst;

View File

@ -88,7 +88,7 @@ public class SearchQueryIndexService {
DomainIndexingState.ACTIVE, DomainIndexingState.ACTIVE,
detail.rankingScore, // termScore detail.rankingScore, // termScore
detail.resultsFromDomain(), detail.resultsFromDomain(),
getPositionsString(detail.rawIndexResult), getPositionsString(detail),
detail.rawIndexResult, detail.rawIndexResult,
detail.rawIndexResult.keywordScores detail.rawIndexResult.keywordScores
)); ));
@ -97,27 +97,8 @@ public class SearchQueryIndexService {
return ret; return ret;
} }
private String getPositionsString(SearchResultItem resultItem) { private String getPositionsString(DecoratedSearchResultItem resultItem) {
Int2LongArrayMap positionsPerSet = new Int2LongArrayMap(8); return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 56);
for (var score : resultItem.keywordScores) {
if (!score.isKeywordRegular()) {
continue;
}
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
}
long bits = positionsPerSet.values().longStream().reduce(this::or).orElse(0);
return BrailleBlockPunchCards.printBits(bits, 56);
} }
private long and(long a, long b) {
return a & b;
}
private long or(long a, long b) {
return a | b;
}
} }