mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(search) Fix outdated assumptions about the results
We no longer break the query into "sets" of search terms and need to adapt the code to not use this assumption. For the API service, we'll simulate the old behavior to keep the API stable. For the search service, we'll introduce a new way of calculating positions through tree aggregation.
This commit is contained in:
parent
dbdcf459a7
commit
fcdc843c15
@ -121,6 +121,7 @@ public class QueryProtobufCodec {
|
||||
results.getPubYear(), // ??,
|
||||
results.getDataHash(),
|
||||
results.getWordsTotal(),
|
||||
results.getBestPositions(),
|
||||
results.getRankingScore()
|
||||
);
|
||||
}
|
||||
@ -202,6 +203,7 @@ public class QueryProtobufCodec {
|
||||
rpcDecoratedResultItem.getPubYear(),
|
||||
rpcDecoratedResultItem.getDataHash(),
|
||||
rpcDecoratedResultItem.getWordsTotal(),
|
||||
rpcDecoratedResultItem.getBestPositions(),
|
||||
rpcDecoratedResultItem.getRankingScore()
|
||||
);
|
||||
}
|
||||
|
@ -44,4 +44,9 @@ public class CompiledQueryAggregates {
|
||||
public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
|
||||
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
||||
}
|
||||
|
||||
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
|
||||
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||
return query.root().visit(new CqPositionsOperator(query, operator));
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,79 @@
|
||||
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.IntToLongFunction;
|
||||
import java.util.function.ToLongFunction;
|
||||
|
||||
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
|
||||
private final IntToLongFunction operator;
|
||||
|
||||
public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||
this.operator = idx -> operator.applyAsLong(query.at(idx));
|
||||
}
|
||||
|
||||
@Override
|
||||
public LongSet onAnd(List<? extends CqExpression> parts) {
|
||||
LongSet ret = new LongArraySet();
|
||||
|
||||
for (var part : parts) {
|
||||
ret = comineSets(ret, part.visit(this));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private LongSet comineSets(LongSet a, LongSet b) {
|
||||
if (a.isEmpty())
|
||||
return b;
|
||||
if (b.isEmpty())
|
||||
return a;
|
||||
|
||||
LongSet ret = newSet(a.size() * b.size());
|
||||
|
||||
var ai = a.longIterator();
|
||||
|
||||
while (ai.hasNext()) {
|
||||
long aval = ai.nextLong();
|
||||
|
||||
var bi = b.longIterator();
|
||||
while (bi.hasNext()) {
|
||||
ret.add(aval & bi.nextLong());
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LongSet onOr(List<? extends CqExpression> parts) {
|
||||
LongSet ret = newSet(parts.size());
|
||||
|
||||
for (var part : parts) {
|
||||
ret.addAll(part.visit(this));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@Override
|
||||
public LongSet onLeaf(int idx) {
|
||||
var set = newSet(1);
|
||||
set.add(operator.applyAsLong(idx));
|
||||
return set;
|
||||
}
|
||||
|
||||
/** Allocate a new set suitable for a collection with the provided cardinality */
|
||||
private LongSet newSet(int cardinality) {
|
||||
if (cardinality < 8)
|
||||
return new LongArraySet(cardinality);
|
||||
else
|
||||
return new LongOpenHashSet(cardinality);
|
||||
}
|
||||
|
||||
}
|
@ -30,6 +30,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
public final Integer pubYear;
|
||||
public final long dataHash;
|
||||
public final int wordsTotal;
|
||||
public final long bestPositions;
|
||||
public final double rankingScore;
|
||||
|
||||
public long documentId() {
|
||||
@ -65,6 +66,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
Integer pubYear,
|
||||
long dataHash,
|
||||
int wordsTotal,
|
||||
long bestPositions,
|
||||
double rankingScore)
|
||||
{
|
||||
this.rawIndexResult = rawIndexResult;
|
||||
@ -77,6 +79,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
this.pubYear = pubYear;
|
||||
this.dataHash = dataHash;
|
||||
this.wordsTotal = wordsTotal;
|
||||
this.bestPositions = bestPositions;
|
||||
this.rankingScore = rankingScore;
|
||||
}
|
||||
|
||||
|
@ -7,8 +7,6 @@ import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import java.util.Objects;
|
||||
|
||||
public final class SearchResultKeywordScore {
|
||||
@Deprecated
|
||||
public final int subquery;
|
||||
public final long termId;
|
||||
public final String keyword;
|
||||
private final long encodedWordMetadata;
|
||||
@ -22,7 +20,6 @@ public final class SearchResultKeywordScore {
|
||||
long encodedDocMetadata,
|
||||
int htmlFeatures) {
|
||||
this.termId = termId;
|
||||
this.subquery = -1; // FIXME, deprecated
|
||||
this.keyword = keyword;
|
||||
this.encodedWordMetadata = encodedWordMetadata;
|
||||
this.encodedDocMetadata = encodedDocMetadata;
|
||||
@ -37,8 +34,9 @@ public final class SearchResultKeywordScore {
|
||||
return Long.bitCount(positions());
|
||||
}
|
||||
|
||||
@Deprecated // FIXME 2024-04-06
|
||||
public int subquery() {
|
||||
return subquery;
|
||||
return -1;
|
||||
}
|
||||
public long positions() {
|
||||
return WordMetadata.decodePositions(encodedWordMetadata);
|
||||
@ -70,21 +68,19 @@ public final class SearchResultKeywordScore {
|
||||
if (obj == this) return true;
|
||||
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||
var that = (SearchResultKeywordScore) obj;
|
||||
return this.subquery == that.subquery &&
|
||||
Objects.equals(this.keyword, that.keyword) &&
|
||||
return Objects.equals(this.keyword, that.keyword) &&
|
||||
this.encodedWordMetadata == that.encodedWordMetadata &&
|
||||
this.encodedDocMetadata == that.encodedDocMetadata;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata);
|
||||
return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SearchResultKeywordScore[" +
|
||||
"set=" + subquery + ", " +
|
||||
"keyword=" + keyword + ", " +
|
||||
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
|
||||
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']';
|
||||
|
@ -91,6 +91,7 @@ message RpcDecoratedResultItem {
|
||||
int64 dataHash = 9;
|
||||
int32 wordsTotal = 10;
|
||||
double rankingScore = 11; // The ranking score of this search result item, lower is better
|
||||
int64 bestPositions = 12;
|
||||
}
|
||||
|
||||
/** A raw index-service view of a search result */
|
||||
|
@ -155,6 +155,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
||||
.setTitle(result.title)
|
||||
.setUrl(result.url.toString())
|
||||
.setWordsTotal(result.wordsTotal)
|
||||
.setBestPositions(result.bestPositions)
|
||||
.setRawItem(rawItem);
|
||||
|
||||
if (result.pubYear != null) {
|
||||
|
@ -4,7 +4,9 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
@ -152,8 +154,27 @@ public class IndexResultValuatorService {
|
||||
docData.pubYear(),
|
||||
docData.dataHash(),
|
||||
docData.wordsTotal(),
|
||||
bestPositions(resultQuery),
|
||||
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
|
||||
);
|
||||
}
|
||||
|
||||
private long bestPositions(CompiledQuery<SearchResultKeywordScore> resultQuery) {
|
||||
LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions);
|
||||
int bestPc = 0;
|
||||
long bestPositions = 0;
|
||||
|
||||
var li = positionsSet.longIterator();
|
||||
|
||||
while (li.hasNext()) {
|
||||
long pos = li.nextLong();
|
||||
int pc = Long.bitCount(pos);
|
||||
if (pc > bestPc) {
|
||||
bestPc = pc;
|
||||
bestPositions = pos;
|
||||
}
|
||||
}
|
||||
|
||||
return bestPositions;
|
||||
}
|
||||
}
|
||||
|
@ -70,22 +70,24 @@ public class ApiSearchOperator {
|
||||
|
||||
ApiSearchResult convert(DecoratedSearchResultItem url) {
|
||||
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||
|
||||
// This list-of-list construction is to avoid breaking the API,
|
||||
// we'll always have just a single outer list from now on...
|
||||
|
||||
if (url.rawIndexResult != null) {
|
||||
var bySet = url.rawIndexResult.keywordScores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
|
||||
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||
for (var entry : url.rawIndexResult.keywordScores) {
|
||||
var metadata = new WordMetadata(entry.encodedWordMetadata());
|
||||
|
||||
outer:
|
||||
for (var entries : bySet.values()) {
|
||||
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||
for (var entry : entries) {
|
||||
var metadata = new WordMetadata(entry.encodedWordMetadata());
|
||||
if (metadata.isEmpty())
|
||||
continue outer;
|
||||
// Skip terms that don't appear anywhere
|
||||
if (metadata.isEmpty())
|
||||
continue;
|
||||
|
||||
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
|
||||
}
|
||||
details.add(lst);
|
||||
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
|
||||
}
|
||||
|
||||
details.add(lst);
|
||||
}
|
||||
|
||||
return new ApiSearchResult(
|
||||
|
@ -6,7 +6,6 @@ import nu.marginalia.model.idx.WordFlags;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||
* and the rest are additional results, for summary display. */
|
||||
@ -19,44 +18,46 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
||||
* @param details A collection of UrlDetails, which must not be empty.
|
||||
*/
|
||||
public ClusteredUrlDetails(Collection<UrlDetails> details) {
// NOTE(review): this span appears to be a diff rendering without +/- markers; it shows
// both the PriorityQueue-based statements and the sorted-ArrayList statements of the
// same constructor (e.g. `first`, `bestScore` and `scoreLimit` are each assigned twice).
|
||||
var queue = new PriorityQueue<>(details);
|
||||
var items = new ArrayList<>(details);
|
||||
|
||||
if (queue.isEmpty())
|
||||
items.sort(Comparator.naturalOrder());
|
||||
|
||||
if (items.isEmpty())
|
||||
throw new IllegalArgumentException("Empty list of details");
|
||||
|
||||
this.first = queue.poll();
|
||||
// After the natural-order sort, the head of the list becomes the main result
|
||||
this.first = items.removeFirst();
|
||||
this.rest = items;
|
||||
|
||||
if (queue.isEmpty()) {
|
||||
this.rest = Collections.emptyList();
|
||||
}
|
||||
else {
|
||||
double bestScore = first.termScore;
|
||||
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
||||
double bestScore = first.termScore;
|
||||
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
||||
|
||||
this.rest = queue
|
||||
.stream()
|
||||
.filter(this::isEligbleForInclusion)
|
||||
.takeWhile(next -> next.termScore <= scoreLimit)
|
||||
.toList();
|
||||
|
||||
}
|
||||
// removeIf drops an entry when the lambda returns true, and keeps it on false
|
||||
this.rest.removeIf(urlDetail -> {
|
||||
// NOTE(review): returning false here KEEPS entries whose termScore exceeds scoreLimit,
|
||||
// whereas the takeWhile(termScore <= scoreLimit) variant above excluded them.
|
||||
// Confirm whether this should return true instead.
|
||||
if (urlDetail.termScore > scoreLimit)
|
||||
return false;
|
||||
|
||||
for (var keywordScore : urlDetail.resultItem.keywordScores) {
|
||||
// Special keywords and keywords without any positions do not count towards eligibility
|
||||
if (keywordScore.isKeywordSpecial())
|
||||
continue;
|
||||
if (keywordScore.positionCount() == 0)
|
||||
continue;
|
||||
|
||||
// Any one of these flags on a remaining keyword is enough to keep the entry
|
||||
if (keywordScore.hasTermFlag(WordFlags.Title))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.UrlDomain))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.UrlPath))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.Subjects))
|
||||
return false;
|
||||
}
|
||||
|
||||
// No remaining keyword carried any of the flags above: remove the entry
|
||||
return true;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
|
||||
return urlDetails.resultItem.keywordScores.stream()
|
||||
.filter(score -> !score.keyword.contains(":"))
|
||||
.collect(Collectors.toMap(
|
||||
score -> -1, // FIXME
|
||||
score -> score.hasTermFlag(WordFlags.Title)
|
||||
| score.hasTermFlag(WordFlags.ExternalLink)
|
||||
| score.hasTermFlag(WordFlags.UrlDomain)
|
||||
| score.hasTermFlag(WordFlags.UrlPath)
|
||||
| score.hasTermFlag(WordFlags.Subjects)
|
||||
,
|
||||
(a, b) -> a && b
|
||||
))
|
||||
.containsValue(Boolean.TRUE);
|
||||
}
|
||||
|
||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||
this.first = onlyFirst;
|
||||
|
@ -88,7 +88,7 @@ public class SearchQueryIndexService {
|
||||
DomainIndexingState.ACTIVE,
|
||||
detail.rankingScore, // termScore
|
||||
detail.resultsFromDomain(),
|
||||
getPositionsString(detail.rawIndexResult),
|
||||
getPositionsString(detail),
|
||||
detail.rawIndexResult,
|
||||
detail.rawIndexResult.keywordScores
|
||||
));
|
||||
@ -97,27 +97,8 @@ public class SearchQueryIndexService {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private String getPositionsString(SearchResultItem resultItem) {
|
||||
Int2LongArrayMap positionsPerSet = new Int2LongArrayMap(8);
|
||||
|
||||
for (var score : resultItem.keywordScores) {
|
||||
if (!score.isKeywordRegular()) {
|
||||
continue;
|
||||
}
|
||||
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
|
||||
}
|
||||
|
||||
long bits = positionsPerSet.values().longStream().reduce(this::or).orElse(0);
|
||||
|
||||
return BrailleBlockPunchCards.printBits(bits, 56);
|
||||
private String getPositionsString(DecoratedSearchResultItem resultItem) {
|
||||
return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 56);
|
||||
|
||||
}
|
||||
|
||||
private long and(long a, long b) {
|
||||
return a & b;
|
||||
}
|
||||
private long or(long a, long b) {
|
||||
return a | b;
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user