mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
(search) Fix outdated assumptions about the results
We no longer break the query into "sets" of search terms and need to adapt the code to not use this assumption. For the API service, we'll simulate the old behavior to keep the API stable. For the search service, we'll introduce a new way of calculating positions through tree aggregation.
This commit is contained in:
parent
6cba6aef3b
commit
4fb86ac692
@ -121,6 +121,7 @@ public class QueryProtobufCodec {
|
|||||||
results.getPubYear(), // ??,
|
results.getPubYear(), // ??,
|
||||||
results.getDataHash(),
|
results.getDataHash(),
|
||||||
results.getWordsTotal(),
|
results.getWordsTotal(),
|
||||||
|
results.getBestPositions(),
|
||||||
results.getRankingScore()
|
results.getRankingScore()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -202,6 +203,7 @@ public class QueryProtobufCodec {
|
|||||||
rpcDecoratedResultItem.getPubYear(),
|
rpcDecoratedResultItem.getPubYear(),
|
||||||
rpcDecoratedResultItem.getDataHash(),
|
rpcDecoratedResultItem.getDataHash(),
|
||||||
rpcDecoratedResultItem.getWordsTotal(),
|
rpcDecoratedResultItem.getWordsTotal(),
|
||||||
|
rpcDecoratedResultItem.getBestPositions(),
|
||||||
rpcDecoratedResultItem.getRankingScore()
|
rpcDecoratedResultItem.getRankingScore()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -44,4 +44,9 @@ public class CompiledQueryAggregates {
|
|||||||
public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
|
public static List<LongSet> queriesAggregate(CompiledQueryLong query) {
|
||||||
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
|
||||||
|
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||||
|
return query.root().visit(new CqPositionsOperator(query, operator));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,79 @@
|
|||||||
|
package nu.marginalia.api.searchquery.model.compiled.aggregate;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongArraySet;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.function.IntToLongFunction;
|
||||||
|
import java.util.function.ToLongFunction;
|
||||||
|
|
||||||
|
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
|
||||||
|
private final IntToLongFunction operator;
|
||||||
|
|
||||||
|
public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
|
||||||
|
this.operator = idx -> operator.applyAsLong(query.at(idx));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LongSet onAnd(List<? extends CqExpression> parts) {
|
||||||
|
LongSet ret = new LongArraySet();
|
||||||
|
|
||||||
|
for (var part : parts) {
|
||||||
|
ret = comineSets(ret, part.visit(this));
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
private LongSet comineSets(LongSet a, LongSet b) {
|
||||||
|
if (a.isEmpty())
|
||||||
|
return b;
|
||||||
|
if (b.isEmpty())
|
||||||
|
return a;
|
||||||
|
|
||||||
|
LongSet ret = newSet(a.size() * b.size());
|
||||||
|
|
||||||
|
var ai = a.longIterator();
|
||||||
|
|
||||||
|
while (ai.hasNext()) {
|
||||||
|
long aval = ai.nextLong();
|
||||||
|
|
||||||
|
var bi = b.longIterator();
|
||||||
|
while (bi.hasNext()) {
|
||||||
|
ret.add(aval & bi.nextLong());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LongSet onOr(List<? extends CqExpression> parts) {
|
||||||
|
LongSet ret = newSet(parts.size());
|
||||||
|
|
||||||
|
for (var part : parts) {
|
||||||
|
ret.addAll(part.visit(this));
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public LongSet onLeaf(int idx) {
|
||||||
|
var set = newSet(1);
|
||||||
|
set.add(operator.applyAsLong(idx));
|
||||||
|
return set;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Allocate a new set suitable for a collection with the provided cardinality */
|
||||||
|
private LongSet newSet(int cardinality) {
|
||||||
|
if (cardinality < 8)
|
||||||
|
return new LongArraySet(cardinality);
|
||||||
|
else
|
||||||
|
return new LongOpenHashSet(cardinality);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -30,6 +30,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
|||||||
public final Integer pubYear;
|
public final Integer pubYear;
|
||||||
public final long dataHash;
|
public final long dataHash;
|
||||||
public final int wordsTotal;
|
public final int wordsTotal;
|
||||||
|
public final long bestPositions;
|
||||||
public final double rankingScore;
|
public final double rankingScore;
|
||||||
|
|
||||||
public long documentId() {
|
public long documentId() {
|
||||||
@ -65,6 +66,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
|||||||
Integer pubYear,
|
Integer pubYear,
|
||||||
long dataHash,
|
long dataHash,
|
||||||
int wordsTotal,
|
int wordsTotal,
|
||||||
|
long bestPositions,
|
||||||
double rankingScore)
|
double rankingScore)
|
||||||
{
|
{
|
||||||
this.rawIndexResult = rawIndexResult;
|
this.rawIndexResult = rawIndexResult;
|
||||||
@ -77,6 +79,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
|||||||
this.pubYear = pubYear;
|
this.pubYear = pubYear;
|
||||||
this.dataHash = dataHash;
|
this.dataHash = dataHash;
|
||||||
this.wordsTotal = wordsTotal;
|
this.wordsTotal = wordsTotal;
|
||||||
|
this.bestPositions = bestPositions;
|
||||||
this.rankingScore = rankingScore;
|
this.rankingScore = rankingScore;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,8 +7,6 @@ import nu.marginalia.model.idx.DocumentMetadata;
|
|||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
|
||||||
public final class SearchResultKeywordScore {
|
public final class SearchResultKeywordScore {
|
||||||
@Deprecated
|
|
||||||
public final int subquery;
|
|
||||||
public final long termId;
|
public final long termId;
|
||||||
public final String keyword;
|
public final String keyword;
|
||||||
private final long encodedWordMetadata;
|
private final long encodedWordMetadata;
|
||||||
@ -22,7 +20,6 @@ public final class SearchResultKeywordScore {
|
|||||||
long encodedDocMetadata,
|
long encodedDocMetadata,
|
||||||
int htmlFeatures) {
|
int htmlFeatures) {
|
||||||
this.termId = termId;
|
this.termId = termId;
|
||||||
this.subquery = -1; // FIXME, deprecated
|
|
||||||
this.keyword = keyword;
|
this.keyword = keyword;
|
||||||
this.encodedWordMetadata = encodedWordMetadata;
|
this.encodedWordMetadata = encodedWordMetadata;
|
||||||
this.encodedDocMetadata = encodedDocMetadata;
|
this.encodedDocMetadata = encodedDocMetadata;
|
||||||
@ -37,8 +34,9 @@ public final class SearchResultKeywordScore {
|
|||||||
return Long.bitCount(positions());
|
return Long.bitCount(positions());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Deprecated // FIXME 2024-04-06
|
||||||
public int subquery() {
|
public int subquery() {
|
||||||
return subquery;
|
return -1;
|
||||||
}
|
}
|
||||||
public long positions() {
|
public long positions() {
|
||||||
return WordMetadata.decodePositions(encodedWordMetadata);
|
return WordMetadata.decodePositions(encodedWordMetadata);
|
||||||
@ -70,21 +68,19 @@ public final class SearchResultKeywordScore {
|
|||||||
if (obj == this) return true;
|
if (obj == this) return true;
|
||||||
if (obj == null || obj.getClass() != this.getClass()) return false;
|
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||||
var that = (SearchResultKeywordScore) obj;
|
var that = (SearchResultKeywordScore) obj;
|
||||||
return this.subquery == that.subquery &&
|
return Objects.equals(this.keyword, that.keyword) &&
|
||||||
Objects.equals(this.keyword, that.keyword) &&
|
|
||||||
this.encodedWordMetadata == that.encodedWordMetadata &&
|
this.encodedWordMetadata == that.encodedWordMetadata &&
|
||||||
this.encodedDocMetadata == that.encodedDocMetadata;
|
this.encodedDocMetadata == that.encodedDocMetadata;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata);
|
return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "SearchResultKeywordScore[" +
|
return "SearchResultKeywordScore[" +
|
||||||
"set=" + subquery + ", " +
|
|
||||||
"keyword=" + keyword + ", " +
|
"keyword=" + keyword + ", " +
|
||||||
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
|
"encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " +
|
||||||
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']';
|
"encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']';
|
||||||
|
@ -91,6 +91,7 @@ message RpcDecoratedResultItem {
|
|||||||
int64 dataHash = 9;
|
int64 dataHash = 9;
|
||||||
int32 wordsTotal = 10;
|
int32 wordsTotal = 10;
|
||||||
double rankingScore = 11; // The ranking score of this search result item, lower is better
|
double rankingScore = 11; // The ranking score of this search result item, lower is better
|
||||||
|
int64 bestPositions = 12;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** A raw index-service view of a search result */
|
/** A raw index-service view of a search result */
|
||||||
|
@ -155,6 +155,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
.setTitle(result.title)
|
.setTitle(result.title)
|
||||||
.setUrl(result.url.toString())
|
.setUrl(result.url.toString())
|
||||||
.setWordsTotal(result.wordsTotal)
|
.setWordsTotal(result.wordsTotal)
|
||||||
|
.setBestPositions(result.bestPositions)
|
||||||
.setRawItem(rawItem);
|
.setRawItem(rawItem);
|
||||||
|
|
||||||
if (result.pubYear != null) {
|
if (result.pubYear != null) {
|
||||||
|
@ -4,7 +4,9 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import gnu.trove.list.TLongList;
|
import gnu.trove.list.TLongList;
|
||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
@ -152,8 +154,27 @@ public class IndexResultValuatorService {
|
|||||||
docData.pubYear(),
|
docData.pubYear(),
|
||||||
docData.dataHash(),
|
docData.dataHash(),
|
||||||
docData.wordsTotal(),
|
docData.wordsTotal(),
|
||||||
|
bestPositions(resultQuery),
|
||||||
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
|
resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext)
|
||||||
);
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private long bestPositions(CompiledQuery<SearchResultKeywordScore> resultQuery) {
|
||||||
|
LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions);
|
||||||
|
int bestPc = 0;
|
||||||
|
long bestPositions = 0;
|
||||||
|
|
||||||
|
var li = positionsSet.longIterator();
|
||||||
|
|
||||||
|
while (li.hasNext()) {
|
||||||
|
long pos = li.nextLong();
|
||||||
|
int pc = Long.bitCount(pos);
|
||||||
|
if (pc > bestPc) {
|
||||||
|
bestPc = pc;
|
||||||
|
bestPositions = pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return bestPositions;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -70,23 +70,25 @@ public class ApiSearchOperator {
|
|||||||
|
|
||||||
ApiSearchResult convert(DecoratedSearchResultItem url) {
|
ApiSearchResult convert(DecoratedSearchResultItem url) {
|
||||||
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||||
if (url.rawIndexResult != null) {
|
|
||||||
var bySet = url.rawIndexResult.keywordScores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
|
|
||||||
|
|
||||||
outer:
|
// This list-of-list construction is to avoid breaking the API,
|
||||||
for (var entries : bySet.values()) {
|
// we'll always have just a single outer list from now on...
|
||||||
|
|
||||||
|
if (url.rawIndexResult != null) {
|
||||||
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||||
for (var entry : entries) {
|
for (var entry : url.rawIndexResult.keywordScores) {
|
||||||
var metadata = new WordMetadata(entry.encodedWordMetadata());
|
var metadata = new WordMetadata(entry.encodedWordMetadata());
|
||||||
|
|
||||||
|
// Skip terms that don't appear anywhere
|
||||||
if (metadata.isEmpty())
|
if (metadata.isEmpty())
|
||||||
continue outer;
|
continue;
|
||||||
|
|
||||||
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
|
lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags));
|
||||||
}
|
}
|
||||||
|
|
||||||
details.add(lst);
|
details.add(lst);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return new ApiSearchResult(
|
return new ApiSearchResult(
|
||||||
url.url.toString(),
|
url.url.toString(),
|
||||||
|
@ -6,7 +6,6 @@ import nu.marginalia.model.idx.WordFlags;
|
|||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||||
* and the rest are additional results, for summary display. */
|
* and the rest are additional results, for summary display. */
|
||||||
@ -19,44 +18,46 @@ public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
|||||||
* @param details A collection of UrlDetails, which must not be empty.
|
* @param details A collection of UrlDetails, which must not be empty.
|
||||||
*/
|
*/
|
||||||
public ClusteredUrlDetails(Collection<UrlDetails> details) {
|
public ClusteredUrlDetails(Collection<UrlDetails> details) {
|
||||||
var queue = new PriorityQueue<>(details);
|
var items = new ArrayList<>(details);
|
||||||
|
|
||||||
if (queue.isEmpty())
|
items.sort(Comparator.naturalOrder());
|
||||||
|
|
||||||
|
if (items.isEmpty())
|
||||||
throw new IllegalArgumentException("Empty list of details");
|
throw new IllegalArgumentException("Empty list of details");
|
||||||
|
|
||||||
this.first = queue.poll();
|
this.first = items.removeFirst();
|
||||||
|
this.rest = items;
|
||||||
|
|
||||||
if (queue.isEmpty()) {
|
|
||||||
this.rest = Collections.emptyList();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
double bestScore = first.termScore;
|
double bestScore = first.termScore;
|
||||||
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
||||||
|
|
||||||
this.rest = queue
|
this.rest.removeIf(urlDetail -> {
|
||||||
.stream()
|
if (urlDetail.termScore > scoreLimit)
|
||||||
.filter(this::isEligbleForInclusion)
|
return false;
|
||||||
.takeWhile(next -> next.termScore <= scoreLimit)
|
|
||||||
.toList();
|
for (var keywordScore : urlDetail.resultItem.keywordScores) {
|
||||||
|
if (keywordScore.isKeywordSpecial())
|
||||||
|
continue;
|
||||||
|
if (keywordScore.positionCount() == 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
if (keywordScore.hasTermFlag(WordFlags.Title))
|
||||||
|
return false;
|
||||||
|
if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
|
||||||
|
return false;
|
||||||
|
if (keywordScore.hasTermFlag(WordFlags.UrlDomain))
|
||||||
|
return false;
|
||||||
|
if (keywordScore.hasTermFlag(WordFlags.UrlPath))
|
||||||
|
return false;
|
||||||
|
if (keywordScore.hasTermFlag(WordFlags.Subjects))
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean isEligbleForInclusion(UrlDetails urlDetails) {
|
|
||||||
return urlDetails.resultItem.keywordScores.stream()
|
|
||||||
.filter(score -> !score.keyword.contains(":"))
|
|
||||||
.collect(Collectors.toMap(
|
|
||||||
score -> -1, // FIXME
|
|
||||||
score -> score.hasTermFlag(WordFlags.Title)
|
|
||||||
| score.hasTermFlag(WordFlags.ExternalLink)
|
|
||||||
| score.hasTermFlag(WordFlags.UrlDomain)
|
|
||||||
| score.hasTermFlag(WordFlags.UrlPath)
|
|
||||||
| score.hasTermFlag(WordFlags.Subjects)
|
|
||||||
,
|
|
||||||
(a, b) -> a && b
|
|
||||||
))
|
|
||||||
.containsValue(Boolean.TRUE);
|
|
||||||
}
|
|
||||||
|
|
||||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||||
this.first = onlyFirst;
|
this.first = onlyFirst;
|
||||||
|
@ -88,7 +88,7 @@ public class SearchQueryIndexService {
|
|||||||
DomainIndexingState.ACTIVE,
|
DomainIndexingState.ACTIVE,
|
||||||
detail.rankingScore, // termScore
|
detail.rankingScore, // termScore
|
||||||
detail.resultsFromDomain(),
|
detail.resultsFromDomain(),
|
||||||
getPositionsString(detail.rawIndexResult),
|
getPositionsString(detail),
|
||||||
detail.rawIndexResult,
|
detail.rawIndexResult,
|
||||||
detail.rawIndexResult.keywordScores
|
detail.rawIndexResult.keywordScores
|
||||||
));
|
));
|
||||||
@ -97,27 +97,8 @@ public class SearchQueryIndexService {
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getPositionsString(SearchResultItem resultItem) {
|
private String getPositionsString(DecoratedSearchResultItem resultItem) {
|
||||||
Int2LongArrayMap positionsPerSet = new Int2LongArrayMap(8);
|
return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 56);
|
||||||
|
|
||||||
for (var score : resultItem.keywordScores) {
|
|
||||||
if (!score.isKeywordRegular()) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
|
|
||||||
}
|
|
||||||
|
|
||||||
long bits = positionsPerSet.values().longStream().reduce(this::or).orElse(0);
|
|
||||||
|
|
||||||
return BrailleBlockPunchCards.printBits(bits, 56);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private long and(long a, long b) {
|
|
||||||
return a & b;
|
|
||||||
}
|
|
||||||
private long or(long a, long b) {
|
|
||||||
return a | b;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user