(index) Add index-side deduplication in selectBestResults

This commit is contained in:
Viktor Lofgren 2024-08-10 10:51:59 +02:00
parent 4ece5f847b
commit e6c8a6febe

View File

@ -6,6 +6,7 @@ import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import gnu.trove.map.hash.TObjectLongHashMap;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
@ -174,6 +175,7 @@ public class IndexResultRankingService {
}
List<RpcDecoratedResultItem> resultItems = new ArrayList<>(resultsList.size());
LongOpenHashSet seenDocumentHashes = new LongOpenHashSet(resultsList.size());
// Decorate the results with the document details
for (var result : resultsList) {
@ -185,6 +187,11 @@ public class IndexResultRankingService {
continue;
}
// Filter out duplicates by content hash: only the first result seen for each data hash is kept
if (!seenDocumentHashes.add(docData.dataHash())) {
continue;
}
var rawItem = RpcRawResultItem.newBuilder();
rawItem.setCombinedId(result.combinedId);