(search) Sort and deduplicate search results for better relevance.

Added a custom sorting mechanism to prioritize HTTPS over HTTP and domain-based URLs over raw IPs during deduplication. Ensures "bad duplicates" are discarded while maintaining the original presentation order for user-facing results.
This commit is contained in:
Viktor Lofgren 2024-12-13 15:22:20 +01:00
parent 2a173e2861
commit e4769f541d

View File

@ -161,7 +161,9 @@ public class SearchOperator {
searchVisitorCount.registerQuery();
List<UrlDetails> details = queryResponse.results().stream()
.sorted(this::retentionSortOrder) // Sort in an order that makes us more likely to discard the "bad" duplicates
.filter(deduplicator::shouldRetain)
.sorted() // Return to the presentation sort order before limiting so we don't throw out good results over schema and "ip-ness"
.limit(limits.resultsTotal())
.map(SearchOperator::createDetails)
.toList();
@ -177,6 +179,29 @@ public class SearchOperator {
return new SimpleSearchResults(details, pages);
}
/** A sorting order that makes us more likely to discard the "bad apple", when deduplicating.
* Sometimes the search engine has found the same content via different access routes to the same server,
* this may be raw IP access, or http access. Try to weed these out by sorting in a way that prefers
* https over http, and domains that don't look like IPs to those that do
*/
private int retentionSortOrder(DecoratedSearchResultItem a, DecoratedSearchResultItem b) {
// Note we reverse the order of a and b below, to prefer items with https over not
int schemaDiff = Boolean.compare("https".equalsIgnoreCase(b.url.proto), "https".equalsIgnoreCase(a.url.proto));
if (schemaDiff != 0)
return schemaDiff;
// Prefer documents accessed via a domain name over those from a raw IP;
// this is a somewhat rough heuristic to only look at the first digit, but
// we don't want to spend a lot of CPU on this so it's good enough for 99.9% of cases
int isLikelyIPDiff = Boolean.compare(Character.isDigit(a.url.domain.topDomain.charAt(0)), Character.isDigit(b.url.domain.topDomain.charAt(0)));
if (isLikelyIPDiff != 0)
return isLikelyIPDiff;
return Double.compare(a.rankingScore, b.rankingScore);
}
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
return new UrlDetails(
item.documentId(),