(search) Sort and deduplicate search results for better relevance.

Added a custom sort order that is applied before deduplication, preferring HTTPS over HTTP and domain-based URLs over raw IPs. This makes it more likely that the "bad" duplicates are the ones discarded; the results are then re-sorted into the original presentation order before the result limit is applied.
2025-02-23 13:09:00 +00:00 · 2024-12-13 15:22:20 +01:00 · 2024-12-13 15:22:20 +01:00 · e4769f541d
commit e4769f541d
parent 2a173e2861
1 changed files with 25 additions and 0 deletions
--- a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java
+++ b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java
@ -161,7 +161,9 @@ public class SearchOperator {
        searchVisitorCount.registerQuery();

        List<UrlDetails> details = queryResponse.results().stream()
+                .sorted(this::retentionSortOrder) // Sort in an order that makes us more likely to discard the "bad" duplicates
                .filter(deduplicator::shouldRetain)
+                .sorted() // Return to the presentation sort order before limiting so we don't throw out good results over schema and "ip-ness"
                .limit(limits.resultsTotal())
                .map(SearchOperator::createDetails)
                .toList();
@ -177,6 +179,29 @@ public class SearchOperator {
        return new SimpleSearchResults(details, pages);
    }

+    /** A sorting order that makes us more likely to discard the "bad apple" when deduplicating.
+     *  Sometimes the search engine has found the same content via different access routes to the same server,
+     *  e.g. raw IP access or plain http access.  Orders https before non-https, then domains that don't look
+     *  like IPs before those that do, and finally falls back to ascending rankingScore as the tie-breaker.
+     */
+    private int retentionSortOrder(DecoratedSearchResultItem a, DecoratedSearchResultItem b) {
+
+        // Note that a and b are swapped below, so that https items (true) sort before non-https items (false)
+        int schemaDiff = Boolean.compare("https".equalsIgnoreCase(b.url.proto), "https".equalsIgnoreCase(a.url.proto));
+        if (schemaDiff != 0)
+            return schemaDiff;
+
+        // Prefer documents accessed via a domain name over those from a raw IP.  This is a rough heuristic that
+        // only looks at the first character of topDomain, but it is cheap and good enough for 99.9% of cases.
+        // NOTE(review): charAt(0) throws on an empty topDomain -- presumably it is never empty here; confirm
+
+        int isLikelyIPDiff = Boolean.compare(Character.isDigit(a.url.domain.topDomain.charAt(0)), Character.isDigit(b.url.domain.topDomain.charAt(0)));
+        if (isLikelyIPDiff != 0)
+            return isLikelyIPDiff;
+
+        return Double.compare(a.rankingScore, b.rankingScore); // tie-break: lower rankingScore sorts first
+    }
+
    private static UrlDetails createDetails(DecoratedSearchResultItem item) {
        return new UrlDetails(
                item.documentId(),