diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java index 3badddae..8acc297e 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java @@ -161,7 +161,9 @@ public class SearchOperator { searchVisitorCount.registerQuery(); List details = queryResponse.results().stream() + .sorted(this::retentionSortOrder) // Sort in an order that makes us more likely to discard the "bad" duplicates .filter(deduplicator::shouldRetain) + .sorted() // Return to the presentation sort order before limiting so we don't throw out good results over schema and "ip-ness" .limit(limits.resultsTotal()) .map(SearchOperator::createDetails) .toList(); @@ -177,6 +179,29 @@ public class SearchOperator { return new SimpleSearchResults(details, pages); } + /** A sorting order that makes us more likely to discard the "bad apple", when deduplicating. + * Sometimes the search engine has found the same content via different access routes to the same server, + * this may be raw IP access, or http access. Try to weed these out by sorting in a way that prefers + * https over http, and domains that don't look like IPs to those that do + */ + private int retentionSortOrder(DecoratedSearchResultItem a, DecoratedSearchResultItem b) { + + // Note we reverse the order of a and b below, to prefer items with https over not + int schemaDiff = Boolean.compare("https".equalsIgnoreCase(b.url.proto), "https".equalsIgnoreCase(a.url.proto)); + if (schemaDiff != 0) + return schemaDiff; + + // Prefer documents accessed via a domain name over those from a raw IP; + // this is a somewhat rough heuristic to only look at the first digit, but + // we don't want to spend a lot of CPU on this so it's good enough for 99.9% of cases + + int isLikelyIPDiff = Boolean.compare(Character.isDigit(a.url.domain.topDomain.charAt(0)), Character.isDigit(b.url.domain.topDomain.charAt(0))); + if (isLikelyIPDiff != 0) + return isLikelyIPDiff; + + return Double.compare(a.rankingScore, b.rankingScore); + } + private static UrlDetails createDetails(DecoratedSearchResultItem item) { return new UrlDetails( item.documentId(),