mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(search) Sort and deduplicate search results for better relevance.
Added a custom sorting mechanism to prioritize HTTPS over HTTP and domain-based URLs over raw IPs during deduplication. Ensures "bad duplicates" are discarded while maintaining the original presentation order for user-facing results.
This commit is contained in:
parent
2a173e2861
commit
e4769f541d
@ -161,7 +161,9 @@ public class SearchOperator {
|
||||
searchVisitorCount.registerQuery();
|
||||
|
||||
List<UrlDetails> details = queryResponse.results().stream()
|
||||
.sorted(this::retentionSortOrder) // Sort in an order that makes us more likely to discard the "bad" duplicates
|
||||
.filter(deduplicator::shouldRetain)
|
||||
.sorted() // Return to the presentation sort order before limiting so we don't throw out good results over schema and "ip-ness"
|
||||
.limit(limits.resultsTotal())
|
||||
.map(SearchOperator::createDetails)
|
||||
.toList();
|
||||
@ -177,6 +179,29 @@ public class SearchOperator {
|
||||
return new SimpleSearchResults(details, pages);
|
||||
}
|
||||
|
||||
/** A sorting order that makes us more likely to discard the "bad apple", when deduplicating.
|
||||
* Sometimes the search engine has found the same content via different access routes to the same server,
|
||||
* this may be raw IP access, or http access. Try to weed these out by sorting in a way that prefers
|
||||
* https over http, and domains that don't look like IPs to those that do
|
||||
*/
|
||||
private int retentionSortOrder(DecoratedSearchResultItem a, DecoratedSearchResultItem b) {
|
||||
|
||||
// Note we reverse the order of a and b below, to prefer items with https over not
|
||||
int schemaDiff = Boolean.compare("https".equalsIgnoreCase(b.url.proto), "https".equalsIgnoreCase(a.url.proto));
|
||||
if (schemaDiff != 0)
|
||||
return schemaDiff;
|
||||
|
||||
// Prefer documents accessed via a domain name over those from a raw IP;
|
||||
// this is a somewhat rough heuristic to only look at the first digit, but
|
||||
// we don't want to spend a lot of CPU on this so it's good enough for 99.9% of cases
|
||||
|
||||
int isLikelyIPDiff = Boolean.compare(Character.isDigit(a.url.domain.topDomain.charAt(0)), Character.isDigit(b.url.domain.topDomain.charAt(0)));
|
||||
if (isLikelyIPDiff != 0)
|
||||
return isLikelyIPDiff;
|
||||
|
||||
return Double.compare(a.rankingScore, b.rankingScore);
|
||||
}
|
||||
|
||||
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
|
||||
return new UrlDetails(
|
||||
item.documentId(),
|
||||
|
Loading…
Reference in New Issue
Block a user