mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(search) Sort and deduplicate search results for better relevance.
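Adds a custom sort order that is applied before deduplication and prefers HTTPS over HTTP, and domain-based URLs over raw IPs. This makes it more likely that the "bad" duplicate is the one that gets discarded. The surviving results are then re-sorted into the original presentation order before the result limit is applied, so user-facing ordering is unchanged.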
This commit is contained in:
parent
2a173e2861
commit
e4769f541d
@ -161,7 +161,9 @@ public class SearchOperator {
|
|||||||
searchVisitorCount.registerQuery();
|
searchVisitorCount.registerQuery();
|
||||||
|
|
||||||
List<UrlDetails> details = queryResponse.results().stream()
|
List<UrlDetails> details = queryResponse.results().stream()
|
||||||
|
.sorted(this::retentionSortOrder) // Sort in an order that makes us more likely to discard the "bad" duplicates
|
||||||
.filter(deduplicator::shouldRetain)
|
.filter(deduplicator::shouldRetain)
|
||||||
|
.sorted() // Return to the presentation sort order before limiting so we don't throw out good results over schema and "ip-ness"
|
||||||
.limit(limits.resultsTotal())
|
.limit(limits.resultsTotal())
|
||||||
.map(SearchOperator::createDetails)
|
.map(SearchOperator::createDetails)
|
||||||
.toList();
|
.toList();
|
||||||
@ -177,6 +179,29 @@ public class SearchOperator {
|
|||||||
return new SimpleSearchResults(details, pages);
|
return new SimpleSearchResults(details, pages);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** A sorting order that makes us more likely to discard the "bad apple", when deduplicating.
|
||||||
|
* Sometimes the search engine has found the same content via different access routes to the same server,
|
||||||
|
* this may be raw IP access, or http access. Try to weed these out by sorting in a way that prefers
|
||||||
|
* https over http, and domains that don't look like IPs to those that do
|
||||||
|
*/
|
||||||
|
private int retentionSortOrder(DecoratedSearchResultItem a, DecoratedSearchResultItem b) {
|
||||||
|
|
||||||
|
// Note we reverse the order of a and b below, to prefer items with https over not
|
||||||
|
int schemaDiff = Boolean.compare("https".equalsIgnoreCase(b.url.proto), "https".equalsIgnoreCase(a.url.proto));
|
||||||
|
if (schemaDiff != 0)
|
||||||
|
return schemaDiff;
|
||||||
|
|
||||||
|
// Prefer documents accessed via a domain name over those from a raw IP;
|
||||||
|
// this is a somewhat rough heuristic to only look at the first digit, but
|
||||||
|
// we don't want to spend a lot of CPU on this so it's good enough for 99.9% of cases
|
||||||
|
|
||||||
|
int isLikelyIPDiff = Boolean.compare(Character.isDigit(a.url.domain.topDomain.charAt(0)), Character.isDigit(b.url.domain.topDomain.charAt(0)));
|
||||||
|
if (isLikelyIPDiff != 0)
|
||||||
|
return isLikelyIPDiff;
|
||||||
|
|
||||||
|
return Double.compare(a.rankingScore, b.rankingScore);
|
||||||
|
}
|
||||||
|
|
||||||
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
|
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
|
||||||
return new UrlDetails(
|
return new UrlDetails(
|
||||||
item.documentId(),
|
item.documentId(),
|
||||||
|
Loading…
Reference in New Issue
Block a user