diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java index 41e152e6..2999b66d 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java @@ -9,11 +9,10 @@ import nu.marginalia.model.crawl.HtmlFeature; import java.util.ArrayList; import java.util.List; -import java.util.StringJoiner; /** A class to hold details about a single search result. */ @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString -public class UrlDetails implements Comparable { +public class UrlDetails { public long id; public int domainId; @@ -63,12 +62,6 @@ public class UrlDetails implements Comparable { return Long.hashCode(id); } - @Override - public int compareTo(UrlDetails other) { - int result = Double.compare(getTermScore(), other.getTermScore()); - if (result == 0) result = Long.compare(getId(), other.getId()); - return result; - } public boolean equals(Object other) { if (other == null) { diff --git a/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java b/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java index ccddb8d9..046b779e 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java +++ b/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java @@ -24,7 +24,7 @@ public class UrlDeduplicator { this.resultsPerKey = resultsPerKey; } - public synchronized boolean shouldRemove(DecoratedSearchResultItem details) { + public boolean shouldRemove(DecoratedSearchResultItem details) { if (!deduplicateOnSuperficialHash(details)) return true; if (!deduplicateOnLSH(details)) @@ -35,6 +35,10 @@ public class UrlDeduplicator { return false; } + public boolean shouldRetain(DecoratedSearchResultItem details) { + return !shouldRemove(details); + } + private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) { return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title)); } diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java index d5813549..39619fdf 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -2,11 +2,10 @@ package nu.marginalia.search.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import lombok.SneakyThrows; import nu.marginalia.api.searchquery.model.query.QueryResponse; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.bbpc.BrailleBlockPunchCards; +import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.results.UrlDeduplicator; @@ -15,8 +14,6 @@ import org.slf4j.LoggerFactory; import org.slf4j.Marker; import org.slf4j.MarkerFactory; -import java.util.ArrayList; -import java.util.Comparator; import java.util.List; @Singleton @@ -31,75 +28,35 @@ public class SearchQueryIndexService { } public List getResultsFromQuery(QueryResponse queryResponse) { - // Remove duplicates and other chaff - final var results = limitAndDeduplicateResults(queryResponse.specs(), queryResponse.results()); + final QueryLimits limits = queryResponse.specs().queryLimits; + final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); // Update the query count (this is what you see on the front page) searchVisitorCount.registerQuery(); - // Decorate and sort the results - List urlDetails = getAllUrlDetails(results); - - urlDetails.sort(Comparator.naturalOrder()); - - return urlDetails; + return queryResponse.results().stream() + .filter(deduplicator::shouldRetain) + .limit(limits.resultsTotal()) + .map(SearchQueryIndexService::createDetails) + .toList(); } - private List limitAndDeduplicateResults(SearchSpecification specs, List decoratedResults) { - var limits = specs.queryLimits; - - UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); - List retList = new ArrayList<>(limits.resultsTotal()); - - int dedupCount = 0; - for (var item : decoratedResults) { - if (retList.size() >= limits.resultsTotal()) - break; - - if (!deduplicator.shouldRemove(item)) { - retList.add(item); - } - else { - dedupCount ++; - } - } - - if (dedupCount > 0) { - logger.info(queryMarker, "Deduplicator ate {} results", dedupCount); - } - - return retList; - } - - - @SneakyThrows - public List getAllUrlDetails(List resultSet) { - List ret = new ArrayList<>(resultSet.size()); - - for (var detail : resultSet) { - ret.add(new UrlDetails( - detail.documentId(), - detail.domainId(), - detail.url, - detail.title, - detail.description, - detail.format, - detail.features, - DomainIndexingState.ACTIVE, - detail.rankingScore, // termScore - detail.resultsFromDomain, - getPositionsString(detail), - Long.bitCount(detail.bestPositions), - detail.rawIndexResult, - detail.rawIndexResult.keywordScores - )); - } - - return ret; - } - - private String getPositionsString(DecoratedSearchResultItem resultItem) { - return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 64); - + private static UrlDetails createDetails(DecoratedSearchResultItem item) { + return new UrlDetails( + item.documentId(), + item.domainId(), + item.url, + item.title, + item.description, + item.format, + item.features, + DomainIndexingState.ACTIVE, + item.rankingScore, // termScore + item.resultsFromDomain, + BrailleBlockPunchCards.printBits(item.bestPositions, 64), + Long.bitCount(item.bestPositions), + item.rawIndexResult, + item.rawIndexResult.keywordScores + ); } }