(search) Clean up SearchQueryIndexService and surrounding code

This commit is contained in:
Viktor Lofgren 2024-08-22 11:44:57 +02:00
parent 9eb1f120fc
commit 557bdaa694
3 changed files with 31 additions and 77 deletions

View File

@ -9,11 +9,10 @@ import nu.marginalia.model.crawl.HtmlFeature;
import java.util.ArrayList;
import java.util.List;
import java.util.StringJoiner;
/** A class to hold details about a single search result. */
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
public class UrlDetails implements Comparable<UrlDetails> {
public class UrlDetails {
public long id;
public int domainId;
@ -63,12 +62,6 @@ public class UrlDetails implements Comparable<UrlDetails> {
return Long.hashCode(id);
}
@Override
public int compareTo(UrlDetails other) {
int result = Double.compare(getTermScore(), other.getTermScore());
if (result == 0) result = Long.compare(getId(), other.getId());
return result;
}
public boolean equals(Object other) {
if (other == null) {

View File

@ -24,7 +24,7 @@ public class UrlDeduplicator {
this.resultsPerKey = resultsPerKey;
}
public synchronized boolean shouldRemove(DecoratedSearchResultItem details) {
public boolean shouldRemove(DecoratedSearchResultItem details) {
if (!deduplicateOnSuperficialHash(details))
return true;
if (!deduplicateOnLSH(details))
@ -35,6 +35,10 @@ public class UrlDeduplicator {
return false;
}
public boolean shouldRetain(DecoratedSearchResultItem details) {
return !shouldRemove(details);
}
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
}

View File

@ -2,11 +2,10 @@ package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.UrlDeduplicator;
@ -15,8 +14,6 @@ import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@Singleton
@ -31,75 +28,35 @@ public class SearchQueryIndexService {
}
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
// Remove duplicates and other chaff
final var results = limitAndDeduplicateResults(queryResponse.specs(), queryResponse.results());
final QueryLimits limits = queryResponse.specs().queryLimits;
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
// Update the query count (this is what you see on the front page)
searchVisitorCount.registerQuery();
// Decorate and sort the results
List<UrlDetails> urlDetails = getAllUrlDetails(results);
urlDetails.sort(Comparator.naturalOrder());
return urlDetails;
return queryResponse.results().stream()
.filter(deduplicator::shouldRetain)
.limit(limits.resultsTotal())
.map(SearchQueryIndexService::createDetails)
.toList();
}
private List<DecoratedSearchResultItem> limitAndDeduplicateResults(SearchSpecification specs, List<DecoratedSearchResultItem> decoratedResults) {
var limits = specs.queryLimits;
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
List<DecoratedSearchResultItem> retList = new ArrayList<>(limits.resultsTotal());
int dedupCount = 0;
for (var item : decoratedResults) {
if (retList.size() >= limits.resultsTotal())
break;
if (!deduplicator.shouldRemove(item)) {
retList.add(item);
}
else {
dedupCount ++;
}
}
if (dedupCount > 0) {
logger.info(queryMarker, "Deduplicator ate {} results", dedupCount);
}
return retList;
}
@SneakyThrows
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
for (var detail : resultSet) {
ret.add(new UrlDetails(
detail.documentId(),
detail.domainId(),
detail.url,
detail.title,
detail.description,
detail.format,
detail.features,
DomainIndexingState.ACTIVE,
detail.rankingScore, // termScore
detail.resultsFromDomain,
getPositionsString(detail),
Long.bitCount(detail.bestPositions),
detail.rawIndexResult,
detail.rawIndexResult.keywordScores
));
}
return ret;
}
private String getPositionsString(DecoratedSearchResultItem resultItem) {
return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 64);
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
return new UrlDetails(
item.documentId(),
item.domainId(),
item.url,
item.title,
item.description,
item.format,
item.features,
DomainIndexingState.ACTIVE,
item.rankingScore, // termScore
item.resultsFromDomain,
BrailleBlockPunchCards.printBits(item.bestPositions, 64),
Long.bitCount(item.bestPositions),
item.rawIndexResult,
item.rawIndexResult.keywordScores
);
}
}