mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(search) Clean up SearchQueryIndexService and surrounding code
This commit is contained in:
parent
9eb1f120fc
commit
557bdaa694
@ -9,11 +9,10 @@ import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
/** A class to hold details about a single search result. */
|
||||
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
|
||||
public class UrlDetails implements Comparable<UrlDetails> {
|
||||
public class UrlDetails {
|
||||
public long id;
|
||||
public int domainId;
|
||||
|
||||
@ -63,12 +62,6 @@ public class UrlDetails implements Comparable<UrlDetails> {
|
||||
return Long.hashCode(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(UrlDetails other) {
|
||||
int result = Double.compare(getTermScore(), other.getTermScore());
|
||||
if (result == 0) result = Long.compare(getId(), other.getId());
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == null) {
|
||||
|
@ -24,7 +24,7 @@ public class UrlDeduplicator {
|
||||
this.resultsPerKey = resultsPerKey;
|
||||
}
|
||||
|
||||
public synchronized boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||
public boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||
if (!deduplicateOnSuperficialHash(details))
|
||||
return true;
|
||||
if (!deduplicateOnLSH(details))
|
||||
@ -35,6 +35,10 @@ public class UrlDeduplicator {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean shouldRetain(DecoratedSearchResultItem details) {
|
||||
return !shouldRemove(details);
|
||||
}
|
||||
|
||||
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
|
||||
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
|
||||
}
|
||||
|
@ -2,11 +2,10 @@ package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.results.UrlDeduplicator;
|
||||
@ -15,8 +14,6 @@ import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
@ -31,75 +28,35 @@ public class SearchQueryIndexService {
|
||||
}
|
||||
|
||||
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
|
||||
// Remove duplicates and other chaff
|
||||
final var results = limitAndDeduplicateResults(queryResponse.specs(), queryResponse.results());
|
||||
final QueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
|
||||
// Update the query count (this is what you see on the front page)
|
||||
searchVisitorCount.registerQuery();
|
||||
|
||||
// Decorate and sort the results
|
||||
List<UrlDetails> urlDetails = getAllUrlDetails(results);
|
||||
|
||||
urlDetails.sort(Comparator.naturalOrder());
|
||||
|
||||
return urlDetails;
|
||||
return queryResponse.results().stream()
|
||||
.filter(deduplicator::shouldRetain)
|
||||
.limit(limits.resultsTotal())
|
||||
.map(SearchQueryIndexService::createDetails)
|
||||
.toList();
|
||||
}
|
||||
|
||||
private List<DecoratedSearchResultItem> limitAndDeduplicateResults(SearchSpecification specs, List<DecoratedSearchResultItem> decoratedResults) {
|
||||
var limits = specs.queryLimits;
|
||||
|
||||
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
List<DecoratedSearchResultItem> retList = new ArrayList<>(limits.resultsTotal());
|
||||
|
||||
int dedupCount = 0;
|
||||
for (var item : decoratedResults) {
|
||||
if (retList.size() >= limits.resultsTotal())
|
||||
break;
|
||||
|
||||
if (!deduplicator.shouldRemove(item)) {
|
||||
retList.add(item);
|
||||
}
|
||||
else {
|
||||
dedupCount ++;
|
||||
}
|
||||
}
|
||||
|
||||
if (dedupCount > 0) {
|
||||
logger.info(queryMarker, "Deduplicator ate {} results", dedupCount);
|
||||
}
|
||||
|
||||
return retList;
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
|
||||
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
|
||||
|
||||
for (var detail : resultSet) {
|
||||
ret.add(new UrlDetails(
|
||||
detail.documentId(),
|
||||
detail.domainId(),
|
||||
detail.url,
|
||||
detail.title,
|
||||
detail.description,
|
||||
detail.format,
|
||||
detail.features,
|
||||
DomainIndexingState.ACTIVE,
|
||||
detail.rankingScore, // termScore
|
||||
detail.resultsFromDomain,
|
||||
getPositionsString(detail),
|
||||
Long.bitCount(detail.bestPositions),
|
||||
detail.rawIndexResult,
|
||||
detail.rawIndexResult.keywordScores
|
||||
));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private String getPositionsString(DecoratedSearchResultItem resultItem) {
|
||||
return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 64);
|
||||
|
||||
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
|
||||
return new UrlDetails(
|
||||
item.documentId(),
|
||||
item.domainId(),
|
||||
item.url,
|
||||
item.title,
|
||||
item.description,
|
||||
item.format,
|
||||
item.features,
|
||||
DomainIndexingState.ACTIVE,
|
||||
item.rankingScore, // termScore
|
||||
item.resultsFromDomain,
|
||||
BrailleBlockPunchCards.printBits(item.bestPositions, 64),
|
||||
Long.bitCount(item.bestPositions),
|
||||
item.rawIndexResult,
|
||||
item.rawIndexResult.keywordScores
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user