(search) Absorb SearchQueryIndexService into SearchOperator, and clean up SearchOperator

This commit is contained in:
Viktor Lofgren 2024-08-22 11:49:29 +02:00
parent 557bdaa694
commit 2db0e446cb
2 changed files with 63 additions and 114 deletions

View File

@ -7,14 +7,19 @@ import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.math.MathClient; import nu.marginalia.api.math.MathClient;
import nu.marginalia.api.searchquery.QueryClient; import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.model.query.QueryResponse; import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.command.SearchParameters; import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails; import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.DecoratedSearchResults; import nu.marginalia.search.model.DecoratedSearchResults;
import nu.marginalia.search.model.SearchFilters; import nu.marginalia.search.model.SearchFilters;
import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchQueryIndexService; import nu.marginalia.search.results.UrlDeduplicator;
import nu.marginalia.search.svc.SearchQueryCountService;
import nu.marginalia.search.svc.SearchUnitConversionService; import nu.marginalia.search.svc.SearchUnitConversionService;
import org.apache.logging.log4j.util.Strings; import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -23,9 +28,10 @@ import org.slf4j.Marker;
import org.slf4j.MarkerFactory; import org.slf4j.MarkerFactory;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.lang.ref.WeakReference;
import java.time.Duration; import java.time.Duration;
import java.util.*; import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -41,30 +47,30 @@ public class SearchOperator {
private final MathClient mathClient; private final MathClient mathClient;
private final DbDomainQueries domainQueries; private final DbDomainQueries domainQueries;
private final QueryClient queryClient; private final QueryClient queryClient;
private final SearchQueryIndexService searchQueryService;
private final SearchQueryParamFactory paramFactory; private final SearchQueryParamFactory paramFactory;
private final WebsiteUrl websiteUrl; private final WebsiteUrl websiteUrl;
private final SearchUnitConversionService searchUnitConversionService; private final SearchUnitConversionService searchUnitConversionService;
private final SearchQueryCountService searchVisitorCount;
@Inject @Inject
public SearchOperator(MathClient mathClient, public SearchOperator(MathClient mathClient,
DbDomainQueries domainQueries, DbDomainQueries domainQueries,
QueryClient queryClient, QueryClient queryClient,
SearchQueryIndexService searchQueryService,
SearchQueryParamFactory paramFactory, SearchQueryParamFactory paramFactory,
WebsiteUrl websiteUrl, WebsiteUrl websiteUrl,
SearchUnitConversionService searchUnitConversionService) SearchUnitConversionService searchUnitConversionService,
SearchQueryCountService searchVisitorCount
)
{ {
this.mathClient = mathClient; this.mathClient = mathClient;
this.domainQueries = domainQueries; this.domainQueries = domainQueries;
this.queryClient = queryClient; this.queryClient = queryClient;
this.searchQueryService = searchQueryService;
this.paramFactory = paramFactory; this.paramFactory = paramFactory;
this.websiteUrl = websiteUrl; this.websiteUrl = websiteUrl;
this.searchUnitConversionService = searchUnitConversionService; this.searchUnitConversionService = searchUnitConversionService;
this.searchVisitorCount = searchVisitorCount;
} }
public List<UrlDetails> doSiteSearch(String domain, public List<UrlDetails> doSiteSearch(String domain,
@ -74,7 +80,7 @@ public class SearchOperator {
var queryParams = paramFactory.forSiteSearch(domain, domainId, count); var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
var queryResponse = queryClient.search(queryParams); var queryResponse = queryClient.search(queryParams);
return searchQueryService.getResultsFromQuery(queryResponse); return getResultsFromQuery(queryResponse);
} }
public List<UrlDetails> doBacklinkSearch(String domain) { public List<UrlDetails> doBacklinkSearch(String domain) {
@ -82,63 +88,35 @@ public class SearchOperator {
var queryParams = paramFactory.forBacklinkSearch(domain); var queryParams = paramFactory.forBacklinkSearch(domain);
var queryResponse = queryClient.search(queryParams); var queryResponse = queryClient.search(queryParams);
return searchQueryService.getResultsFromQuery(queryResponse); return getResultsFromQuery(queryResponse);
} }
public List<UrlDetails> doLinkSearch(String source, String dest) { public List<UrlDetails> doLinkSearch(String source, String dest) {
var queryParams = paramFactory.forLinkSearch(source, dest); var queryParams = paramFactory.forLinkSearch(source, dest);
var queryResponse = queryClient.search(queryParams); var queryResponse = queryClient.search(queryParams);
return searchQueryService.getResultsFromQuery(queryResponse); return getResultsFromQuery(queryResponse);
} }
private volatile WeakReference<List<ClusteredUrlDetails>> oldResults = new WeakReference<>(Collections.emptyList());
public DecoratedSearchResults doSearch(SearchParameters userParams) { public DecoratedSearchResults doSearch(SearchParameters userParams) {
Future<String> eval = searchUnitConversionService.tryEval(userParams.query()); Future<String> eval = searchUnitConversionService.tryEval(userParams.query());
List<ClusteredUrlDetails> clusteredResults; var queryParams = paramFactory.forRegularSearch(userParams);
QueryResponse queryResponse; QueryResponse queryResponse = queryClient.search(queryParams);
List<String> problems; var queryResults = getResultsFromQuery(queryResponse);
String evalResult;
String focusDomain;
if (userParams.poisonResults() && Math.random() > 0.1) { logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
// For botnet users, we return random old query results. This is to make String evalResult = getFutureOrDefault(eval, "");
// it harder for them to figure out if they are being rate limited.
clusteredResults = new ArrayList<>(Objects.requireNonNullElse(oldResults.get(), List.of())); List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
.selectStrategy(queryResponse)
.clusterResults(queryResults, 25);
// Shuffle the results to make it harder to distinguish String focusDomain = queryResponse.domain();
Collections.shuffle(clusteredResults); List<String> problems = getProblems(evalResult, queryResults, queryResponse);
problems = List.of();
evalResult = "";
focusDomain = "";
} else {
var queryParams = paramFactory.forRegularSearch(userParams);
queryResponse = queryClient.search(queryParams);
var queryResults = searchQueryService.getResultsFromQuery(queryResponse);
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
evalResult = getFutureOrDefault(eval, "");
clusteredResults = SearchResultClusterer
.selectStrategy(queryResponse)
.clusterResults(queryResults, 25);
focusDomain = queryResponse.domain();
problems = getProblems(evalResult, queryResults, queryResponse);
if (userParams.poisonResults()) {
// Save the results to feed to the botnet
oldResults = new WeakReference<>(clusteredResults);
}
}
return DecoratedSearchResults.builder() return DecoratedSearchResults.builder()
.params(userParams) .params(userParams)
@ -151,6 +129,41 @@ public class SearchOperator {
.build(); .build();
} }
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
final QueryLimits limits = queryResponse.specs().queryLimits;
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
// Update the query count (this is what you see on the front page)
searchVisitorCount.registerQuery();
return queryResponse.results().stream()
.filter(deduplicator::shouldRetain)
.limit(limits.resultsTotal())
.map(SearchOperator::createDetails)
.toList();
}
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
return new UrlDetails(
item.documentId(),
item.domainId(),
item.url,
item.title,
item.description,
item.format,
item.features,
DomainIndexingState.ACTIVE,
item.rankingScore, // termScore
item.resultsFromDomain,
BrailleBlockPunchCards.printBits(item.bestPositions, 64),
Long.bitCount(item.bestPositions),
item.rawIndexResult,
item.rawIndexResult.keywordScores
);
}
private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) { private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) {
if (fut == null || fut.isCancelled()) { if (fut == null || fut.isCancelled()) {
return defaultValue; return defaultValue;
@ -214,6 +227,4 @@ public class SearchOperator {
return STR."\"\{term}\" could be spelled \{suggestionsStr}"; return STR."\"\{term}\" could be spelled \{suggestionsStr}";
} }
} }

View File

@ -1,62 +0,0 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.UrlDeduplicator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.util.List;
@Singleton
public class SearchQueryIndexService {
private final SearchQueryCountService searchVisitorCount;
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public SearchQueryIndexService(SearchQueryCountService searchVisitorCount) {
this.searchVisitorCount = searchVisitorCount;
}
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
final QueryLimits limits = queryResponse.specs().queryLimits;
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
// Update the query count (this is what you see on the front page)
searchVisitorCount.registerQuery();
return queryResponse.results().stream()
.filter(deduplicator::shouldRetain)
.limit(limits.resultsTotal())
.map(SearchQueryIndexService::createDetails)
.toList();
}
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
return new UrlDetails(
item.documentId(),
item.domainId(),
item.url,
item.title,
item.description,
item.format,
item.features,
DomainIndexingState.ACTIVE,
item.rankingScore, // termScore
item.resultsFromDomain,
BrailleBlockPunchCards.printBits(item.bestPositions, 64),
Long.bitCount(item.bestPositions),
item.rawIndexResult,
item.rawIndexResult.keywordScores
);
}
}