mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(search) Refactor SearchQueryIndexService in preparation for feature extraction.
Prefer working on DecoratedSearchResultItem instead of UrlDetails.
This commit is contained in:
parent
77ccab7d80
commit
cf366c602f
@ -66,7 +66,7 @@ public class SearchOperator {
|
|||||||
|
|
||||||
logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
logger.info(queryMarker, "Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
||||||
|
|
||||||
return searchQueryService.executeQuery(ctx, processedQuery);
|
return searchQueryService.executeQuery(ctx, processedQuery.specs);
|
||||||
}
|
}
|
||||||
|
|
||||||
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) {
|
public DecoratedSearchResults doSearch(Context ctx, UserSearchParameters params) {
|
||||||
@ -76,7 +76,7 @@ public class SearchOperator {
|
|||||||
|
|
||||||
logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
logger.info(queryMarker, "Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
||||||
|
|
||||||
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery);
|
List<UrlDetails> queryResults = searchQueryService.executeQuery(ctx, processedQuery.specs);
|
||||||
|
|
||||||
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
||||||
|
|
||||||
|
@ -64,7 +64,7 @@ public class SiteListCommand implements SearchCommandInterface {
|
|||||||
int domainId = -1;
|
int domainId = -1;
|
||||||
if (null != domain) {
|
if (null != domain) {
|
||||||
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
|
var dumbQuery = queryFactory.createQuery(SearchProfile.CORPO, 100, 100, "site:"+domain);
|
||||||
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery);
|
resultSet = searchQueryIndexService.executeQuery(ctx, dumbQuery.specs);
|
||||||
var maybeId = domainQueries.tryGetDomainId(domain);
|
var maybeId = domainQueries.tryGetDomainId(domain);
|
||||||
if (maybeId.isPresent()) {
|
if (maybeId.isPresent()) {
|
||||||
domainId = maybeId.getAsInt();
|
domainId = maybeId.getAsInt();
|
||||||
|
@ -9,7 +9,6 @@ import nu.marginalia.model.crawl.HtmlFeature;
|
|||||||
|
|
||||||
import java.util.EnumSet;
|
import java.util.EnumSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.StringJoiner;
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
|
@AllArgsConstructor @NoArgsConstructor @With @Getter @ToString
|
||||||
@ -98,14 +97,6 @@ public class UrlDetails {
|
|||||||
return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
|
return getFeatureScore()*Math.sqrt(1+rankingId)/Math.max(1E-10, lengthAdjustment *(0.7+0.3*Math.exp(urlQualityAdjustment.getScore())));
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getSuperficialHash() {
|
|
||||||
return Objects.hash(url.path, title);
|
|
||||||
}
|
|
||||||
public String getSuperficialHashStr() {
|
|
||||||
return String.format("%8X", getSuperficialHash());
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public String getGeminiLink() {
|
public String getGeminiLink() {
|
||||||
return url.proto + "://" + url.domain.toString() + url.path.replace(" ", "%20").replace("\"", "%22");
|
return url.proto + "://" + url.domain.toString() + url.path.replace(" ", "%20").replace("\"", "%22");
|
||||||
}
|
}
|
||||||
@ -173,9 +164,6 @@ public class UrlDetails {
|
|||||||
}
|
}
|
||||||
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); }
|
public boolean isAds() { return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); }
|
||||||
|
|
||||||
public boolean isSpecialDomain() {
|
|
||||||
return domainState == DomainIndexingState.SPECIAL;
|
|
||||||
}
|
|
||||||
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
public int getLogRank() { return (int) Math.round(Math.min(Math.log(1+rankingId),10)); }
|
||||||
|
|
||||||
public int getMatchRank() {
|
public int getMatchRank() {
|
||||||
|
@ -3,6 +3,7 @@ package nu.marginalia.search.results;
|
|||||||
import it.unimi.dsi.fastutil.ints.Int2LongArrayMap;
|
import it.unimi.dsi.fastutil.ints.Int2LongArrayMap;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||||
|
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.index.client.model.results.SearchResultSet;
|
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
@ -18,9 +19,9 @@ public class SearchResultDecorator {
|
|||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public List<UrlDetails> getAllUrlDetails(SearchResultSet resultSet) {
|
public List<UrlDetails> getAllUrlDetails(List<DecoratedSearchResultItem> resultSet) {
|
||||||
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
|
List<UrlDetails> ret = new ArrayList<>(resultSet.size());
|
||||||
for (var detail : resultSet.results) {
|
for (var detail : resultSet) {
|
||||||
ret.add(new UrlDetails(
|
ret.add(new UrlDetails(
|
||||||
detail.documentId(),
|
detail.documentId(),
|
||||||
detail.domainId(),
|
detail.domainId(),
|
||||||
|
@ -4,11 +4,14 @@ import gnu.trove.list.TLongList;
|
|||||||
import gnu.trove.list.array.TLongArrayList;
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
|
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.search.model.UrlDetails;
|
import nu.marginalia.search.model.UrlDetails;
|
||||||
import nu.marginalia.lsh.EasyLSH;
|
import nu.marginalia.lsh.EasyLSH;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public class UrlDeduplicator {
|
public class UrlDeduplicator {
|
||||||
private final int LSH_SIMILARITY_THRESHOLD = 2;
|
private final int LSH_SIMILARITY_THRESHOLD = 2;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);
|
private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);
|
||||||
@ -22,7 +25,7 @@ public class UrlDeduplicator {
|
|||||||
this.resultsPerKey = resultsPerKey;
|
this.resultsPerKey = resultsPerKey;
|
||||||
}
|
}
|
||||||
|
|
||||||
public synchronized boolean shouldRemove(UrlDetails details) {
|
public synchronized boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||||
if (!deduplicateOnSuperficialHash(details))
|
if (!deduplicateOnSuperficialHash(details))
|
||||||
return true;
|
return true;
|
||||||
if (!deduplicateOnLSH(details))
|
if (!deduplicateOnLSH(details))
|
||||||
@ -33,11 +36,11 @@ public class UrlDeduplicator {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean deduplicateOnSuperficialHash(UrlDetails details) {
|
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
|
||||||
return seenSuperficialhashes.add(details.getSuperficialHash());
|
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean deduplicateOnLSH(UrlDetails details) {
|
private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
|
||||||
long thisHash = details.dataHash;
|
long thisHash = details.dataHash;
|
||||||
|
|
||||||
if (0 == thisHash)
|
if (0 == thisHash)
|
||||||
@ -53,16 +56,9 @@ public class UrlDeduplicator {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean limitResultsPerDomain(UrlDetails details) {
|
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
|
||||||
final var domain = details.getUrl().getDomain();
|
final var domain = details.getUrl().getDomain();
|
||||||
final String key;
|
final String key = domain.getDomainKey();
|
||||||
|
|
||||||
if (!details.isSpecialDomain()) {
|
|
||||||
key = domain.getLongDomainKey();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
key = domain.getDomainKey();
|
|
||||||
}
|
|
||||||
|
|
||||||
return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey;
|
return keyCount.adjustOrPutValue(key, 1, 1) < resultsPerKey;
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,7 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.index.client.IndexClient;
|
import nu.marginalia.index.client.IndexClient;
|
||||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
|
import nu.marginalia.index.client.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.index.client.model.results.SearchResultSet;
|
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||||
import nu.marginalia.search.model.PageScoreAdjustment;
|
import nu.marginalia.search.model.PageScoreAdjustment;
|
||||||
import nu.marginalia.search.model.UrlDetails;
|
import nu.marginalia.search.model.UrlDetails;
|
||||||
@ -42,9 +43,12 @@ public class SearchQueryIndexService {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
|
public List<UrlDetails> executeQuery(Context ctx, SearchSpecification specs) {
|
||||||
// Send the query
|
// Send the query
|
||||||
final SearchResultSet results = indexClient.query(ctx, processedQuery.specs);
|
final var queryResponse = indexClient.query(ctx, specs);
|
||||||
|
|
||||||
|
// Remove duplicates and other chaff
|
||||||
|
final var results = limitAndDeduplicateResults(specs, queryResponse.results);
|
||||||
|
|
||||||
// Update the query count (this is what you see on the front page)
|
// Update the query count (this is what you see on the front page)
|
||||||
searchVisitorCount.registerQuery();
|
searchVisitorCount.registerQuery();
|
||||||
@ -53,14 +57,14 @@ public class SearchQueryIndexService {
|
|||||||
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
|
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
|
||||||
urlDetails.sort(resultListComparator);
|
urlDetails.sort(resultListComparator);
|
||||||
|
|
||||||
return limitAndDeduplicateResults(processedQuery, urlDetails);
|
return urlDetails;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<UrlDetails> limitAndDeduplicateResults(SearchQuery processedQuery, List<UrlDetails> decoratedResults) {
|
private List<DecoratedSearchResultItem> limitAndDeduplicateResults(SearchSpecification specs, List<DecoratedSearchResultItem> decoratedResults) {
|
||||||
var limits = processedQuery.specs.queryLimits;
|
var limits = specs.queryLimits;
|
||||||
|
|
||||||
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||||
List<UrlDetails> retList = new ArrayList<>(limits.resultsTotal());
|
List<DecoratedSearchResultItem> retList = new ArrayList<>(limits.resultsTotal());
|
||||||
|
|
||||||
int dedupCount = 0;
|
int dedupCount = 0;
|
||||||
for (var item : decoratedResults) {
|
for (var item : decoratedResults) {
|
||||||
|
Loading…
Reference in New Issue
Block a user