mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Clean up search-service and index-api
This commit is contained in:
parent
c2f9980eba
commit
73e412ea5b
@ -7,9 +7,9 @@ import io.reactivex.rxjava3.core.Observable;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.client.AbstractDynamicClient;
|
||||
import nu.marginalia.client.Context;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultSet;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.service.descriptor.ServiceDescriptors;
|
||||
import nu.marginalia.service.id.ServiceId;
|
||||
@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
|
||||
public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
|
||||
return wmsa_search_index_api_time.time(
|
||||
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
|
||||
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1,13 +0,0 @@
|
||||
package nu.marginalia.index.client.model.domain;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.id.EdgeIdList;
|
||||
|
||||
@AllArgsConstructor @Getter @ToString
|
||||
public class EdgeDomainSearchResults {
|
||||
public final String keyword;
|
||||
public final EdgeIdList<EdgeUrl> results;
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
package nu.marginalia.index.client.model.domain;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.ToString;
|
||||
|
||||
@ToString @AllArgsConstructor
|
||||
public class EdgeDomainSearchSpecification {
|
||||
|
||||
public final String keyword;
|
||||
|
||||
public final int queryDepth;
|
||||
public final int minHitCount;
|
||||
public final int maxResults;
|
||||
}
|
@ -8,9 +8,12 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import java.util.List;
|
||||
|
||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||
public class EdgeSearchSpecification {
|
||||
public List<EdgeSearchSubquery> subqueries;
|
||||
public class SearchSpecification {
|
||||
public List<SearchSubquery> subqueries;
|
||||
|
||||
/** If present and not empty, limit the search to these domain IDs */
|
||||
public List<Integer> domains;
|
||||
|
||||
public SearchSetIdentifier searchSetIdentifier;
|
||||
|
||||
public final String humanQuery;
|
||||
@ -21,6 +24,7 @@ public class EdgeSearchSpecification {
|
||||
public final SpecificationLimit rank;
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
|
||||
public final QueryStrategy queryStrategy;
|
||||
|
||||
}
|
@ -2,23 +2,29 @@ package nu.marginalia.index.client.model.query;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
public class EdgeSearchSubquery {
|
||||
public class SearchSubquery {
|
||||
|
||||
/** These terms must be present in the document */
|
||||
public final List<String> searchTermsInclude;
|
||||
|
||||
/** These terms must be absent from the document */
|
||||
public final List<String> searchTermsExclude;
|
||||
|
||||
/** These terms must be present in the document, but are not used in ranking */
|
||||
public final List<String> searchTermsAdvice;
|
||||
|
||||
/** If these optional terms are present in the document, rank it highly */
|
||||
public final List<String> searchTermsPriority;
|
||||
|
||||
private double value = 0;
|
||||
|
||||
public EdgeSearchSubquery(List<String> searchTermsInclude,
|
||||
public SearchSubquery(List<String> searchTermsInclude,
|
||||
List<String> searchTermsExclude,
|
||||
List<String> searchTermsAdvice,
|
||||
List<String> searchTermsPriority
|
||||
@ -29,7 +35,7 @@ public class EdgeSearchSubquery {
|
||||
this.searchTermsPriority = searchTermsPriority;
|
||||
}
|
||||
|
||||
public EdgeSearchSubquery setValue(double value) {
|
||||
public SearchSubquery setValue(double value) {
|
||||
if (Double.isInfinite(value) || Double.isNaN(value)) {
|
||||
this.value = Double.MAX_VALUE;
|
||||
} else {
|
@ -1,84 +0,0 @@
|
||||
package nu.marginalia.index.client.model.results;
|
||||
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
|
||||
import static java.lang.Integer.lowestOneBit;
|
||||
import static java.lang.Integer.numberOfTrailingZeros;
|
||||
|
||||
public record EdgeSearchResultKeywordScore(int set,
|
||||
String keyword,
|
||||
long encodedWordMetadata,
|
||||
long encodedDocMetadata,
|
||||
boolean hasPriorityTerms) {
|
||||
public double documentValue() {
|
||||
long sum = 0;
|
||||
|
||||
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
|
||||
|
||||
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
|
||||
|
||||
if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
|
||||
sum += 20;
|
||||
}
|
||||
|
||||
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
|
||||
if (rank < 0)
|
||||
sum += rank / 2;
|
||||
else
|
||||
sum += rank / 4;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
private boolean hasTermFlag(EdgePageWordFlags flag) {
|
||||
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
|
||||
}
|
||||
|
||||
public double termValue() {
|
||||
double sum = 0;
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.Title)) {
|
||||
sum -= 15;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.Site)) {
|
||||
sum -= 10;
|
||||
}
|
||||
else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.Subjects)) {
|
||||
sum -= 10;
|
||||
}
|
||||
if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
|
||||
sum -= 1;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
||||
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
||||
|
||||
sum -= tfIdf / 10.;
|
||||
sum -= Integer.bitCount(positionBits) / 3.;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public int positions() { return WordMetadata.decodePositions(encodedWordMetadata); }
|
||||
public boolean isSpecial() { return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic); }
|
||||
public boolean isRegular() {
|
||||
return !keyword.contains(":")
|
||||
&& !hasTermFlag(EdgePageWordFlags.Synthetic);
|
||||
}
|
||||
}
|
@ -1,26 +0,0 @@
|
||||
package nu.marginalia.index.client.model.results;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@AllArgsConstructor @Getter @ToString
|
||||
public class EdgeSearchResults {
|
||||
public final List<EdgeSearchResultItem> results;
|
||||
|
||||
public EdgeSearchResults() {
|
||||
results = new ArrayList<>();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return results.size();
|
||||
}
|
||||
|
||||
public Stream<EdgeSearchResultItem> stream() {
|
||||
return results.stream();
|
||||
}
|
||||
}
|
@ -8,15 +8,19 @@ import nu.marginalia.model.id.EdgeId;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** Represents a document matching a search query */
|
||||
@AllArgsConstructor @Getter
|
||||
public class EdgeSearchResultItem {
|
||||
public class SearchResultItem {
|
||||
/** Encoded ID that contains both the URL id and its ranking */
|
||||
public final long combinedId;
|
||||
|
||||
public final List<EdgeSearchResultKeywordScore> scores;
|
||||
/** How did the subqueries match against the document ? */
|
||||
public final List<SearchResultKeywordScore> scores;
|
||||
|
||||
/** How many other potential results existed in the same domain */
|
||||
public int resultsFromDomain;
|
||||
|
||||
public EdgeSearchResultItem(long val) {
|
||||
public SearchResultItem(long val) {
|
||||
this.combinedId = val;
|
||||
this.scores = new ArrayList<>(16);
|
||||
}
|
||||
@ -62,7 +66,7 @@ public class EdgeSearchResultItem {
|
||||
return false;
|
||||
if (other == this)
|
||||
return true;
|
||||
if (other instanceof EdgeSearchResultItem o) {
|
||||
if (other instanceof SearchResultItem o) {
|
||||
return o.getUrlIdInt() == getUrlIdInt();
|
||||
}
|
||||
return false;
|
@ -0,0 +1,145 @@
|
||||
package nu.marginalia.index.client.model.results;
|
||||
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public final class SearchResultKeywordScore {
|
||||
public final int subquery;
|
||||
public final String keyword;
|
||||
private final long encodedWordMetadata;
|
||||
private final long encodedDocMetadata;
|
||||
private final boolean hasPriorityTerms;
|
||||
|
||||
public SearchResultKeywordScore(int subquery,
|
||||
String keyword,
|
||||
long encodedWordMetadata,
|
||||
long encodedDocMetadata,
|
||||
boolean hasPriorityTerms) {
|
||||
this.subquery = subquery;
|
||||
this.keyword = keyword;
|
||||
this.encodedWordMetadata = encodedWordMetadata;
|
||||
this.encodedDocMetadata = encodedDocMetadata;
|
||||
this.hasPriorityTerms = hasPriorityTerms;
|
||||
}
|
||||
|
||||
private boolean hasTermFlag(EdgePageWordFlags flag) {
|
||||
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
|
||||
}
|
||||
|
||||
public double documentValue() {
|
||||
long sum = 0;
|
||||
|
||||
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
|
||||
|
||||
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
|
||||
|
||||
if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
|
||||
sum += 20;
|
||||
}
|
||||
|
||||
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
|
||||
if (rank < 0)
|
||||
sum += rank / 2;
|
||||
else
|
||||
sum += rank / 4;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public double termValue() {
|
||||
double sum = 0;
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.Title)) {
|
||||
sum -= 15;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.Site)) {
|
||||
sum -= 10;
|
||||
} else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.Subjects)) {
|
||||
sum -= 10;
|
||||
}
|
||||
if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
|
||||
sum -= 1;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
|
||||
sum -= 5;
|
||||
}
|
||||
|
||||
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
||||
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
||||
|
||||
sum -= tfIdf / 10.;
|
||||
sum -= Integer.bitCount(positionBits) / 3.;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public int subquery() {
|
||||
return subquery;
|
||||
}
|
||||
public int positions() {
|
||||
return WordMetadata.decodePositions(encodedWordMetadata);
|
||||
}
|
||||
|
||||
public boolean isKeywordSpecial() {
|
||||
return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
|
||||
}
|
||||
|
||||
public boolean isKeywordRegular() {
|
||||
return !keyword.contains(":")
|
||||
&& !hasTermFlag(EdgePageWordFlags.Synthetic);
|
||||
}
|
||||
|
||||
public long encodedWordMetadata() {
|
||||
return encodedWordMetadata;
|
||||
}
|
||||
|
||||
public long encodedDocMetadata() {
|
||||
return encodedDocMetadata;
|
||||
}
|
||||
|
||||
public boolean hasPriorityTerms() {
|
||||
return hasPriorityTerms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == this) return true;
|
||||
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||
var that = (SearchResultKeywordScore) obj;
|
||||
return this.subquery == that.subquery &&
|
||||
Objects.equals(this.keyword, that.keyword) &&
|
||||
this.encodedWordMetadata == that.encodedWordMetadata &&
|
||||
this.encodedDocMetadata == that.encodedDocMetadata &&
|
||||
this.hasPriorityTerms == that.hasPriorityTerms;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata, hasPriorityTerms);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SearchResultKeywordScore[" +
|
||||
"set=" + subquery + ", " +
|
||||
"keyword=" + keyword + ", " +
|
||||
"encodedWordMetadata=" + encodedWordMetadata + ", " +
|
||||
"encodedDocMetadata=" + encodedDocMetadata + ", " +
|
||||
"hasPriorityTerms=" + hasPriorityTerms + ']';
|
||||
}
|
||||
|
||||
}
|
@ -7,8 +7,8 @@ import lombok.ToString;
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor @Getter @ToString
|
||||
public class EdgeSearchResultSet {
|
||||
public List<EdgeSearchResultItem> results;
|
||||
public class SearchResultSet {
|
||||
public List<SearchResultItem> results;
|
||||
|
||||
public int size() {
|
||||
return results.size();
|
@ -2,7 +2,7 @@ package nu.marginalia.index.results;
|
||||
|
||||
import gnu.trove.map.TLongIntMap;
|
||||
import gnu.trove.map.hash.TLongIntHashMap;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
|
||||
public class IndexResultDomainDeduplicator {
|
||||
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
|
||||
@ -21,7 +21,7 @@ public class IndexResultDomainDeduplicator {
|
||||
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
|
||||
}
|
||||
|
||||
public boolean test(EdgeSearchResultItem item) {
|
||||
public boolean test(SearchResultItem item) {
|
||||
final long key = item.deduplicationKey();
|
||||
if (key == 0)
|
||||
return true;
|
||||
@ -29,7 +29,7 @@ public class IndexResultDomainDeduplicator {
|
||||
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
|
||||
}
|
||||
|
||||
public int getCount(EdgeSearchResultItem item) {
|
||||
public int getCount(SearchResultItem item) {
|
||||
final long key = item.deduplicationKey();
|
||||
if (key == 0)
|
||||
return 1;
|
||||
|
@ -9,9 +9,9 @@ import nu.marginalia.index.svc.SearchTermsService;
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.query.IndexQueryParams;
|
||||
|
||||
import java.util.List;
|
||||
@ -31,7 +31,7 @@ public class IndexResultValuator {
|
||||
public IndexResultValuator(SearchTermsService searchTermsSvc,
|
||||
IndexMetadataService metadataService,
|
||||
TLongList results,
|
||||
List<EdgeSearchSubquery> subqueries,
|
||||
List<SearchSubquery> subqueries,
|
||||
IndexQueryParams queryParams) {
|
||||
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
||||
this.queryParams = queryParams;
|
||||
@ -71,9 +71,9 @@ public class IndexResultValuator {
|
||||
|
||||
}
|
||||
|
||||
public EdgeSearchResultItem evaluateResult(long id) {
|
||||
public SearchResultItem evaluateResult(long id) {
|
||||
|
||||
EdgeSearchResultItem searchResult = new EdgeSearchResultItem(id);
|
||||
SearchResultItem searchResult = new SearchResultItem(id);
|
||||
final long urlIdInt = searchResult.getUrlIdInt();
|
||||
|
||||
searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
|
||||
@ -99,7 +99,7 @@ public class IndexResultValuator {
|
||||
return searchResult;
|
||||
}
|
||||
|
||||
private double evaluateSubquery(EdgeSearchResultItem searchResult,
|
||||
private double evaluateSubquery(SearchResultItem searchResult,
|
||||
long docMetadata,
|
||||
int querySetId,
|
||||
List<String> termList)
|
||||
@ -114,7 +114,7 @@ public class IndexResultValuator {
|
||||
|
||||
long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt());
|
||||
|
||||
EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(
|
||||
SearchResultKeywordScore score = new SearchResultKeywordScore(
|
||||
querySetId,
|
||||
searchTerm,
|
||||
metadata,
|
||||
|
@ -9,10 +9,10 @@ import gnu.trove.set.hash.TLongHashSet;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Gauge;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultSet;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.index.index.SearchIndex;
|
||||
import nu.marginalia.index.index.SearchIndexSearchTerms;
|
||||
@ -73,13 +73,13 @@ public class IndexQueryService {
|
||||
|
||||
public Object search(Request request, Response response) {
|
||||
String json = request.body();
|
||||
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
||||
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
|
||||
|
||||
try {
|
||||
return wmsa_edge_index_query_time.time(() -> {
|
||||
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
|
||||
|
||||
List<EdgeSearchResultItem> results = executeSearch(params);
|
||||
List<SearchResultItem> results = executeSearch(params);
|
||||
logger.info(queryMarker, "Index Result Count: {}", results.size());
|
||||
|
||||
wmsa_edge_index_query_cost.set(params.getDataCost());
|
||||
@ -87,7 +87,7 @@ public class IndexQueryService {
|
||||
wmsa_edge_index_query_timeouts.inc();
|
||||
}
|
||||
|
||||
return new EdgeSearchResultSet(results);
|
||||
return new SearchResultSet(results);
|
||||
});
|
||||
}
|
||||
catch (HaltException ex) {
|
||||
@ -103,11 +103,11 @@ public class IndexQueryService {
|
||||
}
|
||||
|
||||
// exists for test access
|
||||
EdgeSearchResultSet justQuery(EdgeSearchSpecification specsSet) {
|
||||
return new EdgeSearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
|
||||
SearchResultSet justQuery(SearchSpecification specsSet) {
|
||||
return new SearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
|
||||
}
|
||||
|
||||
private SearchSet getSearchSet(EdgeSearchSpecification specsSet) {
|
||||
private SearchSet getSearchSet(SearchSpecification specsSet) {
|
||||
if (specsSet.domains != null && !specsSet.domains.isEmpty()) {
|
||||
return new SmallSearchSet(specsSet.domains);
|
||||
}
|
||||
@ -115,7 +115,7 @@ public class IndexQueryService {
|
||||
return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier);
|
||||
}
|
||||
|
||||
private List<EdgeSearchResultItem> executeSearch(SearchParameters params) {
|
||||
private List<SearchResultItem> executeSearch(SearchParameters params) {
|
||||
var resultIds = evaluateSubqueries(params);
|
||||
|
||||
var resultItems = calculateResultScores(params, resultIds);
|
||||
@ -176,7 +176,7 @@ public class IndexQueryService {
|
||||
return results;
|
||||
}
|
||||
|
||||
private ArrayList<EdgeSearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
|
||||
private ArrayList<SearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
|
||||
|
||||
final var evaluator = new IndexResultValuator(
|
||||
searchTermsSvc,
|
||||
@ -185,7 +185,7 @@ public class IndexQueryService {
|
||||
params.subqueries,
|
||||
params.queryParams);
|
||||
|
||||
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
|
||||
ArrayList<SearchResultItem> items = new ArrayList<>(results.size());
|
||||
|
||||
// Sorting the result ids results in better paging characteristics
|
||||
results.sort();
|
||||
@ -206,15 +206,15 @@ public class IndexQueryService {
|
||||
return items;
|
||||
}
|
||||
|
||||
private List<EdgeSearchResultItem> selectBestResults(SearchParameters params, List<EdgeSearchResultItem> results) {
|
||||
private List<SearchResultItem> selectBestResults(SearchParameters params, List<SearchResultItem> results) {
|
||||
|
||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||
|
||||
results.sort(comparingDouble(EdgeSearchResultItem::getScore)
|
||||
.thenComparingInt(EdgeSearchResultItem::getRanking)
|
||||
.thenComparingInt(EdgeSearchResultItem::getUrlIdInt));
|
||||
results.sort(comparingDouble(SearchResultItem::getScore)
|
||||
.thenComparingInt(SearchResultItem::getRanking)
|
||||
.thenComparingInt(SearchResultItem::getUrlIdInt));
|
||||
|
||||
List<EdgeSearchResultItem> resultsList = new ArrayList<>(results.size());
|
||||
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
|
||||
|
||||
for (var item : results) {
|
||||
if (domainCountFilter.test(item)) {
|
||||
@ -245,7 +245,7 @@ class SearchParameters {
|
||||
before evaluating them for the best result. */
|
||||
final int fetchSize;
|
||||
final IndexSearchBudget budget;
|
||||
final List<EdgeSearchSubquery> subqueries;
|
||||
final List<SearchSubquery> subqueries;
|
||||
final IndexQueryParams queryParams;
|
||||
|
||||
final int limitByDomain;
|
||||
@ -261,7 +261,7 @@ class SearchParameters {
|
||||
*/
|
||||
final TLongHashSet consideredUrlIds;
|
||||
|
||||
public SearchParameters(EdgeSearchSpecification specsSet, SearchSet searchSet) {
|
||||
public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
|
||||
var limits = specsSet.queryLimits;
|
||||
|
||||
this.fetchSize = limits.fetchSize();
|
||||
|
@ -5,7 +5,7 @@ import com.google.inject.Singleton;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.index.SearchIndexSearchTerms;
|
||||
import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
|
||||
import org.slf4j.Logger;
|
||||
@ -23,7 +23,7 @@ public class SearchTermsService {
|
||||
this.lexicon = lexicon;
|
||||
}
|
||||
|
||||
public SearchIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
|
||||
public SearchIndexSearchTerms getSearchTerms(SearchSubquery request) {
|
||||
final IntList excludes = new IntArrayList();
|
||||
final IntList includes = new IntArrayList();
|
||||
final IntList priority = new IntArrayList();
|
||||
|
@ -2,10 +2,10 @@ package nu.marginalia.index.svc;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.index.SearchIndex;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
@ -79,7 +79,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
searchIndex.switchIndex();
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
EdgeSearchSpecification.builder()
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.year(SpecificationLimit.none())
|
||||
@ -88,7 +88,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
.rank(SpecificationLimit.none())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||
.subqueries(List.of(new EdgeSearchSubquery(
|
||||
.subqueries(List.of(new SearchSubquery(
|
||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
|
||||
))).build());
|
||||
|
||||
@ -96,7 +96,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
|
||||
rsp.results
|
||||
.stream()
|
||||
.mapToInt(EdgeSearchResultItem::getUrlIdInt)
|
||||
.mapToInt(SearchResultItem::getUrlIdInt)
|
||||
.toArray());
|
||||
}
|
||||
|
||||
@ -111,7 +111,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
searchIndex.switchIndex();
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
EdgeSearchSpecification.builder()
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.year(SpecificationLimit.none())
|
||||
.quality(SpecificationLimit.none())
|
||||
@ -119,12 +119,12 @@ public class IndexQueryServiceIntegrationTest {
|
||||
.rank(SpecificationLimit.none())
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.domains(List.of(2))
|
||||
.subqueries(List.of(new EdgeSearchSubquery(
|
||||
.subqueries(List.of(new SearchSubquery(
|
||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
|
||||
))).build());
|
||||
Assertions.assertArrayEquals(
|
||||
new int[] { 210, 270 },
|
||||
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
|
||||
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
|
||||
}
|
||||
|
||||
@Test
|
||||
@ -136,7 +136,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
searchIndex.switchIndex();
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
EdgeSearchSpecification.builder()
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.quality(SpecificationLimit.none())
|
||||
.year(SpecificationLimit.equals(1998))
|
||||
@ -144,14 +144,14 @@ public class IndexQueryServiceIntegrationTest {
|
||||
.rank(SpecificationLimit.none())
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||
.subqueries(List.of(new EdgeSearchSubquery(
|
||||
.subqueries(List.of(new SearchSubquery(
|
||||
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
|
||||
))
|
||||
).build());
|
||||
|
||||
Assertions.assertArrayEquals(
|
||||
new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 },
|
||||
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
|
||||
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
@ -25,7 +25,7 @@ public enum SearchJsParameter {
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
public void addTacitTerms(EdgeSearchSubquery subquery) {
|
||||
public void addTacitTerms(SearchSubquery subquery) {
|
||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.search.query;
|
||||
package nu.marginalia.search.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
@ -9,12 +9,12 @@ import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class NearQueryProcessor {
|
||||
public class DbNearDomainsQuery {
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public NearQueryProcessor(HikariDataSource dataSource) {
|
||||
public DbNearDomainsQuery(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||
|
||||
import java.util.Objects;
|
||||
@ -45,7 +45,7 @@ public enum SearchProfile {
|
||||
return YOLO;
|
||||
}
|
||||
|
||||
public void addTacitTerms(EdgeSearchSubquery subquery) {
|
||||
public void addTacitTerms(SearchSubquery subquery) {
|
||||
if (this == ACADEMIA) {
|
||||
subquery.searchTermsPriority.add("tld:edu");
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
@ -36,7 +36,7 @@ public class UrlDetails {
|
||||
public int resultsFromSameDomain;
|
||||
|
||||
public String positions;
|
||||
public EdgeSearchResultItem resultItem;
|
||||
public SearchResultItem resultItem;
|
||||
|
||||
public boolean hasMoreResults() {
|
||||
return resultsFromSameDomain > 1;
|
||||
|
@ -3,8 +3,8 @@ package nu.marginalia.search.query;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
@ -16,6 +16,7 @@ import nu.marginalia.query_parser.QueryPermutation;
|
||||
import nu.marginalia.query_parser.QueryVariants;
|
||||
import nu.marginalia.query_parser.token.Token;
|
||||
import nu.marginalia.query_parser.token.TokenType;
|
||||
import nu.marginalia.search.db.DbNearDomainsQuery;
|
||||
import nu.marginalia.search.model.SearchProfile;
|
||||
import nu.marginalia.search.query.model.SearchQuery;
|
||||
import nu.marginalia.search.query.model.UserSearchParameters;
|
||||
@ -34,7 +35,7 @@ public class QueryFactory {
|
||||
private final EnglishDictionary englishDictionary;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final SearchResultValuator searchResultValuator;
|
||||
private final NearQueryProcessor nearQueryProcessor;
|
||||
private final DbNearDomainsQuery dbNearDomainsQuery;
|
||||
|
||||
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
|
||||
private final ThreadLocal<QueryVariants> queryVariants;
|
||||
@ -48,11 +49,11 @@ public class QueryFactory {
|
||||
EnglishDictionary englishDictionary,
|
||||
NGramBloomFilter nGramBloomFilter,
|
||||
SearchResultValuator searchResultValuator,
|
||||
NearQueryProcessor nearQueryProcessor) {
|
||||
DbNearDomainsQuery dbNearDomainsQuery) {
|
||||
|
||||
this.englishDictionary = englishDictionary;
|
||||
this.searchResultValuator = searchResultValuator;
|
||||
this.nearQueryProcessor = nearQueryProcessor;
|
||||
this.dbNearDomainsQuery = dbNearDomainsQuery;
|
||||
|
||||
this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
|
||||
}
|
||||
@ -67,13 +68,13 @@ public class QueryFactory {
|
||||
|
||||
public SearchQuery createQuery(UserSearchParameters params) {
|
||||
final var processedQuery = createQuery(getQueryPermutation(), params);
|
||||
final List<EdgeSearchSubquery> subqueries = processedQuery.specs.subqueries;
|
||||
final List<SearchSubquery> subqueries = processedQuery.specs.subqueries;
|
||||
|
||||
for (var sq : subqueries) {
|
||||
sq.setValue(searchResultValuator.preEvaluate(sq));
|
||||
}
|
||||
|
||||
subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue));
|
||||
subqueries.sort(Comparator.comparing(SearchSubquery::getValue));
|
||||
trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT);
|
||||
|
||||
return processedQuery;
|
||||
@ -84,16 +85,16 @@ public class QueryFactory {
|
||||
int limitTotal,
|
||||
String... termsInclude)
|
||||
{
|
||||
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
||||
List<SearchSubquery> sqs = new ArrayList<>();
|
||||
|
||||
sqs.add(new EdgeSearchSubquery(
|
||||
sqs.add(new SearchSubquery(
|
||||
Arrays.asList(termsInclude),
|
||||
Collections.emptyList(),
|
||||
Collections.emptyList(),
|
||||
Collections.emptyList()
|
||||
));
|
||||
|
||||
var specs = EdgeSearchSpecification.builder()
|
||||
var specs = SearchSpecification.builder()
|
||||
.subqueries(sqs)
|
||||
.domains(Collections.emptyList())
|
||||
.searchSetIdentifier(profile.searchSetIdentifier)
|
||||
@ -170,7 +171,7 @@ public class QueryFactory {
|
||||
}
|
||||
|
||||
var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
|
||||
List<EdgeSearchSubquery> subqueries = new ArrayList<>();
|
||||
List<SearchSubquery> subqueries = new ArrayList<>();
|
||||
|
||||
String near = profile.getNearDomain();
|
||||
|
||||
@ -219,7 +220,7 @@ public class QueryFactory {
|
||||
searchTermsAdvice.clear();
|
||||
}
|
||||
|
||||
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
|
||||
SearchSubquery subquery = new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
|
||||
|
||||
params.profile().addTacitTerms(subquery);
|
||||
params.jsSetting().addTacitTerms(subquery);
|
||||
@ -231,7 +232,7 @@ public class QueryFactory {
|
||||
|
||||
if (near != null) {
|
||||
if (domain == null) {
|
||||
domains = nearQueryProcessor.getRelatedDomains(near, problems::add);
|
||||
domains = dbNearDomainsQuery.getRelatedDomains(near, problems::add);
|
||||
}
|
||||
}
|
||||
|
||||
@ -242,7 +243,7 @@ public class QueryFactory {
|
||||
domainLimit = 2;
|
||||
}
|
||||
|
||||
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
|
||||
var specsBuilder = SearchSpecification.builder()
|
||||
.subqueries(subqueries)
|
||||
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
|
||||
.humanQuery(query)
|
||||
@ -254,7 +255,7 @@ public class QueryFactory {
|
||||
.queryStrategy(queryStrategy)
|
||||
.searchSetIdentifier(profile.searchSetIdentifier);
|
||||
|
||||
EdgeSearchSpecification specs = specsBuilder.build();
|
||||
SearchSpecification specs = specsBuilder.build();
|
||||
|
||||
return new SearchQuery(specs, searchTermsHuman, domain);
|
||||
}
|
||||
|
@ -1,19 +1,19 @@
|
||||
package nu.marginalia.search.query.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
@AllArgsConstructor
|
||||
public class SearchQuery {
|
||||
public final EdgeSearchSpecification specs;
|
||||
public final SearchSpecification specs;
|
||||
|
||||
public final Set<String> problems = new TreeSet<>();
|
||||
public final List<String> searchTermsHuman;
|
||||
public String domain;
|
||||
|
||||
public SearchQuery(EdgeSearchSpecification justSpecs) {
|
||||
public SearchQuery(SearchSpecification justSpecs) {
|
||||
searchTermsHuman = new ArrayList<>();
|
||||
specs = justSpecs;
|
||||
}
|
||||
|
@ -8,7 +8,7 @@ import nu.marginalia.search.db.DbUrlDetailsQuery;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.model.id.EdgeIdList;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.valuation.SearchResultValuator;
|
||||
import nu.marginalia.util.BrailleBlockPunchCards;
|
||||
@ -29,11 +29,11 @@ public class SearchResultDecorator {
|
||||
this.valuator = valuator;
|
||||
}
|
||||
|
||||
public List<UrlDetails> getAllUrlDetails(List<EdgeSearchResultItem> resultItems) {
|
||||
public List<UrlDetails> getAllUrlDetails(List<SearchResultItem> resultItems) {
|
||||
TIntObjectHashMap<UrlDetails> detailsById = new TIntObjectHashMap<>(resultItems.size());
|
||||
|
||||
EdgeIdList<EdgeUrl> idList = resultItems.stream()
|
||||
.mapToInt(EdgeSearchResultItem::getUrlIdInt)
|
||||
.mapToInt(SearchResultItem::getUrlIdInt)
|
||||
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
|
||||
|
||||
List<UrlDetails> ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList);
|
||||
@ -72,14 +72,14 @@ public class SearchResultDecorator {
|
||||
return retList;
|
||||
}
|
||||
|
||||
private String getPositionsString(EdgeSearchResultItem resultItem) {
|
||||
private String getPositionsString(SearchResultItem resultItem) {
|
||||
Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8);
|
||||
|
||||
for (var score : resultItem.scores) {
|
||||
if (!score.isRegular()) {
|
||||
if (!score.isKeywordRegular()) {
|
||||
continue;
|
||||
}
|
||||
positionsPerSet.merge(score.set(), score.positions(), this::and);
|
||||
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
|
||||
}
|
||||
|
||||
int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0);
|
||||
@ -95,7 +95,7 @@ public class SearchResultDecorator {
|
||||
return a | b;
|
||||
}
|
||||
|
||||
private double calculateTermScore(EdgeSearchResultItem resultItem, UrlDetails details) {
|
||||
private double calculateTermScore(SearchResultItem resultItem, UrlDetails details) {
|
||||
|
||||
final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0;
|
||||
final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length());
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.search.svc;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.search.client.model.ApiSearchResultQueryDetails;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
@ -62,7 +62,7 @@ public class SearchApiQueryService {
|
||||
ApiSearchResult convert(UrlDetails url) {
|
||||
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||
if (url.resultItem != null) {
|
||||
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));
|
||||
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
|
||||
|
||||
outer:
|
||||
for (var entries : bySet.values()) {
|
||||
@ -73,7 +73,7 @@ public class SearchApiQueryService {
|
||||
continue outer;
|
||||
|
||||
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword, metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
|
||||
}
|
||||
details.add(lst);
|
||||
}
|
||||
|
@ -3,8 +3,8 @@ package nu.marginalia.search.svc;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.index.client.IndexClient;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.search.model.PageScoreAdjustment;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.results.SearchResultDecorator;
|
||||
@ -37,7 +37,7 @@ public class SearchQueryIndexService {
|
||||
}
|
||||
|
||||
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
|
||||
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
|
||||
final List<SearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
|
||||
|
||||
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
|
||||
|
||||
@ -70,7 +70,7 @@ public class SearchQueryIndexService {
|
||||
|
||||
private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
|
||||
|
||||
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, EdgeSearchSpecification specs) {
|
||||
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, SearchSpecification specs) {
|
||||
String titleLC = p.title == null ? "" : p.title.toLowerCase();
|
||||
String descLC = p.description == null ? "" : p.description.toLowerCase();
|
||||
String urlLC = p.url == null ? "" : p.url.path.toLowerCase();
|
||||
|
@ -5,8 +5,8 @@ import com.google.inject.Singleton;
|
||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
||||
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@ -35,7 +35,7 @@ public class SearchResultValuator {
|
||||
}
|
||||
|
||||
|
||||
public double preEvaluate(EdgeSearchSubquery sq) {
|
||||
public double preEvaluate(SearchSubquery sq) {
|
||||
final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new);
|
||||
|
||||
double termSum = 0.;
|
||||
@ -56,8 +56,8 @@ public class SearchResultValuator {
|
||||
return termSum / factorSum;
|
||||
}
|
||||
|
||||
public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, int length, int titleLength) {
|
||||
int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0);
|
||||
public double evaluateTerms(List<SearchResultKeywordScore> rawScores, int length, int titleLength) {
|
||||
int sets = 1 + rawScores.stream().mapToInt(SearchResultKeywordScore::subquery).max().orElse(0);
|
||||
|
||||
double bestScore = 10;
|
||||
double bestAllTermsFactor = 1.;
|
||||
@ -88,10 +88,10 @@ public class SearchResultValuator {
|
||||
return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus;
|
||||
}
|
||||
|
||||
private boolean hasPriorityTerm(List<EdgeSearchResultKeywordScore> rawScores) {
|
||||
private boolean hasPriorityTerm(List<SearchResultKeywordScore> rawScores) {
|
||||
return rawScores.stream()
|
||||
.findAny()
|
||||
.map(EdgeSearchResultKeywordScore::hasPriorityTerms)
|
||||
.map(SearchResultKeywordScore::hasPriorityTerms)
|
||||
.orElse(false);
|
||||
}
|
||||
|
||||
@ -260,11 +260,11 @@ public class SearchResultValuator {
|
||||
return f;
|
||||
}
|
||||
|
||||
private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) {
|
||||
private double[] getTermWeights(SearchResultKeywordScore[] scores) {
|
||||
double[] weights = new double[scores.length];
|
||||
|
||||
for (int i = 0; i < scores.length; i++) {
|
||||
String[] parts = separator.split(scores[i].keyword());
|
||||
String[] parts = separator.split(scores[i].keyword);
|
||||
double sumScore = 0.;
|
||||
|
||||
int count = 0;
|
||||
@ -305,8 +305,8 @@ public class SearchResultValuator {
|
||||
return weights;
|
||||
}
|
||||
|
||||
private SearchResultsKeywordSet createKeywordSet(List<EdgeSearchResultKeywordScore> rawScores, int thisSet) {
|
||||
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
|
||||
private SearchResultsKeywordSet createKeywordSet(List<SearchResultKeywordScore> rawScores, int thisSet) {
|
||||
SearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.subquery() == thisSet && !w.keyword.contains(":")).toArray(SearchResultKeywordScore[]::new);
|
||||
if (scores.length == 0) {
|
||||
return null;
|
||||
}
|
||||
@ -322,8 +322,8 @@ public class SearchResultValuator {
|
||||
}
|
||||
|
||||
|
||||
private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
|
||||
public SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) {
|
||||
private record SearchResultsKeyword(SearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
|
||||
public SearchResultsKeyword(SearchResultKeywordScore score, double weight) {
|
||||
this(score, new WordMetadata(score.encodedWordMetadata()), weight);
|
||||
}
|
||||
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.search.query;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.language.statistics.EnglishDictionary;
|
||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
||||
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||
import nu.marginalia.language.statistics.NGramBloomFilter;
|
||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||
import nu.marginalia.search.command.SearchJsParameter;
|
||||
@ -37,7 +37,7 @@ public class QueryFactoryTest {
|
||||
);
|
||||
}
|
||||
|
||||
public EdgeSearchSpecification parseAndGetSpecs(String query) {
|
||||
public SearchSpecification parseAndGetSpecs(String query) {
|
||||
return queryFactory.createQuery(
|
||||
new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT)
|
||||
).specs;
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.search.valuation;
|
||||
|
||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||
@ -31,29 +31,29 @@ class SearchResultValuatorTest {
|
||||
valuator = new SearchResultValuator(dict);
|
||||
|
||||
}
|
||||
List<EdgeSearchResultKeywordScore> titleOnlyLowCountSet = List.of(
|
||||
new EdgeSearchResultKeywordScore(0, "bob",
|
||||
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||
false)
|
||||
);
|
||||
List<EdgeSearchResultKeywordScore> highCountNoTitleSet = List.of(
|
||||
new EdgeSearchResultKeywordScore(0, "bob",
|
||||
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||
false)
|
||||
);
|
||||
|
||||
List<EdgeSearchResultKeywordScore> highCountSubjectSet = List.of(
|
||||
new EdgeSearchResultKeywordScore(0, "bob",
|
||||
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||
false)
|
||||
);
|
||||
|
||||
|
||||
List<EdgeSearchResultKeywordScore> first = List.of(
|
||||
new EdgeSearchResultKeywordScore(0, "bob",
|
||||
List<SearchResultKeywordScore> first = List.of(
|
||||
new SearchResultKeywordScore(0, "bob",
|
||||
wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
|
||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||
false)
|
||||
|
Loading…
Reference in New Issue
Block a user