Clean up search-service and index-api

This commit is contained in:
Viktor Lofgren 2023-03-11 12:26:12 +01:00
parent c2f9980eba
commit 73e412ea5b
27 changed files with 293 additions and 270 deletions

View File

@ -7,9 +7,9 @@ import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.client.AbstractDynamicClient; import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.client.Context; import nu.marginalia.client.Context;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultSet; import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.descriptor.ServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId; import nu.marginalia.service.id.ServiceId;
@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
} }
@CheckReturnValue @CheckReturnValue
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) { public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
return wmsa_search_index_api_time.time( return wmsa_search_index_api_time.time(
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults() () -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
); );
} }

View File

@ -1,13 +0,0 @@
package nu.marginalia.index.client.model.domain;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeIdList;
/**
 * Immutable holder pairing a keyword with an id list of URLs.
 *
 * <p>The all-args constructor, the getters and {@code toString()} are
 * generated by Lombok.
 */
@ToString
@Getter
@AllArgsConstructor
public class EdgeDomainSearchResults {

    /** The keyword this result list is associated with. */
    public final String keyword;

    /** Id list (typed to {@code EdgeUrl}) that accompanies the keyword. */
    public final EdgeIdList<EdgeUrl> results;

}

View File

@ -1,14 +0,0 @@
package nu.marginalia.index.client.model.domain;
import lombok.AllArgsConstructor;
import lombok.ToString;
/**
 * Immutable parameter object for a keyword based domain search.
 *
 * <p>The all-args constructor and {@code toString()} are generated by Lombok.
 * NOTE(review): the meaning of the numeric fields below is inferred from
 * their names only; confirm against the consumer of this class.
 */
@AllArgsConstructor
@ToString
public class EdgeDomainSearchSpecification {

    /** The keyword to search for. */
    public final String keyword;

    /** Presumably how deep the query may go -- TODO confirm. */
    public final int queryDepth;

    /** Presumably the minimum number of hits required -- TODO confirm. */
    public final int minHitCount;

    /** Presumably an upper bound on the number of results -- TODO confirm. */
    public final int maxResults;

}

View File

@ -8,9 +8,12 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
import java.util.List; import java.util.List;
@ToString @Getter @Builder @With @AllArgsConstructor @ToString @Getter @Builder @With @AllArgsConstructor
public class EdgeSearchSpecification { public class SearchSpecification {
public List<EdgeSearchSubquery> subqueries; public List<SearchSubquery> subqueries;
/** If present and not empty, limit the search to these domain IDs */
public List<Integer> domains; public List<Integer> domains;
public SearchSetIdentifier searchSetIdentifier; public SearchSetIdentifier searchSetIdentifier;
public final String humanQuery; public final String humanQuery;
@ -21,6 +24,7 @@ public class EdgeSearchSpecification {
public final SpecificationLimit rank; public final SpecificationLimit rank;
public final QueryLimits queryLimits; public final QueryLimits queryLimits;
public final QueryStrategy queryStrategy; public final QueryStrategy queryStrategy;
} }

View File

@ -2,26 +2,32 @@ package nu.marginalia.index.client.model.query;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.Getter; import lombok.Getter;
import lombok.ToString;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@Getter @Getter
@AllArgsConstructor @AllArgsConstructor
public class EdgeSearchSubquery { public class SearchSubquery {
/** These terms must be present in the document */
public final List<String> searchTermsInclude; public final List<String> searchTermsInclude;
/** These terms must be absent from the document */
public final List<String> searchTermsExclude; public final List<String> searchTermsExclude;
/** These terms must be present in the document, but are not used in ranking */
public final List<String> searchTermsAdvice; public final List<String> searchTermsAdvice;
/** If these optional terms are present in the document, rank it highly */
public final List<String> searchTermsPriority; public final List<String> searchTermsPriority;
private double value = 0; private double value = 0;
public EdgeSearchSubquery(List<String> searchTermsInclude, public SearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude, List<String> searchTermsExclude,
List<String> searchTermsAdvice, List<String> searchTermsAdvice,
List<String> searchTermsPriority List<String> searchTermsPriority
) { ) {
this.searchTermsInclude = searchTermsInclude; this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude; this.searchTermsExclude = searchTermsExclude;
@ -29,7 +35,7 @@ public class EdgeSearchSubquery {
this.searchTermsPriority = searchTermsPriority; this.searchTermsPriority = searchTermsPriority;
} }
public EdgeSearchSubquery setValue(double value) { public SearchSubquery setValue(double value) {
if (Double.isInfinite(value) || Double.isNaN(value)) { if (Double.isInfinite(value) || Double.isNaN(value)) {
this.value = Double.MAX_VALUE; this.value = Double.MAX_VALUE;
} else { } else {

View File

@ -1,84 +0,0 @@
package nu.marginalia.index.client.model.results;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import static java.lang.Integer.lowestOneBit;
import static java.lang.Integer.numberOfTrailingZeros;
/**
 * Scoring data for one keyword of a search result.
 *
 * @param set                 identifier of the query set the keyword was evaluated under
 * @param keyword             the keyword itself
 * @param encodedWordMetadata bit-encoded word metadata, decoded via {@link WordMetadata}
 * @param encodedDocMetadata  bit-encoded document metadata, decoded via {@link DocumentMetadata}
 * @param hasPriorityTerms    priority term marker supplied by the creator of the record
 */
public record EdgeSearchResultKeywordScore(int set,
                                           String keyword,
                                           long encodedWordMetadata,
                                           long encodedDocMetadata,
                                           boolean hasPriorityTerms) {

    /**
     * Computes a document-level value from the encoded document metadata.
     *
     * <p>The value is accumulated in a {@code long}, so the fractional part
     * of {@code quality / 5.} is discarded by the compound assignment, and
     * the rank term uses integer division.
     *
     * @return the accumulated value, widened to {@code double}
     */
    public double documentValue() {
        long total = 0;

        total += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
        total += DocumentMetadata.decodeTopology(encodedDocMetadata);

        final boolean isSimple =
                DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit());
        if (isSimple) {
            total += 20;
        }

        // The rank is re-centered around 13; values below that carry double
        // the weight (divisor 2) of values above it (divisor 4).
        final int centeredRank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
        total += (centeredRank < 0) ? centeredRank / 2 : centeredRank / 4;

        return total;
    }

    /** Tests whether the given flag is set in the encoded word metadata. */
    private boolean hasTermFlag(EdgePageWordFlags flag) {
        return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
    }

    /**
     * Computes a term-level value from the encoded word metadata.
     *
     * <p>Each recognised word flag, the tf-idf and the number of set
     * position bits all lower the returned value.
     *
     * @return zero or a negative number, assuming the decoded tf-idf is not negative
     */
    public double termValue() {
        // The flag weights are whole numbers, so they are gathered as an int
        // first; negating an int can not produce a negative zero.
        int flagWeight = 0;

        if (hasTermFlag(EdgePageWordFlags.Title)) {
            flagWeight += 15;
        }

        // Site and SiteAdjacent are mutually exclusive contributions.
        if (hasTermFlag(EdgePageWordFlags.Site)) {
            flagWeight += 10;
        }
        else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
            flagWeight += 5;
        }

        if (hasTermFlag(EdgePageWordFlags.Subjects)) {
            flagWeight += 10;
        }
        if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
            flagWeight += 1;
        }
        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
            flagWeight += 5;
        }
        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
            flagWeight += 5;
        }

        double value = -flagWeight;

        final double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
        final int positionBits = WordMetadata.decodePositions(encodedWordMetadata);

        value -= tfIdf / 10.;
        value -= Integer.bitCount(positionBits) / 3.;

        return value;
    }

    /** @return the position bits decoded from the word metadata */
    public int positions() {
        return WordMetadata.decodePositions(encodedWordMetadata);
    }

    /** @return true if the keyword contains a colon or carries the Synthetic flag */
    public boolean isSpecial() {
        if (keyword.contains(":")) {
            return true;
        }
        return hasTermFlag(EdgePageWordFlags.Synthetic);
    }

    /** @return true if the keyword neither contains a colon nor carries the Synthetic flag */
    public boolean isRegular() {
        return !isSpecial();
    }
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.index.client.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
/**
 * Thin wrapper around a list of search result items.
 *
 * <p>Lombok generates a constructor taking the list, a getter and
 * {@code toString()}.
 */
@ToString
@Getter
@AllArgsConstructor
public class EdgeSearchResults {

    /** The wrapped result items. */
    public final List<EdgeSearchResultItem> results;

    /** Creates an instance backed by a new, empty {@link ArrayList}. */
    public EdgeSearchResults() {
        this.results = new ArrayList<>();
    }

    /** @return the number of wrapped result items */
    public int size() {
        final List<EdgeSearchResultItem> items = this.results;
        return items.size();
    }

    /** @return a sequential stream over the wrapped result items */
    public Stream<EdgeSearchResultItem> stream() {
        final List<EdgeSearchResultItem> items = this.results;
        return items.stream();
    }
}

View File

@ -8,15 +8,19 @@ import nu.marginalia.model.id.EdgeId;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
/** Represents a document matching a search query */
@AllArgsConstructor @Getter @AllArgsConstructor @Getter
public class EdgeSearchResultItem { public class SearchResultItem {
/** Encoded ID that contains both the URL id and its ranking */
public final long combinedId; public final long combinedId;
public final List<EdgeSearchResultKeywordScore> scores; /** How did the subqueries match against the document ? */
public final List<SearchResultKeywordScore> scores;
/** How many other potential results existed in the same domain */
public int resultsFromDomain; public int resultsFromDomain;
public EdgeSearchResultItem(long val) { public SearchResultItem(long val) {
this.combinedId = val; this.combinedId = val;
this.scores = new ArrayList<>(16); this.scores = new ArrayList<>(16);
} }
@ -62,7 +66,7 @@ public class EdgeSearchResultItem {
return false; return false;
if (other == this) if (other == this)
return true; return true;
if (other instanceof EdgeSearchResultItem o) { if (other instanceof SearchResultItem o) {
return o.getUrlIdInt() == getUrlIdInt(); return o.getUrlIdInt() == getUrlIdInt();
} }
return false; return false;

View File

@ -0,0 +1,145 @@
package nu.marginalia.index.client.model.results;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import java.util.Objects;
/**
 * Scoring data for one keyword of a search result: the subquery the keyword
 * was evaluated under, the keyword itself, and the bit-encoded word and
 * document metadata from which {@link #termValue()} and
 * {@link #documentValue()} are derived.
 *
 * <p>Instances are immutable. Equality and hash code cover all five fields.
 */
public final class SearchResultKeywordScore {
    /** Identifier of the subquery this keyword was evaluated under. */
    public final int subquery;
    /** The keyword itself. */
    public final String keyword;
    /** Bit-encoded word metadata, decoded via {@link WordMetadata}. */
    private final long encodedWordMetadata;
    /** Bit-encoded document metadata, decoded via {@link DocumentMetadata}. */
    private final long encodedDocMetadata;
    /** Priority term marker supplied by the creator of this object. */
    private final boolean hasPriorityTerms;

    /**
     * @param subquery            identifier of the subquery the keyword belongs to
     * @param keyword             the keyword; must not be null for
     *                            {@link #isKeywordSpecial()} and {@link #isKeywordRegular()} to work
     * @param encodedWordMetadata bit-encoded word metadata
     * @param encodedDocMetadata  bit-encoded document metadata
     * @param hasPriorityTerms    priority term marker
     */
    public SearchResultKeywordScore(int subquery,
                                    String keyword,
                                    long encodedWordMetadata,
                                    long encodedDocMetadata,
                                    boolean hasPriorityTerms) {
        this.subquery = subquery;
        this.keyword = keyword;
        this.encodedWordMetadata = encodedWordMetadata;
        this.encodedDocMetadata = encodedDocMetadata;
        this.hasPriorityTerms = hasPriorityTerms;
    }

    /** Tests whether the given flag is set in the encoded word metadata. */
    private boolean hasTermFlag(EdgePageWordFlags flag) {
        return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
    }

    /**
     * Computes a document-level value from the encoded document metadata:
     * a fifth of the quality, plus the topology, plus 20 if the document has
     * the Simple flag, plus a rank term centered around 13.
     *
     * @return the accumulated value, widened to {@code double}
     */
    public double documentValue() {
        // NOTE(review): the accumulator is a long, so the compound assignment
        // below silently discards the fractional part of "quality / 5." and
        // the rank term uses integer division. The result is therefore always
        // a whole number even though the return type is double. This is kept
        // as-is because changing it would alter result ranking; confirm
        // whether the truncation is intended.
        long sum = 0;

        sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;

        sum += DocumentMetadata.decodeTopology(encodedDocMetadata);

        if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
            sum += 20;
        }

        // Ranks below 13 are weighted twice as heavily as ranks above it.
        int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
        if (rank < 0)
            sum += rank / 2;
        else
            sum += rank / 4;

        return sum;
    }

    /**
     * Computes a term-level value from the encoded word metadata. Each
     * recognised word flag, the tf-idf and the number of set position bits
     * all lower the returned value.
     *
     * @return the accumulated term value
     */
    public double termValue() {
        double sum = 0;

        if (hasTermFlag(EdgePageWordFlags.Title)) {
            sum -= 15;
        }

        // Site and SiteAdjacent are mutually exclusive contributions.
        if (hasTermFlag(EdgePageWordFlags.Site)) {
            sum -= 10;
        } else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
            sum -= 5;
        }

        if (hasTermFlag(EdgePageWordFlags.Subjects)) {
            sum -= 10;
        }
        if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
            sum -= 1;
        }

        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
            sum -= 5;
        }

        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
            sum -= 5;
        }

        double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
        int positionBits = WordMetadata.decodePositions(encodedWordMetadata);

        sum -= tfIdf / 10.;
        sum -= Integer.bitCount(positionBits) / 3.;

        return sum;
    }

    /** @return identifier of the subquery this keyword was evaluated under */
    public int subquery() {
        return subquery;
    }

    /**
     * Accessor counterpart of the public {@link #keyword} field, provided so
     * that all five constructor components have an accessor method.
     *
     * @return the keyword
     */
    public String keyword() {
        return keyword;
    }

    /** @return the position bits decoded from the word metadata */
    public int positions() {
        return WordMetadata.decodePositions(encodedWordMetadata);
    }

    /** @return true if the keyword contains a colon or carries the Synthetic flag */
    public boolean isKeywordSpecial() {
        return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
    }

    /** @return true exactly when {@link #isKeywordSpecial()} is false */
    public boolean isKeywordRegular() {
        return !isKeywordSpecial();
    }

    /** @return the raw, bit-encoded word metadata */
    public long encodedWordMetadata() {
        return encodedWordMetadata;
    }

    /** @return the raw, bit-encoded document metadata */
    public long encodedDocMetadata() {
        return encodedDocMetadata;
    }

    /** @return the priority term marker passed to the constructor */
    public boolean hasPriorityTerms() {
        return hasPriorityTerms;
    }

    @Override
    public boolean equals(Object obj) {
        if (obj == this) return true;
        if (obj == null || obj.getClass() != this.getClass()) return false;
        var that = (SearchResultKeywordScore) obj;
        return this.subquery == that.subquery &&
                Objects.equals(this.keyword, that.keyword) &&
                this.encodedWordMetadata == that.encodedWordMetadata &&
                this.encodedDocMetadata == that.encodedDocMetadata &&
                this.hasPriorityTerms == that.hasPriorityTerms;
    }

    @Override
    public int hashCode() {
        return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata, hasPriorityTerms);
    }

    @Override
    public String toString() {
        // The first label matches the field name "subquery".
        return "SearchResultKeywordScore[" +
                "subquery=" + subquery + ", " +
                "keyword=" + keyword + ", " +
                "encodedWordMetadata=" + encodedWordMetadata + ", " +
                "encodedDocMetadata=" + encodedDocMetadata + ", " +
                "hasPriorityTerms=" + hasPriorityTerms + ']';
    }
}

View File

@ -7,8 +7,8 @@ import lombok.ToString;
import java.util.List; import java.util.List;
@AllArgsConstructor @Getter @ToString @AllArgsConstructor @Getter @ToString
public class EdgeSearchResultSet { public class SearchResultSet {
public List<EdgeSearchResultItem> results; public List<SearchResultItem> results;
public int size() { public int size() {
return results.size(); return results.size();

View File

@ -2,7 +2,7 @@ package nu.marginalia.index.results;
import gnu.trove.map.TLongIntMap; import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
public class IndexResultDomainDeduplicator { public class IndexResultDomainDeduplicator {
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0); final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
@ -21,7 +21,7 @@ public class IndexResultDomainDeduplicator {
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain; return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
} }
public boolean test(EdgeSearchResultItem item) { public boolean test(SearchResultItem item) {
final long key = item.deduplicationKey(); final long key = item.deduplicationKey();
if (key == 0) if (key == 0)
return true; return true;
@ -29,7 +29,7 @@ public class IndexResultDomainDeduplicator {
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain; return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
} }
public int getCount(EdgeSearchResultItem item) { public int getCount(SearchResultItem item) {
final long key = item.deduplicationKey(); final long key = item.deduplicationKey();
if (key == 0) if (key == 0)
return 1; return 1;

View File

@ -9,9 +9,9 @@ import nu.marginalia.index.svc.SearchTermsService;
import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.IndexQueryParams; import nu.marginalia.index.query.IndexQueryParams;
import java.util.List; import java.util.List;
@ -31,7 +31,7 @@ public class IndexResultValuator {
public IndexResultValuator(SearchTermsService searchTermsSvc, public IndexResultValuator(SearchTermsService searchTermsSvc,
IndexMetadataService metadataService, IndexMetadataService metadataService,
TLongList results, TLongList results,
List<EdgeSearchSubquery> subqueries, List<SearchSubquery> subqueries,
IndexQueryParams queryParams) { IndexQueryParams queryParams) {
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
this.queryParams = queryParams; this.queryParams = queryParams;
@ -71,9 +71,9 @@ public class IndexResultValuator {
} }
public EdgeSearchResultItem evaluateResult(long id) { public SearchResultItem evaluateResult(long id) {
EdgeSearchResultItem searchResult = new EdgeSearchResultItem(id); SearchResultItem searchResult = new SearchResultItem(id);
final long urlIdInt = searchResult.getUrlIdInt(); final long urlIdInt = searchResult.getUrlIdInt();
searchResult.setDomainId(metadataService.getDomainId(urlIdInt)); searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
@ -99,7 +99,7 @@ public class IndexResultValuator {
return searchResult; return searchResult;
} }
private double evaluateSubquery(EdgeSearchResultItem searchResult, private double evaluateSubquery(SearchResultItem searchResult,
long docMetadata, long docMetadata,
int querySetId, int querySetId,
List<String> termList) List<String> termList)
@ -114,7 +114,7 @@ public class IndexResultValuator {
long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt()); long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt());
EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore( SearchResultKeywordScore score = new SearchResultKeywordScore(
querySetId, querySetId,
searchTerm, searchTerm,
metadata, metadata,

View File

@ -9,10 +9,10 @@ import gnu.trove.set.hash.TLongHashSet;
import io.prometheus.client.Counter; import io.prometheus.client.Counter;
import io.prometheus.client.Gauge; import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultSet; import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.index.SearchIndexSearchTerms; import nu.marginalia.index.index.SearchIndexSearchTerms;
@ -73,13 +73,13 @@ public class IndexQueryService {
public Object search(Request request, Response response) { public Object search(Request request, Response response) {
String json = request.body(); String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
try { try {
return wmsa_edge_index_query_time.time(() -> { return wmsa_edge_index_query_time.time(() -> {
var params = new SearchParameters(specsSet, getSearchSet(specsSet)); var params = new SearchParameters(specsSet, getSearchSet(specsSet));
List<EdgeSearchResultItem> results = executeSearch(params); List<SearchResultItem> results = executeSearch(params);
logger.info(queryMarker, "Index Result Count: {}", results.size()); logger.info(queryMarker, "Index Result Count: {}", results.size());
wmsa_edge_index_query_cost.set(params.getDataCost()); wmsa_edge_index_query_cost.set(params.getDataCost());
@ -87,7 +87,7 @@ public class IndexQueryService {
wmsa_edge_index_query_timeouts.inc(); wmsa_edge_index_query_timeouts.inc();
} }
return new EdgeSearchResultSet(results); return new SearchResultSet(results);
}); });
} }
catch (HaltException ex) { catch (HaltException ex) {
@ -103,11 +103,11 @@ public class IndexQueryService {
} }
// exists for test access // exists for test access
EdgeSearchResultSet justQuery(EdgeSearchSpecification specsSet) { SearchResultSet justQuery(SearchSpecification specsSet) {
return new EdgeSearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)))); return new SearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
} }
private SearchSet getSearchSet(EdgeSearchSpecification specsSet) { private SearchSet getSearchSet(SearchSpecification specsSet) {
if (specsSet.domains != null && !specsSet.domains.isEmpty()) { if (specsSet.domains != null && !specsSet.domains.isEmpty()) {
return new SmallSearchSet(specsSet.domains); return new SmallSearchSet(specsSet.domains);
} }
@ -115,7 +115,7 @@ public class IndexQueryService {
return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier); return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier);
} }
private List<EdgeSearchResultItem> executeSearch(SearchParameters params) { private List<SearchResultItem> executeSearch(SearchParameters params) {
var resultIds = evaluateSubqueries(params); var resultIds = evaluateSubqueries(params);
var resultItems = calculateResultScores(params, resultIds); var resultItems = calculateResultScores(params, resultIds);
@ -176,7 +176,7 @@ public class IndexQueryService {
return results; return results;
} }
private ArrayList<EdgeSearchResultItem> calculateResultScores(SearchParameters params, TLongList results) { private ArrayList<SearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
final var evaluator = new IndexResultValuator( final var evaluator = new IndexResultValuator(
searchTermsSvc, searchTermsSvc,
@ -185,7 +185,7 @@ public class IndexQueryService {
params.subqueries, params.subqueries,
params.queryParams); params.queryParams);
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size()); ArrayList<SearchResultItem> items = new ArrayList<>(results.size());
// Sorting the result ids results in better paging characteristics // Sorting the result ids results in better paging characteristics
results.sort(); results.sort();
@ -206,15 +206,15 @@ public class IndexQueryService {
return items; return items;
} }
private List<EdgeSearchResultItem> selectBestResults(SearchParameters params, List<EdgeSearchResultItem> results) { private List<SearchResultItem> selectBestResults(SearchParameters params, List<SearchResultItem> results) {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
results.sort(comparingDouble(EdgeSearchResultItem::getScore) results.sort(comparingDouble(SearchResultItem::getScore)
.thenComparingInt(EdgeSearchResultItem::getRanking) .thenComparingInt(SearchResultItem::getRanking)
.thenComparingInt(EdgeSearchResultItem::getUrlIdInt)); .thenComparingInt(SearchResultItem::getUrlIdInt));
List<EdgeSearchResultItem> resultsList = new ArrayList<>(results.size()); List<SearchResultItem> resultsList = new ArrayList<>(results.size());
for (var item : results) { for (var item : results) {
if (domainCountFilter.test(item)) { if (domainCountFilter.test(item)) {
@ -245,7 +245,7 @@ class SearchParameters {
before evaluating them for the best result. */ before evaluating them for the best result. */
final int fetchSize; final int fetchSize;
final IndexSearchBudget budget; final IndexSearchBudget budget;
final List<EdgeSearchSubquery> subqueries; final List<SearchSubquery> subqueries;
final IndexQueryParams queryParams; final IndexQueryParams queryParams;
final int limitByDomain; final int limitByDomain;
@ -261,7 +261,7 @@ class SearchParameters {
*/ */
final TLongHashSet consideredUrlIds; final TLongHashSet consideredUrlIds;
public SearchParameters(EdgeSearchSpecification specsSet, SearchSet searchSet) { public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
var limits = specsSet.queryLimits; var limits = specsSet.queryLimits;
this.fetchSize = limits.fetchSize(); this.fetchSize = limits.fetchSize();

View File

@ -5,7 +5,7 @@ import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.dict.OffHeapDictionaryHashMap; import nu.marginalia.dict.OffHeapDictionaryHashMap;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.index.SearchIndexSearchTerms; import nu.marginalia.index.index.SearchIndexSearchTerms;
import nu.marginalia.lexicon.KeywordLexiconReadOnlyView; import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -23,7 +23,7 @@ public class SearchTermsService {
this.lexicon = lexicon; this.lexicon = lexicon;
} }
public SearchIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) { public SearchIndexSearchTerms getSearchTerms(SearchSubquery request) {
final IntList excludes = new IntArrayList(); final IntList excludes = new IntArrayList();
final IntList includes = new IntArrayList(); final IntList includes = new IntArrayList();
final IntList priority = new IntArrayList(); final IntList priority = new IntArrayList();

View File

@ -2,10 +2,10 @@ package nu.marginalia.index.svc;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.index.SearchIndex; import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData; import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader; import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -79,7 +79,7 @@ public class IndexQueryServiceIntegrationTest {
searchIndex.switchIndex(); searchIndex.switchIndex();
var rsp = queryService.justQuery( var rsp = queryService.justQuery(
EdgeSearchSpecification.builder() SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.queryStrategy(QueryStrategy.SENTENCE) .queryStrategy(QueryStrategy.SENTENCE)
.year(SpecificationLimit.none()) .year(SpecificationLimit.none())
@ -88,7 +88,7 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none()) .rank(SpecificationLimit.none())
.domains(new ArrayList<>()) .domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE) .searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new EdgeSearchSubquery( .subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
))).build()); ))).build());
@ -96,7 +96,7 @@ public class IndexQueryServiceIntegrationTest {
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 }, new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
rsp.results rsp.results
.stream() .stream()
.mapToInt(EdgeSearchResultItem::getUrlIdInt) .mapToInt(SearchResultItem::getUrlIdInt)
.toArray()); .toArray());
} }
@ -111,7 +111,7 @@ public class IndexQueryServiceIntegrationTest {
searchIndex.switchIndex(); searchIndex.switchIndex();
var rsp = queryService.justQuery( var rsp = queryService.justQuery(
EdgeSearchSpecification.builder() SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.year(SpecificationLimit.none()) .year(SpecificationLimit.none())
.quality(SpecificationLimit.none()) .quality(SpecificationLimit.none())
@ -119,12 +119,12 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none()) .rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE) .queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2)) .domains(List.of(2))
.subqueries(List.of(new EdgeSearchSubquery( .subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList() List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
))).build()); ))).build());
Assertions.assertArrayEquals( Assertions.assertArrayEquals(
new int[] { 210, 270 }, new int[] { 210, 270 },
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray()); rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
} }
@Test @Test
@ -136,7 +136,7 @@ public class IndexQueryServiceIntegrationTest {
searchIndex.switchIndex(); searchIndex.switchIndex();
var rsp = queryService.justQuery( var rsp = queryService.justQuery(
EdgeSearchSpecification.builder() SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000)) .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.quality(SpecificationLimit.none()) .quality(SpecificationLimit.none())
.year(SpecificationLimit.equals(1998)) .year(SpecificationLimit.equals(1998))
@ -144,14 +144,14 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none()) .rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE) .queryStrategy(QueryStrategy.SENTENCE)
.searchSetIdentifier(SearchSetIdentifier.NONE) .searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new EdgeSearchSubquery( .subqueries(List.of(new SearchSubquery(
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList() List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
)) ))
).build()); ).build());
Assertions.assertArrayEquals( Assertions.assertArrayEquals(
new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 }, new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 },
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray()); rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
} }

View File

@ -1,6 +1,6 @@
package nu.marginalia.search.command; package nu.marginalia.search.command;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.util.Arrays; import java.util.Arrays;
@ -25,7 +25,7 @@ public enum SearchJsParameter {
return DEFAULT; return DEFAULT;
} }
public void addTacitTerms(EdgeSearchSubquery subquery) { public void addTacitTerms(SearchSubquery subquery) {
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
} }
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.search.query; package nu.marginalia.search.db;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
@ -9,12 +9,12 @@ import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.function.Consumer; import java.util.function.Consumer;
public class NearQueryProcessor { public class DbNearDomainsQuery {
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
@Inject @Inject
public NearQueryProcessor(HikariDataSource dataSource) { public DbNearDomainsQuery(HikariDataSource dataSource) {
this.dataSource = dataSource; this.dataSource = dataSource;
} }

View File

@ -2,7 +2,7 @@ package nu.marginalia.search.model;
import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier; import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import java.util.Objects; import java.util.Objects;
@ -45,7 +45,7 @@ public enum SearchProfile {
return YOLO; return YOLO;
} }
public void addTacitTerms(EdgeSearchSubquery subquery) { public void addTacitTerms(SearchSubquery subquery) {
if (this == ACADEMIA) { if (this == ACADEMIA) {
subquery.searchTermsPriority.add("tld:edu"); subquery.searchTermsPriority.add("tld:edu");
} }

View File

@ -1,7 +1,7 @@
package nu.marginalia.search.model; package nu.marginalia.search.model;
import lombok.*; import lombok.*;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
@ -36,7 +36,7 @@ public class UrlDetails {
public int resultsFromSameDomain; public int resultsFromSameDomain;
public String positions; public String positions;
public EdgeSearchResultItem resultItem; public SearchResultItem resultItem;
public boolean hasMoreResults() { public boolean hasMoreResults() {
return resultsFromSameDomain > 1; return resultsFromSameDomain > 1;

View File

@ -3,8 +3,8 @@ package nu.marginalia.search.query;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimit;
@ -16,6 +16,7 @@ import nu.marginalia.query_parser.QueryPermutation;
import nu.marginalia.query_parser.QueryVariants; import nu.marginalia.query_parser.QueryVariants;
import nu.marginalia.query_parser.token.Token; import nu.marginalia.query_parser.token.Token;
import nu.marginalia.query_parser.token.TokenType; import nu.marginalia.query_parser.token.TokenType;
import nu.marginalia.search.db.DbNearDomainsQuery;
import nu.marginalia.search.model.SearchProfile; import nu.marginalia.search.model.SearchProfile;
import nu.marginalia.search.query.model.SearchQuery; import nu.marginalia.search.query.model.SearchQuery;
import nu.marginalia.search.query.model.UserSearchParameters; import nu.marginalia.search.query.model.UserSearchParameters;
@ -34,7 +35,7 @@ public class QueryFactory {
private final EnglishDictionary englishDictionary; private final EnglishDictionary englishDictionary;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private final SearchResultValuator searchResultValuator; private final SearchResultValuator searchResultValuator;
private final NearQueryProcessor nearQueryProcessor; private final DbNearDomainsQuery dbNearDomainsQuery;
private static final int RETAIN_QUERY_VARIANT_COUNT = 5; private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
private final ThreadLocal<QueryVariants> queryVariants; private final ThreadLocal<QueryVariants> queryVariants;
@ -48,11 +49,11 @@ public class QueryFactory {
EnglishDictionary englishDictionary, EnglishDictionary englishDictionary,
NGramBloomFilter nGramBloomFilter, NGramBloomFilter nGramBloomFilter,
SearchResultValuator searchResultValuator, SearchResultValuator searchResultValuator,
NearQueryProcessor nearQueryProcessor) { DbNearDomainsQuery dbNearDomainsQuery) {
this.englishDictionary = englishDictionary; this.englishDictionary = englishDictionary;
this.searchResultValuator = searchResultValuator; this.searchResultValuator = searchResultValuator;
this.nearQueryProcessor = nearQueryProcessor; this.dbNearDomainsQuery = dbNearDomainsQuery;
this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
} }
@ -67,13 +68,13 @@ public class QueryFactory {
public SearchQuery createQuery(UserSearchParameters params) { public SearchQuery createQuery(UserSearchParameters params) {
final var processedQuery = createQuery(getQueryPermutation(), params); final var processedQuery = createQuery(getQueryPermutation(), params);
final List<EdgeSearchSubquery> subqueries = processedQuery.specs.subqueries; final List<SearchSubquery> subqueries = processedQuery.specs.subqueries;
for (var sq : subqueries) { for (var sq : subqueries) {
sq.setValue(searchResultValuator.preEvaluate(sq)); sq.setValue(searchResultValuator.preEvaluate(sq));
} }
subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue)); subqueries.sort(Comparator.comparing(SearchSubquery::getValue));
trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT); trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT);
return processedQuery; return processedQuery;
@ -84,16 +85,16 @@ public class QueryFactory {
int limitTotal, int limitTotal,
String... termsInclude) String... termsInclude)
{ {
List<EdgeSearchSubquery> sqs = new ArrayList<>(); List<SearchSubquery> sqs = new ArrayList<>();
sqs.add(new EdgeSearchSubquery( sqs.add(new SearchSubquery(
Arrays.asList(termsInclude), Arrays.asList(termsInclude),
Collections.emptyList(), Collections.emptyList(),
Collections.emptyList(), Collections.emptyList(),
Collections.emptyList() Collections.emptyList()
)); ));
var specs = EdgeSearchSpecification.builder() var specs = SearchSpecification.builder()
.subqueries(sqs) .subqueries(sqs)
.domains(Collections.emptyList()) .domains(Collections.emptyList())
.searchSetIdentifier(profile.searchSetIdentifier) .searchSetIdentifier(profile.searchSetIdentifier)
@ -170,7 +171,7 @@ public class QueryFactory {
} }
var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
List<EdgeSearchSubquery> subqueries = new ArrayList<>(); List<SearchSubquery> subqueries = new ArrayList<>();
String near = profile.getNearDomain(); String near = profile.getNearDomain();
@ -219,7 +220,7 @@ public class QueryFactory {
searchTermsAdvice.clear(); searchTermsAdvice.clear();
} }
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority); SearchSubquery subquery = new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
params.profile().addTacitTerms(subquery); params.profile().addTacitTerms(subquery);
params.jsSetting().addTacitTerms(subquery); params.jsSetting().addTacitTerms(subquery);
@ -231,7 +232,7 @@ public class QueryFactory {
if (near != null) { if (near != null) {
if (domain == null) { if (domain == null) {
domains = nearQueryProcessor.getRelatedDomains(near, problems::add); domains = dbNearDomainsQuery.getRelatedDomains(near, problems::add);
} }
} }
@ -242,7 +243,7 @@ public class QueryFactory {
domainLimit = 2; domainLimit = 2;
} }
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder() var specsBuilder = SearchSpecification.builder()
.subqueries(subqueries) .subqueries(subqueries)
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096)) .queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
.humanQuery(query) .humanQuery(query)
@ -254,7 +255,7 @@ public class QueryFactory {
.queryStrategy(queryStrategy) .queryStrategy(queryStrategy)
.searchSetIdentifier(profile.searchSetIdentifier); .searchSetIdentifier(profile.searchSetIdentifier);
EdgeSearchSpecification specs = specsBuilder.build(); SearchSpecification specs = specsBuilder.build();
return new SearchQuery(specs, searchTermsHuman, domain); return new SearchQuery(specs, searchTermsHuman, domain);
} }

View File

@ -1,19 +1,19 @@
package nu.marginalia.search.query.model; package nu.marginalia.search.query.model;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import java.util.*; import java.util.*;
@AllArgsConstructor @AllArgsConstructor
public class SearchQuery { public class SearchQuery {
public final EdgeSearchSpecification specs; public final SearchSpecification specs;
public final Set<String> problems = new TreeSet<>(); public final Set<String> problems = new TreeSet<>();
public final List<String> searchTermsHuman; public final List<String> searchTermsHuman;
public String domain; public String domain;
public SearchQuery(EdgeSearchSpecification justSpecs) { public SearchQuery(SearchSpecification justSpecs) {
searchTermsHuman = new ArrayList<>(); searchTermsHuman = new ArrayList<>();
specs = justSpecs; specs = justSpecs;
} }

View File

@ -8,7 +8,7 @@ import nu.marginalia.search.db.DbUrlDetailsQuery;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState; import nu.marginalia.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.model.id.EdgeIdList; import nu.marginalia.model.id.EdgeIdList;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.valuation.SearchResultValuator; import nu.marginalia.search.valuation.SearchResultValuator;
import nu.marginalia.util.BrailleBlockPunchCards; import nu.marginalia.util.BrailleBlockPunchCards;
@ -29,11 +29,11 @@ public class SearchResultDecorator {
this.valuator = valuator; this.valuator = valuator;
} }
public List<UrlDetails> getAllUrlDetails(List<EdgeSearchResultItem> resultItems) { public List<UrlDetails> getAllUrlDetails(List<SearchResultItem> resultItems) {
TIntObjectHashMap<UrlDetails> detailsById = new TIntObjectHashMap<>(resultItems.size()); TIntObjectHashMap<UrlDetails> detailsById = new TIntObjectHashMap<>(resultItems.size());
EdgeIdList<EdgeUrl> idList = resultItems.stream() EdgeIdList<EdgeUrl> idList = resultItems.stream()
.mapToInt(EdgeSearchResultItem::getUrlIdInt) .mapToInt(SearchResultItem::getUrlIdInt)
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll); .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
List<UrlDetails> ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList); List<UrlDetails> ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList);
@ -72,14 +72,14 @@ public class SearchResultDecorator {
return retList; return retList;
} }
private String getPositionsString(EdgeSearchResultItem resultItem) { private String getPositionsString(SearchResultItem resultItem) {
Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8); Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8);
for (var score : resultItem.scores) { for (var score : resultItem.scores) {
if (!score.isRegular()) { if (!score.isKeywordRegular()) {
continue; continue;
} }
positionsPerSet.merge(score.set(), score.positions(), this::and); positionsPerSet.merge(score.subquery(), score.positions(), this::and);
} }
int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0); int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0);
@ -95,7 +95,7 @@ public class SearchResultDecorator {
return a | b; return a | b;
} }
private double calculateTermScore(EdgeSearchResultItem resultItem, UrlDetails details) { private double calculateTermScore(SearchResultItem resultItem, UrlDetails details) {
final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0; final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0;
final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length()); final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length());

View File

@ -3,7 +3,7 @@ package nu.marginalia.search.svc;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.search.client.model.ApiSearchResultQueryDetails; import nu.marginalia.search.client.model.ApiSearchResultQueryDetails;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.search.SearchOperator; import nu.marginalia.search.SearchOperator;
@ -62,7 +62,7 @@ public class SearchApiQueryService {
ApiSearchResult convert(UrlDetails url) { ApiSearchResult convert(UrlDetails url) {
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>(); List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
if (url.resultItem != null) { if (url.resultItem != null) {
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set)); var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
outer: outer:
for (var entries : bySet.values()) { for (var entries : bySet.values()) {
@ -73,7 +73,7 @@ public class SearchApiQueryService {
continue outer; continue outer;
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet()); Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags)); lst.add(new ApiSearchResultQueryDetails(entry.keyword, metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
} }
details.add(lst); details.add(lst);
} }

View File

@ -3,8 +3,8 @@ package nu.marginalia.search.svc;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.index.client.IndexClient; import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem; import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.search.model.PageScoreAdjustment; import nu.marginalia.search.model.PageScoreAdjustment;
import nu.marginalia.search.model.UrlDetails; import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.SearchResultDecorator; import nu.marginalia.search.results.SearchResultDecorator;
@ -37,7 +37,7 @@ public class SearchQueryIndexService {
} }
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) { public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs); final List<SearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results); List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
@ -70,7 +70,7 @@ public class SearchQueryIndexService {
private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}"); private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, EdgeSearchSpecification specs) { private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, SearchSpecification specs) {
String titleLC = p.title == null ? "" : p.title.toLowerCase(); String titleLC = p.title == null ? "" : p.title.toLowerCase();
String descLC = p.description == null ? "" : p.description.toLowerCase(); String descLC = p.description == null ? "" : p.description.toLowerCase();
String urlLC = p.url == null ? "" : p.url.path.toLowerCase(); String urlLC = p.url == null ? "" : p.url.path.toLowerCase();

View File

@ -5,8 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.language.statistics.TermFrequencyDict; import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery; import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.language.WordPatterns; import nu.marginalia.language.WordPatterns;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
@ -35,7 +35,7 @@ public class SearchResultValuator {
} }
public double preEvaluate(EdgeSearchSubquery sq) { public double preEvaluate(SearchSubquery sq) {
final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new); final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new);
double termSum = 0.; double termSum = 0.;
@ -56,8 +56,8 @@ public class SearchResultValuator {
return termSum / factorSum; return termSum / factorSum;
} }
public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, int length, int titleLength) { public double evaluateTerms(List<SearchResultKeywordScore> rawScores, int length, int titleLength) {
int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0); int sets = 1 + rawScores.stream().mapToInt(SearchResultKeywordScore::subquery).max().orElse(0);
double bestScore = 10; double bestScore = 10;
double bestAllTermsFactor = 1.; double bestAllTermsFactor = 1.;
@ -88,10 +88,10 @@ public class SearchResultValuator {
return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus; return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus;
} }
private boolean hasPriorityTerm(List<EdgeSearchResultKeywordScore> rawScores) { private boolean hasPriorityTerm(List<SearchResultKeywordScore> rawScores) {
return rawScores.stream() return rawScores.stream()
.findAny() .findAny()
.map(EdgeSearchResultKeywordScore::hasPriorityTerms) .map(SearchResultKeywordScore::hasPriorityTerms)
.orElse(false); .orElse(false);
} }
@ -260,11 +260,11 @@ public class SearchResultValuator {
return f; return f;
} }
private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) { private double[] getTermWeights(SearchResultKeywordScore[] scores) {
double[] weights = new double[scores.length]; double[] weights = new double[scores.length];
for (int i = 0; i < scores.length; i++) { for (int i = 0; i < scores.length; i++) {
String[] parts = separator.split(scores[i].keyword()); String[] parts = separator.split(scores[i].keyword);
double sumScore = 0.; double sumScore = 0.;
int count = 0; int count = 0;
@ -305,8 +305,8 @@ public class SearchResultValuator {
return weights; return weights;
} }
private SearchResultsKeywordSet createKeywordSet(List<EdgeSearchResultKeywordScore> rawScores, int thisSet) { private SearchResultsKeywordSet createKeywordSet(List<SearchResultKeywordScore> rawScores, int thisSet) {
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); SearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.subquery() == thisSet && !w.keyword.contains(":")).toArray(SearchResultKeywordScore[]::new);
if (scores.length == 0) { if (scores.length == 0) {
return null; return null;
} }
@ -322,8 +322,8 @@ public class SearchResultValuator {
} }
private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, WordMetadata wordMetadata, double weight) { private record SearchResultsKeyword(SearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
public SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) { public SearchResultsKeyword(SearchResultKeywordScore score, double weight) {
this(score, new WordMetadata(score.encodedWordMetadata()), weight); this(score, new WordMetadata(score.encodedWordMetadata()), weight);
} }

View File

@ -3,7 +3,7 @@ package nu.marginalia.search.query;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.language.statistics.EnglishDictionary; import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification; import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.language.statistics.NGramBloomFilter; import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict; import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.search.command.SearchJsParameter; import nu.marginalia.search.command.SearchJsParameter;
@ -37,7 +37,7 @@ public class QueryFactoryTest {
); );
} }
public EdgeSearchSpecification parseAndGetSpecs(String query) { public SearchSpecification parseAndGetSpecs(String query) {
return queryFactory.createQuery( return queryFactory.createQuery(
new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT) new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT)
).specs; ).specs;

View File

@ -1,6 +1,6 @@
package nu.marginalia.search.valuation; package nu.marginalia.search.valuation;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore; import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.language.statistics.TermFrequencyDict; import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.crawl.EdgePageDocumentFlags; import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.crawl.EdgePageWordFlags; import nu.marginalia.model.crawl.EdgePageWordFlags;
@ -31,29 +31,29 @@ class SearchResultValuatorTest {
valuator = new SearchResultValuator(dict); valuator = new SearchResultValuator(dict);
} }
List<EdgeSearchResultKeywordScore> titleOnlyLowCountSet = List.of( List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
new EdgeSearchResultKeywordScore(0, "bob", new SearchResultKeywordScore(0, "bob",
wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)), wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false) false)
); );
List<EdgeSearchResultKeywordScore> highCountNoTitleSet = List.of( List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
new EdgeSearchResultKeywordScore(0, "bob", new SearchResultKeywordScore(0, "bob",
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)), wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false) false)
); );
List<EdgeSearchResultKeywordScore> highCountSubjectSet = List.of( List<SearchResultKeywordScore> highCountSubjectSet = List.of(
new EdgeSearchResultKeywordScore(0, "bob", new SearchResultKeywordScore(0, "bob",
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)), wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false) false)
); );
List<EdgeSearchResultKeywordScore> first = List.of( List<SearchResultKeywordScore> first = List.of(
new EdgeSearchResultKeywordScore(0, "bob", new SearchResultKeywordScore(0, "bob",
wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)), wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)), docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false) false)