Clean up search-service and index-api

This commit is contained in:
Viktor Lofgren 2023-03-11 12:26:12 +01:00
parent c2f9980eba
commit 73e412ea5b
27 changed files with 293 additions and 270 deletions

View File

@ -7,9 +7,9 @@ import io.reactivex.rxjava3.core.Observable;
import nu.marginalia.WmsaHome;
import nu.marginalia.client.AbstractDynamicClient;
import nu.marginalia.client.Context;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultSet;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.descriptor.ServiceDescriptors;
import nu.marginalia.service.id.ServiceId;
@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
}
@CheckReturnValue
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
return wmsa_search_index_api_time.time(
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
);
}

View File

@ -1,13 +0,0 @@
package nu.marginalia.index.client.model.domain;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.EdgeIdList;
/**
 * Immutable holder pairing a keyword with a list of URL ids.
 *
 * Lombok generates the all-arguments constructor (arguments follow the field
 * declaration order: keyword, results), the getters and toString().
 */
@AllArgsConstructor @Getter @ToString
public class EdgeDomainSearchResults {
/** The keyword these results belong to (presumably the searched keyword; confirm against the producer). */
public final String keyword;
/** The URL ids carried as results for the keyword. */
public final EdgeIdList<EdgeUrl> results;
}

View File

@ -1,14 +0,0 @@
package nu.marginalia.index.client.model.domain;
import lombok.AllArgsConstructor;
import lombok.ToString;
/**
 * Immutable parameter object for a keyword-based domain search.
 *
 * Lombok generates toString() and the all-arguments constructor, whose
 * arguments follow the field declaration order:
 * keyword, queryDepth, minHitCount, maxResults.
 */
@ToString @AllArgsConstructor
public class EdgeDomainSearchSpecification {
/** The keyword to search for. */
public final String keyword;
// NOTE(review): the exact semantics of the three limits below are defined by
// the consumer of this class, which is not visible here; confirm before relying on them.
/** Query depth setting. */
public final int queryDepth;
/** Minimum hit count setting. */
public final int minHitCount;
/** Maximum number of results setting. */
public final int maxResults;
}

View File

@ -8,9 +8,12 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
import java.util.List;
@ToString @Getter @Builder @With @AllArgsConstructor
public class EdgeSearchSpecification {
public List<EdgeSearchSubquery> subqueries;
public class SearchSpecification {
public List<SearchSubquery> subqueries;
/** If present and not empty, limit the search to these domain IDs */
public List<Integer> domains;
public SearchSetIdentifier searchSetIdentifier;
public final String humanQuery;
@ -21,6 +24,7 @@ public class EdgeSearchSpecification {
public final SpecificationLimit rank;
public final QueryLimits queryLimits;
public final QueryStrategy queryStrategy;
}

View File

@ -2,26 +2,32 @@ package nu.marginalia.index.client.model.query;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.List;
import java.util.stream.Collectors;
@Getter
@AllArgsConstructor
public class EdgeSearchSubquery {
public class SearchSubquery {
/** These terms must be present in the document */
public final List<String> searchTermsInclude;
/** These terms must be absent from the document */
public final List<String> searchTermsExclude;
/** These terms must be present in the document, but are not used in ranking */
public final List<String> searchTermsAdvice;
/** If these optional terms are present in the document, rank it highly */
public final List<String> searchTermsPriority;
private double value = 0;
public EdgeSearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority
public SearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority
) {
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
@ -29,7 +35,7 @@ public class EdgeSearchSubquery {
this.searchTermsPriority = searchTermsPriority;
}
public EdgeSearchSubquery setValue(double value) {
public SearchSubquery setValue(double value) {
if (Double.isInfinite(value) || Double.isNaN(value)) {
this.value = Double.MAX_VALUE;
} else {

View File

@ -1,84 +0,0 @@
package nu.marginalia.index.client.model.results;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import static java.lang.Integer.lowestOneBit;
import static java.lang.Integer.numberOfTrailingZeros;
/**
 * Describes how a single keyword matched a document.
 *
 * @param set                 identifier of the query set the keyword belongs to
 * @param keyword             the matched keyword
 * @param encodedWordMetadata encoded word metadata, decoded via {@link WordMetadata}
 * @param encodedDocMetadata  encoded document metadata, decoded via {@link DocumentMetadata}
 * @param hasPriorityTerms    priority-term indicator supplied by the creator of the score
 */
public record EdgeSearchResultKeywordScore(int set,
                                           String keyword,
                                           long encodedWordMetadata,
                                           long encodedDocMetadata,
                                           boolean hasPriorityTerms) {

    /**
     * Computes the document-level value from the encoded document metadata:
     * quality / 5, plus topology, plus 20 when the Simple flag is set, plus an
     * adjustment derived from the rank relative to 13.
     *
     * @return the summed document value
     */
    public double documentValue() {
        // Accumulate in a double. With a long accumulator the compound
        // assignment below implicitly narrows the result back to long and
        // silently discards the fractional part of quality / 5.
        double sum = 0;

        sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
        sum += DocumentMetadata.decodeTopology(encodedDocMetadata);

        if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
            sum += 20;
        }

        int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
        // rank is an int, so the divisions below are integer divisions.
        if (rank < 0) {
            sum += rank / 2;
        } else {
            sum += rank / 4;
        }

        return sum;
    }

    /** Tests whether the given flag is set in the encoded word metadata. */
    private boolean hasTermFlag(EdgePageWordFlags flag) {
        return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
    }

    /**
     * Computes the term-level value from the encoded word metadata. A fixed
     * amount is subtracted for each word flag that is set, followed by
     * tf-idf / 10 and a third of the number of set position bits.
     *
     * @return the summed term value
     */
    public double termValue() {
        double sum = 0;

        if (hasTermFlag(EdgePageWordFlags.Title)) {
            sum -= 15;
        }

        // Site and SiteAdjacent are mutually exclusive contributions.
        if (hasTermFlag(EdgePageWordFlags.Site)) {
            sum -= 10;
        } else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
            sum -= 5;
        }

        if (hasTermFlag(EdgePageWordFlags.Subjects)) {
            sum -= 10;
        }
        if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
            sum -= 1;
        }
        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
            sum -= 5;
        }
        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
            sum -= 5;
        }

        double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
        int positionBits = WordMetadata.decodePositions(encodedWordMetadata);

        sum -= tfIdf / 10.;
        sum -= Integer.bitCount(positionBits) / 3.;

        return sum;
    }

    /** @return the position bits decoded from the word metadata */
    public int positions() {
        return WordMetadata.decodePositions(encodedWordMetadata);
    }

    /** @return true if the keyword contains a ':' or carries the Synthetic flag */
    public boolean isSpecial() {
        return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
    }

    /** @return true if the keyword is not special, see {@link #isSpecial()} */
    public boolean isRegular() {
        return !isSpecial();
    }
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.index.client.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;
/**
 * Thin wrapper around a list of {@link EdgeSearchResultItem}s.
 *
 * Lombok supplies the list-accepting constructor, the getter and toString().
 */
@AllArgsConstructor
@Getter
@ToString
public class EdgeSearchResults {
    /** The wrapped result items. */
    public final List<EdgeSearchResultItem> results;

    /** Creates an instance backed by a new, empty {@link ArrayList}. */
    public EdgeSearchResults() {
        this.results = new ArrayList<>();
    }

    /** @return the number of wrapped result items */
    public int size() {
        final List<EdgeSearchResultItem> items = this.results;
        return items.size();
    }

    /** @return a stream over the wrapped result items */
    public Stream<EdgeSearchResultItem> stream() {
        final List<EdgeSearchResultItem> items = this.results;
        return items.stream();
    }
}

View File

@ -8,15 +8,19 @@ import nu.marginalia.model.id.EdgeId;
import java.util.ArrayList;
import java.util.List;
/** Represents a document matching a search query */
@AllArgsConstructor @Getter
public class EdgeSearchResultItem {
public class SearchResultItem {
/** Encoded ID that contains both the URL id and its ranking */
public final long combinedId;
public final List<EdgeSearchResultKeywordScore> scores;
/** How did the subqueries match against the document? */
public final List<SearchResultKeywordScore> scores;
/** How many other potential results existed in the same domain */
public int resultsFromDomain;
public EdgeSearchResultItem(long val) {
public SearchResultItem(long val) {
this.combinedId = val;
this.scores = new ArrayList<>(16);
}
@ -62,7 +66,7 @@ public class EdgeSearchResultItem {
return false;
if (other == this)
return true;
if (other instanceof EdgeSearchResultItem o) {
if (other instanceof SearchResultItem o) {
return o.getUrlIdInt() == getUrlIdInt();
}
return false;

View File

@ -0,0 +1,145 @@
package nu.marginalia.index.client.model.results;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import java.util.Objects;
/**
 * Records how one keyword of a subquery matched a document, and carries the
 * encoded word and document metadata from which the valuation terms are decoded.
 */
public final class SearchResultKeywordScore {
    /** Identifier of the subquery the keyword was matched for. */
    public final int subquery;
    /** The matched keyword. */
    public final String keyword;
    private final long encodedWordMetadata;
    private final long encodedDocMetadata;
    private final boolean hasPriorityTerms;

    /**
     * @param subquery            identifier of the subquery the keyword belongs to
     * @param keyword             the matched keyword
     * @param encodedWordMetadata encoded word metadata, decoded via {@link WordMetadata}
     * @param encodedDocMetadata  encoded document metadata, decoded via {@link DocumentMetadata}
     * @param hasPriorityTerms    priority-term indicator supplied by the creator of the score
     */
    public SearchResultKeywordScore(int subquery,
                                    String keyword,
                                    long encodedWordMetadata,
                                    long encodedDocMetadata,
                                    boolean hasPriorityTerms) {
        this.subquery = subquery;
        this.keyword = keyword;
        this.encodedWordMetadata = encodedWordMetadata;
        this.encodedDocMetadata = encodedDocMetadata;
        this.hasPriorityTerms = hasPriorityTerms;
    }

    /** Tests whether the given flag is set in the encoded word metadata. */
    private boolean hasTermFlag(EdgePageWordFlags flag) {
        return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
    }

    /**
     * Computes the document-level value from the encoded document metadata:
     * quality / 5, plus topology, plus 20 when the Simple flag is set, plus an
     * adjustment derived from the rank relative to 13.
     *
     * @return the summed document value
     */
    public double documentValue() {
        // Accumulate in a double. With a long accumulator the compound
        // assignment below implicitly narrows the result back to long and
        // silently discards the fractional part of quality / 5.
        double sum = 0;

        sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
        sum += DocumentMetadata.decodeTopology(encodedDocMetadata);

        if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
            sum += 20;
        }

        int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
        // rank is an int, so the divisions below are integer divisions.
        if (rank < 0) {
            sum += rank / 2;
        } else {
            sum += rank / 4;
        }

        return sum;
    }

    /**
     * Computes the term-level value from the encoded word metadata. A fixed
     * amount is subtracted for each word flag that is set, followed by
     * tf-idf / 10 and a third of the number of set position bits.
     *
     * @return the summed term value
     */
    public double termValue() {
        double sum = 0;

        if (hasTermFlag(EdgePageWordFlags.Title)) {
            sum -= 15;
        }

        // Site and SiteAdjacent are mutually exclusive contributions.
        if (hasTermFlag(EdgePageWordFlags.Site)) {
            sum -= 10;
        } else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
            sum -= 5;
        }

        if (hasTermFlag(EdgePageWordFlags.Subjects)) {
            sum -= 10;
        }
        if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
            sum -= 1;
        }
        if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
            sum -= 5;
        }
        if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
            sum -= 5;
        }

        double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
        int positionBits = WordMetadata.decodePositions(encodedWordMetadata);

        sum -= tfIdf / 10.;
        sum -= Integer.bitCount(positionBits) / 3.;

        return sum;
    }

    /** @return the identifier of the subquery the keyword belongs to */
    public int subquery() {
        return subquery;
    }

    /** @return the position bits decoded from the word metadata */
    public int positions() {
        return WordMetadata.decodePositions(encodedWordMetadata);
    }

    /** @return true if the keyword contains a ':' or carries the Synthetic flag */
    public boolean isKeywordSpecial() {
        return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
    }

    /** @return true if the keyword is not special, see {@link #isKeywordSpecial()} */
    public boolean isKeywordRegular() {
        return !isKeywordSpecial();
    }

    /** @return the raw encoded word metadata */
    public long encodedWordMetadata() {
        return encodedWordMetadata;
    }

    /** @return the raw encoded document metadata */
    public long encodedDocMetadata() {
        return encodedDocMetadata;
    }

    /** @return the priority-term indicator passed to the constructor */
    public boolean hasPriorityTerms() {
        return hasPriorityTerms;
    }

    /** Two scores are equal when all five components are equal. */
    @Override
    public boolean equals(Object obj) {
        if (obj == this) {
            return true;
        }
        // The class is final, so instanceof is equivalent to an exact class comparison.
        if (!(obj instanceof SearchResultKeywordScore that)) {
            return false;
        }
        return this.subquery == that.subquery
                && Objects.equals(this.keyword, that.keyword)
                && this.encodedWordMetadata == that.encodedWordMetadata
                && this.encodedDocMetadata == that.encodedDocMetadata
                && this.hasPriorityTerms == that.hasPriorityTerms;
    }

    @Override
    public int hashCode() {
        return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata, hasPriorityTerms);
    }

    /** Renders all five components, each labelled with its field name. */
    @Override
    public String toString() {
        return "SearchResultKeywordScore[" +
                "subquery=" + subquery + ", " +
                "keyword=" + keyword + ", " +
                "encodedWordMetadata=" + encodedWordMetadata + ", " +
                "encodedDocMetadata=" + encodedDocMetadata + ", " +
                "hasPriorityTerms=" + hasPriorityTerms + ']';
    }
}

View File

@ -7,8 +7,8 @@ import lombok.ToString;
import java.util.List;
@AllArgsConstructor @Getter @ToString
public class EdgeSearchResultSet {
public List<EdgeSearchResultItem> results;
public class SearchResultSet {
public List<SearchResultItem> results;
public int size() {
return results.size();

View File

@ -2,7 +2,7 @@ package nu.marginalia.index.results;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultItem;
public class IndexResultDomainDeduplicator {
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
@ -21,7 +21,7 @@ public class IndexResultDomainDeduplicator {
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
}
public boolean test(EdgeSearchResultItem item) {
public boolean test(SearchResultItem item) {
final long key = item.deduplicationKey();
if (key == 0)
return true;
@ -29,7 +29,7 @@ public class IndexResultDomainDeduplicator {
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
}
public int getCount(EdgeSearchResultItem item) {
public int getCount(SearchResultItem item) {
final long key = item.deduplicationKey();
if (key == 0)
return 1;

View File

@ -9,9 +9,9 @@ import nu.marginalia.index.svc.SearchTermsService;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.IndexQueryParams;
import java.util.List;
@ -31,7 +31,7 @@ public class IndexResultValuator {
public IndexResultValuator(SearchTermsService searchTermsSvc,
IndexMetadataService metadataService,
TLongList results,
List<EdgeSearchSubquery> subqueries,
List<SearchSubquery> subqueries,
IndexQueryParams queryParams) {
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
this.queryParams = queryParams;
@ -71,9 +71,9 @@ public class IndexResultValuator {
}
public EdgeSearchResultItem evaluateResult(long id) {
public SearchResultItem evaluateResult(long id) {
EdgeSearchResultItem searchResult = new EdgeSearchResultItem(id);
SearchResultItem searchResult = new SearchResultItem(id);
final long urlIdInt = searchResult.getUrlIdInt();
searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
@ -99,7 +99,7 @@ public class IndexResultValuator {
return searchResult;
}
private double evaluateSubquery(EdgeSearchResultItem searchResult,
private double evaluateSubquery(SearchResultItem searchResult,
long docMetadata,
int querySetId,
List<String> termList)
@ -114,7 +114,7 @@ public class IndexResultValuator {
long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt());
EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(
SearchResultKeywordScore score = new SearchResultKeywordScore(
querySetId,
searchTerm,
metadata,

View File

@ -9,10 +9,10 @@ import gnu.trove.set.hash.TLongHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.EdgeSearchResultSet;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultSet;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.array.buffer.LongQueryBuffer;
import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.index.SearchIndexSearchTerms;
@ -73,13 +73,13 @@ public class IndexQueryService {
public Object search(Request request, Response response) {
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
try {
return wmsa_edge_index_query_time.time(() -> {
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
List<EdgeSearchResultItem> results = executeSearch(params);
List<SearchResultItem> results = executeSearch(params);
logger.info(queryMarker, "Index Result Count: {}", results.size());
wmsa_edge_index_query_cost.set(params.getDataCost());
@ -87,7 +87,7 @@ public class IndexQueryService {
wmsa_edge_index_query_timeouts.inc();
}
return new EdgeSearchResultSet(results);
return new SearchResultSet(results);
});
}
catch (HaltException ex) {
@ -103,11 +103,11 @@ public class IndexQueryService {
}
// exists for test access
EdgeSearchResultSet justQuery(EdgeSearchSpecification specsSet) {
return new EdgeSearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
SearchResultSet justQuery(SearchSpecification specsSet) {
return new SearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
}
private SearchSet getSearchSet(EdgeSearchSpecification specsSet) {
private SearchSet getSearchSet(SearchSpecification specsSet) {
if (specsSet.domains != null && !specsSet.domains.isEmpty()) {
return new SmallSearchSet(specsSet.domains);
}
@ -115,7 +115,7 @@ public class IndexQueryService {
return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier);
}
private List<EdgeSearchResultItem> executeSearch(SearchParameters params) {
private List<SearchResultItem> executeSearch(SearchParameters params) {
var resultIds = evaluateSubqueries(params);
var resultItems = calculateResultScores(params, resultIds);
@ -176,7 +176,7 @@ public class IndexQueryService {
return results;
}
private ArrayList<EdgeSearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
private ArrayList<SearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
final var evaluator = new IndexResultValuator(
searchTermsSvc,
@ -185,7 +185,7 @@ public class IndexQueryService {
params.subqueries,
params.queryParams);
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
ArrayList<SearchResultItem> items = new ArrayList<>(results.size());
// Sorting the result ids results in better paging characteristics
results.sort();
@ -206,15 +206,15 @@ public class IndexQueryService {
return items;
}
private List<EdgeSearchResultItem> selectBestResults(SearchParameters params, List<EdgeSearchResultItem> results) {
private List<SearchResultItem> selectBestResults(SearchParameters params, List<SearchResultItem> results) {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
results.sort(comparingDouble(EdgeSearchResultItem::getScore)
.thenComparingInt(EdgeSearchResultItem::getRanking)
.thenComparingInt(EdgeSearchResultItem::getUrlIdInt));
results.sort(comparingDouble(SearchResultItem::getScore)
.thenComparingInt(SearchResultItem::getRanking)
.thenComparingInt(SearchResultItem::getUrlIdInt));
List<EdgeSearchResultItem> resultsList = new ArrayList<>(results.size());
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
for (var item : results) {
if (domainCountFilter.test(item)) {
@ -245,7 +245,7 @@ class SearchParameters {
before evaluating them for the best result. */
final int fetchSize;
final IndexSearchBudget budget;
final List<EdgeSearchSubquery> subqueries;
final List<SearchSubquery> subqueries;
final IndexQueryParams queryParams;
final int limitByDomain;
@ -261,7 +261,7 @@ class SearchParameters {
*/
final TLongHashSet consideredUrlIds;
public SearchParameters(EdgeSearchSpecification specsSet, SearchSet searchSet) {
public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
var limits = specsSet.queryLimits;
this.fetchSize = limits.fetchSize();

View File

@ -5,7 +5,7 @@ import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.dict.OffHeapDictionaryHashMap;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.index.SearchIndexSearchTerms;
import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
import org.slf4j.Logger;
@ -23,7 +23,7 @@ public class SearchTermsService {
this.lexicon = lexicon;
}
public SearchIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
public SearchIndexSearchTerms getSearchTerms(SearchSubquery request) {
final IntList excludes = new IntArrayList();
final IntList includes = new IntArrayList();
final IntList priority = new IntArrayList();

View File

@ -2,10 +2,10 @@ package nu.marginalia.index.svc;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -79,7 +79,7 @@ public class IndexQueryServiceIntegrationTest {
searchIndex.switchIndex();
var rsp = queryService.justQuery(
EdgeSearchSpecification.builder()
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.queryStrategy(QueryStrategy.SENTENCE)
.year(SpecificationLimit.none())
@ -88,7 +88,7 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none())
.domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new EdgeSearchSubquery(
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
))).build());
@ -96,7 +96,7 @@ public class IndexQueryServiceIntegrationTest {
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
rsp.results
.stream()
.mapToInt(EdgeSearchResultItem::getUrlIdInt)
.mapToInt(SearchResultItem::getUrlIdInt)
.toArray());
}
@ -111,7 +111,7 @@ public class IndexQueryServiceIntegrationTest {
searchIndex.switchIndex();
var rsp = queryService.justQuery(
EdgeSearchSpecification.builder()
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
@ -119,12 +119,12 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2))
.subqueries(List.of(new EdgeSearchSubquery(
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
))).build());
Assertions.assertArrayEquals(
new int[] { 210, 270 },
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
}
@Test
@ -136,7 +136,7 @@ public class IndexQueryServiceIntegrationTest {
searchIndex.switchIndex();
var rsp = queryService.justQuery(
EdgeSearchSpecification.builder()
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.quality(SpecificationLimit.none())
.year(SpecificationLimit.equals(1998))
@ -144,14 +144,14 @@ public class IndexQueryServiceIntegrationTest {
.rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE)
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new EdgeSearchSubquery(
.subqueries(List.of(new SearchSubquery(
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
))
).build());
Assertions.assertArrayEquals(
new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 },
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
}

View File

@ -1,6 +1,6 @@
package nu.marginalia.search.command;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.query.SearchSubquery;
import javax.annotation.Nullable;
import java.util.Arrays;
@ -25,7 +25,7 @@ public enum SearchJsParameter {
return DEFAULT;
}
public void addTacitTerms(EdgeSearchSubquery subquery) {
public void addTacitTerms(SearchSubquery subquery) {
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.search.query;
package nu.marginalia.search.db;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
@ -9,12 +9,12 @@ import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
public class NearQueryProcessor {
public class DbNearDomainsQuery {
private final HikariDataSource dataSource;
@Inject
public NearQueryProcessor(HikariDataSource dataSource) {
public DbNearDomainsQuery(HikariDataSource dataSource) {
this.dataSource = dataSource;
}

View File

@ -2,7 +2,7 @@ package nu.marginalia.search.model;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import java.util.Objects;
@ -45,7 +45,7 @@ public enum SearchProfile {
return YOLO;
}
public void addTacitTerms(EdgeSearchSubquery subquery) {
public void addTacitTerms(SearchSubquery subquery) {
if (this == ACADEMIA) {
subquery.searchTermsPriority.add("tld:edu");
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.search.model;
import lombok.*;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
@ -36,7 +36,7 @@ public class UrlDetails {
public int resultsFromSameDomain;
public String positions;
public EdgeSearchResultItem resultItem;
public SearchResultItem resultItem;
public boolean hasMoreResults() {
return resultsFromSameDomain > 1;

View File

@ -3,8 +3,8 @@ package nu.marginalia.search.query;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.LanguageModels;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
@ -16,6 +16,7 @@ import nu.marginalia.query_parser.QueryPermutation;
import nu.marginalia.query_parser.QueryVariants;
import nu.marginalia.query_parser.token.Token;
import nu.marginalia.query_parser.token.TokenType;
import nu.marginalia.search.db.DbNearDomainsQuery;
import nu.marginalia.search.model.SearchProfile;
import nu.marginalia.search.query.model.SearchQuery;
import nu.marginalia.search.query.model.UserSearchParameters;
@ -34,7 +35,7 @@ public class QueryFactory {
private final EnglishDictionary englishDictionary;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final SearchResultValuator searchResultValuator;
private final NearQueryProcessor nearQueryProcessor;
private final DbNearDomainsQuery dbNearDomainsQuery;
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
private final ThreadLocal<QueryVariants> queryVariants;
@ -48,11 +49,11 @@ public class QueryFactory {
EnglishDictionary englishDictionary,
NGramBloomFilter nGramBloomFilter,
SearchResultValuator searchResultValuator,
NearQueryProcessor nearQueryProcessor) {
DbNearDomainsQuery dbNearDomainsQuery) {
this.englishDictionary = englishDictionary;
this.searchResultValuator = searchResultValuator;
this.nearQueryProcessor = nearQueryProcessor;
this.dbNearDomainsQuery = dbNearDomainsQuery;
this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
}
@ -67,13 +68,13 @@ public class QueryFactory {
public SearchQuery createQuery(UserSearchParameters params) {
final var processedQuery = createQuery(getQueryPermutation(), params);
final List<EdgeSearchSubquery> subqueries = processedQuery.specs.subqueries;
final List<SearchSubquery> subqueries = processedQuery.specs.subqueries;
for (var sq : subqueries) {
sq.setValue(searchResultValuator.preEvaluate(sq));
}
subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue));
subqueries.sort(Comparator.comparing(SearchSubquery::getValue));
trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT);
return processedQuery;
@ -84,16 +85,16 @@ public class QueryFactory {
int limitTotal,
String... termsInclude)
{
List<EdgeSearchSubquery> sqs = new ArrayList<>();
List<SearchSubquery> sqs = new ArrayList<>();
sqs.add(new EdgeSearchSubquery(
sqs.add(new SearchSubquery(
Arrays.asList(termsInclude),
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyList()
));
var specs = EdgeSearchSpecification.builder()
var specs = SearchSpecification.builder()
.subqueries(sqs)
.domains(Collections.emptyList())
.searchSetIdentifier(profile.searchSetIdentifier)
@ -170,7 +171,7 @@ public class QueryFactory {
}
var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
List<EdgeSearchSubquery> subqueries = new ArrayList<>();
List<SearchSubquery> subqueries = new ArrayList<>();
String near = profile.getNearDomain();
@ -219,7 +220,7 @@ public class QueryFactory {
searchTermsAdvice.clear();
}
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
SearchSubquery subquery = new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
params.profile().addTacitTerms(subquery);
params.jsSetting().addTacitTerms(subquery);
@ -231,7 +232,7 @@ public class QueryFactory {
if (near != null) {
if (domain == null) {
domains = nearQueryProcessor.getRelatedDomains(near, problems::add);
domains = dbNearDomainsQuery.getRelatedDomains(near, problems::add);
}
}
@ -242,7 +243,7 @@ public class QueryFactory {
domainLimit = 2;
}
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
var specsBuilder = SearchSpecification.builder()
.subqueries(subqueries)
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
.humanQuery(query)
@ -254,7 +255,7 @@ public class QueryFactory {
.queryStrategy(queryStrategy)
.searchSetIdentifier(profile.searchSetIdentifier);
EdgeSearchSpecification specs = specsBuilder.build();
SearchSpecification specs = specsBuilder.build();
return new SearchQuery(specs, searchTermsHuman, domain);
}

View File

@ -1,19 +1,19 @@
package nu.marginalia.search.query.model;
import lombok.AllArgsConstructor;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.query.SearchSpecification;
import java.util.*;
@AllArgsConstructor
public class SearchQuery {
public final EdgeSearchSpecification specs;
public final SearchSpecification specs;
public final Set<String> problems = new TreeSet<>();
public final List<String> searchTermsHuman;
public String domain;
public SearchQuery(EdgeSearchSpecification justSpecs) {
public SearchQuery(SearchSpecification justSpecs) {
searchTermsHuman = new ArrayList<>();
specs = justSpecs;
}

View File

@ -8,7 +8,7 @@ import nu.marginalia.search.db.DbUrlDetailsQuery;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.model.id.EdgeIdList;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.valuation.SearchResultValuator;
import nu.marginalia.util.BrailleBlockPunchCards;
@ -29,11 +29,11 @@ public class SearchResultDecorator {
this.valuator = valuator;
}
public List<UrlDetails> getAllUrlDetails(List<EdgeSearchResultItem> resultItems) {
public List<UrlDetails> getAllUrlDetails(List<SearchResultItem> resultItems) {
TIntObjectHashMap<UrlDetails> detailsById = new TIntObjectHashMap<>(resultItems.size());
EdgeIdList<EdgeUrl> idList = resultItems.stream()
.mapToInt(EdgeSearchResultItem::getUrlIdInt)
.mapToInt(SearchResultItem::getUrlIdInt)
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
List<UrlDetails> ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList);
@ -72,14 +72,14 @@ public class SearchResultDecorator {
return retList;
}
private String getPositionsString(EdgeSearchResultItem resultItem) {
private String getPositionsString(SearchResultItem resultItem) {
Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8);
for (var score : resultItem.scores) {
if (!score.isRegular()) {
if (!score.isKeywordRegular()) {
continue;
}
positionsPerSet.merge(score.set(), score.positions(), this::and);
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
}
int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0);
@ -95,7 +95,7 @@ public class SearchResultDecorator {
return a | b;
}
private double calculateTermScore(EdgeSearchResultItem resultItem, UrlDetails details) {
private double calculateTermScore(SearchResultItem resultItem, UrlDetails details) {
final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0;
final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length());

View File

@ -3,7 +3,7 @@ package nu.marginalia.search.svc;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.search.client.model.ApiSearchResultQueryDetails;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.search.SearchOperator;
@ -62,7 +62,7 @@ public class SearchApiQueryService {
ApiSearchResult convert(UrlDetails url) {
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
if (url.resultItem != null) {
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
outer:
for (var entries : bySet.values()) {
@ -73,7 +73,7 @@ public class SearchApiQueryService {
continue outer;
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
lst.add(new ApiSearchResultQueryDetails(entry.keyword, metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
}
details.add(lst);
}

View File

@ -3,8 +3,8 @@ package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.index.client.IndexClient;
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.search.model.PageScoreAdjustment;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.SearchResultDecorator;
@ -37,7 +37,7 @@ public class SearchQueryIndexService {
}
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
final List<SearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
@ -70,7 +70,7 @@ public class SearchQueryIndexService {
private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, EdgeSearchSpecification specs) {
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, SearchSpecification specs) {
String titleLC = p.title == null ? "" : p.title.toLowerCase();
String descLC = p.description == null ? "" : p.description.toLowerCase();
String urlLC = p.url == null ? "" : p.url.path.toLowerCase();

View File

@ -5,8 +5,8 @@ import com.google.inject.Singleton;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.crawl.EdgePageWordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.language.WordPatterns;
import org.jetbrains.annotations.NotNull;
@ -35,7 +35,7 @@ public class SearchResultValuator {
}
public double preEvaluate(EdgeSearchSubquery sq) {
public double preEvaluate(SearchSubquery sq) {
final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new);
double termSum = 0.;
@ -56,8 +56,8 @@ public class SearchResultValuator {
return termSum / factorSum;
}
public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, int length, int titleLength) {
int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0);
public double evaluateTerms(List<SearchResultKeywordScore> rawScores, int length, int titleLength) {
int sets = 1 + rawScores.stream().mapToInt(SearchResultKeywordScore::subquery).max().orElse(0);
double bestScore = 10;
double bestAllTermsFactor = 1.;
@ -88,10 +88,10 @@ public class SearchResultValuator {
return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus;
}
/**
 * Returns the {@code hasPriorityTerms()} value of one score taken from {@code rawScores}.
 * Only a single, arbitrary element is consulted (via {@code findAny}); the remaining
 * scores are not inspected. An empty list yields {@code false}.
 * NOTE(review): this presumably relies on all scores of one result sharing the same
 * flag value — confirm against where the scores are produced.
 *
 * @param rawScores keyword scores belonging to one search result
 * @return the flag of the sampled score, or {@code false} if there is none
 */
private boolean hasPriorityTerm(List<EdgeSearchResultKeywordScore> rawScores) {
private boolean hasPriorityTerm(List<SearchResultKeywordScore> rawScores) {
return rawScores.stream()
.findAny()
.map(EdgeSearchResultKeywordScore::hasPriorityTerms)
.map(SearchResultKeywordScore::hasPriorityTerms)
.orElse(false);
}
@ -260,11 +260,11 @@ public class SearchResultValuator {
return f;
}
private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) {
private double[] getTermWeights(SearchResultKeywordScore[] scores) {
double[] weights = new double[scores.length];
for (int i = 0; i < scores.length; i++) {
String[] parts = separator.split(scores[i].keyword());
String[] parts = separator.split(scores[i].keyword);
double sumScore = 0.;
int count = 0;
@ -305,8 +305,8 @@ public class SearchResultValuator {
return weights;
}
private SearchResultsKeywordSet createKeywordSet(List<EdgeSearchResultKeywordScore> rawScores, int thisSet) {
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
private SearchResultsKeywordSet createKeywordSet(List<SearchResultKeywordScore> rawScores, int thisSet) {
SearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.subquery() == thisSet && !w.keyword.contains(":")).toArray(SearchResultKeywordScore[]::new);
if (scores.length == 0) {
return null;
}
@ -322,8 +322,8 @@ public class SearchResultValuator {
}
private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
public SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) {
private record SearchResultsKeyword(SearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
public SearchResultsKeyword(SearchResultKeywordScore score, double weight) {
this(score, new WordMetadata(score.encodedWordMetadata()), weight);
}

View File

@ -3,7 +3,7 @@ package nu.marginalia.search.query;
import nu.marginalia.WmsaHome;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.language.statistics.EnglishDictionary;
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.language.statistics.NGramBloomFilter;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.search.command.SearchJsParameter;
@ -37,7 +37,7 @@ public class QueryFactoryTest {
);
}
public EdgeSearchSpecification parseAndGetSpecs(String query) {
public SearchSpecification parseAndGetSpecs(String query) {
return queryFactory.createQuery(
new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT)
).specs;

View File

@ -1,6 +1,6 @@
package nu.marginalia.search.valuation;
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
import nu.marginalia.language.statistics.TermFrequencyDict;
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
import nu.marginalia.model.crawl.EdgePageWordFlags;
@ -31,29 +31,29 @@ class SearchResultValuatorTest {
valuator = new SearchResultValuator(dict);
}
// Fixture: a single score for the keyword "bob" whose word metadata is built with
// only the Title flag and a one-element set.
// NOTE(review): Set.of(1) is presumably the set of occurrence positions and 32 the
// tf-idf value — confirm against the wordMetadata(...) helper.
List<EdgeSearchResultKeywordScore> titleOnlyLowCountSet = List.of(
new EdgeSearchResultKeywordScore(0, "bob",
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false)
);
// Fixture: a single score for the keyword "bob" whose word metadata is built with
// only the TfIdfHigh flag (no Title flag) and a twelve-element set.
// NOTE(review): the integer set is presumably the occurrence positions and 129 the
// tf-idf value — confirm against the wordMetadata(...) helper.
List<EdgeSearchResultKeywordScore> highCountNoTitleSet = List.of(
new EdgeSearchResultKeywordScore(0, "bob",
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false)
);
// Fixture: a single score for the keyword "bob" whose word metadata is built with
// the TfIdfHigh and Subjects flags and a twelve-element set.
// NOTE(review): the integer set is presumably the occurrence positions and 129 the
// tf-idf value — confirm against the wordMetadata(...) helper.
List<EdgeSearchResultKeywordScore> highCountSubjectSet = List.of(
new EdgeSearchResultKeywordScore(0, "bob",
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false)
);
List<EdgeSearchResultKeywordScore> first = List.of(
new EdgeSearchResultKeywordScore(0, "bob",
List<SearchResultKeywordScore> first = List.of(
new SearchResultKeywordScore(0, "bob",
wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
false)