mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
Clean up search-service and index-api
This commit is contained in:
parent
c2f9980eba
commit
73e412ea5b
@ -7,9 +7,9 @@ import io.reactivex.rxjava3.core.Observable;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.client.AbstractDynamicClient;
|
import nu.marginalia.client.AbstractDynamicClient;
|
||||||
import nu.marginalia.client.Context;
|
import nu.marginalia.client.Context;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultSet;
|
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.service.descriptor.ServiceDescriptors;
|
import nu.marginalia.service.descriptor.ServiceDescriptors;
|
||||||
import nu.marginalia.service.id.ServiceId;
|
import nu.marginalia.service.id.ServiceId;
|
||||||
@ -30,9 +30,9 @@ public class IndexClient extends AbstractDynamicClient {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@CheckReturnValue
|
@CheckReturnValue
|
||||||
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
|
public List<SearchResultItem> query(Context ctx, SearchSpecification specs) {
|
||||||
return wmsa_search_index_api_time.time(
|
return wmsa_search_index_api_time.time(
|
||||||
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
|
() -> this.postGet(ctx, "/search/", specs, SearchResultSet.class).blockingFirst().getResults()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,13 +0,0 @@
|
|||||||
package nu.marginalia.index.client.model.domain;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.ToString;
|
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import nu.marginalia.model.id.EdgeIdList;
|
|
||||||
|
|
||||||
@AllArgsConstructor @Getter @ToString
|
|
||||||
public class EdgeDomainSearchResults {
|
|
||||||
public final String keyword;
|
|
||||||
public final EdgeIdList<EdgeUrl> results;
|
|
||||||
}
|
|
@ -1,14 +0,0 @@
|
|||||||
package nu.marginalia.index.client.model.domain;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.ToString;
|
|
||||||
|
|
||||||
@ToString @AllArgsConstructor
|
|
||||||
public class EdgeDomainSearchSpecification {
|
|
||||||
|
|
||||||
public final String keyword;
|
|
||||||
|
|
||||||
public final int queryDepth;
|
|
||||||
public final int minHitCount;
|
|
||||||
public final int maxResults;
|
|
||||||
}
|
|
@ -8,9 +8,12 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||||
public class EdgeSearchSpecification {
|
public class SearchSpecification {
|
||||||
public List<EdgeSearchSubquery> subqueries;
|
public List<SearchSubquery> subqueries;
|
||||||
|
|
||||||
|
/** If present and not empty, limit the search to these domain IDs */
|
||||||
public List<Integer> domains;
|
public List<Integer> domains;
|
||||||
|
|
||||||
public SearchSetIdentifier searchSetIdentifier;
|
public SearchSetIdentifier searchSetIdentifier;
|
||||||
|
|
||||||
public final String humanQuery;
|
public final String humanQuery;
|
||||||
@ -21,6 +24,7 @@ public class EdgeSearchSpecification {
|
|||||||
public final SpecificationLimit rank;
|
public final SpecificationLimit rank;
|
||||||
|
|
||||||
public final QueryLimits queryLimits;
|
public final QueryLimits queryLimits;
|
||||||
|
|
||||||
public final QueryStrategy queryStrategy;
|
public final QueryStrategy queryStrategy;
|
||||||
|
|
||||||
}
|
}
|
@ -2,26 +2,32 @@ package nu.marginalia.index.client.model.query;
|
|||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.ToString;
|
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
@Getter
|
@Getter
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class EdgeSearchSubquery {
|
public class SearchSubquery {
|
||||||
|
|
||||||
|
/** These terms must be present in the document */
|
||||||
public final List<String> searchTermsInclude;
|
public final List<String> searchTermsInclude;
|
||||||
|
|
||||||
|
/** These terms must be absent from the document */
|
||||||
public final List<String> searchTermsExclude;
|
public final List<String> searchTermsExclude;
|
||||||
|
|
||||||
|
/** These terms must be present in the document, but are not used in ranking */
|
||||||
public final List<String> searchTermsAdvice;
|
public final List<String> searchTermsAdvice;
|
||||||
|
|
||||||
|
/** If these optional terms are present in the document, rank it highly */
|
||||||
public final List<String> searchTermsPriority;
|
public final List<String> searchTermsPriority;
|
||||||
|
|
||||||
private double value = 0;
|
private double value = 0;
|
||||||
|
|
||||||
public EdgeSearchSubquery(List<String> searchTermsInclude,
|
public SearchSubquery(List<String> searchTermsInclude,
|
||||||
List<String> searchTermsExclude,
|
List<String> searchTermsExclude,
|
||||||
List<String> searchTermsAdvice,
|
List<String> searchTermsAdvice,
|
||||||
List<String> searchTermsPriority
|
List<String> searchTermsPriority
|
||||||
) {
|
) {
|
||||||
this.searchTermsInclude = searchTermsInclude;
|
this.searchTermsInclude = searchTermsInclude;
|
||||||
this.searchTermsExclude = searchTermsExclude;
|
this.searchTermsExclude = searchTermsExclude;
|
||||||
@ -29,7 +35,7 @@ public class EdgeSearchSubquery {
|
|||||||
this.searchTermsPriority = searchTermsPriority;
|
this.searchTermsPriority = searchTermsPriority;
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeSearchSubquery setValue(double value) {
|
public SearchSubquery setValue(double value) {
|
||||||
if (Double.isInfinite(value) || Double.isNaN(value)) {
|
if (Double.isInfinite(value) || Double.isNaN(value)) {
|
||||||
this.value = Double.MAX_VALUE;
|
this.value = Double.MAX_VALUE;
|
||||||
} else {
|
} else {
|
@ -1,84 +0,0 @@
|
|||||||
package nu.marginalia.index.client.model.results;
|
|
||||||
|
|
||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
|
||||||
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
|
||||||
|
|
||||||
import static java.lang.Integer.lowestOneBit;
|
|
||||||
import static java.lang.Integer.numberOfTrailingZeros;
|
|
||||||
|
|
||||||
public record EdgeSearchResultKeywordScore(int set,
|
|
||||||
String keyword,
|
|
||||||
long encodedWordMetadata,
|
|
||||||
long encodedDocMetadata,
|
|
||||||
boolean hasPriorityTerms) {
|
|
||||||
public double documentValue() {
|
|
||||||
long sum = 0;
|
|
||||||
|
|
||||||
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
|
|
||||||
|
|
||||||
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
|
|
||||||
|
|
||||||
if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
|
|
||||||
sum += 20;
|
|
||||||
}
|
|
||||||
|
|
||||||
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
|
|
||||||
if (rank < 0)
|
|
||||||
sum += rank / 2;
|
|
||||||
else
|
|
||||||
sum += rank / 4;
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean hasTermFlag(EdgePageWordFlags flag) {
|
|
||||||
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
|
|
||||||
}
|
|
||||||
|
|
||||||
public double termValue() {
|
|
||||||
double sum = 0;
|
|
||||||
|
|
||||||
if (hasTermFlag(EdgePageWordFlags.Title)) {
|
|
||||||
sum -= 15;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hasTermFlag(EdgePageWordFlags.Site)) {
|
|
||||||
sum -= 10;
|
|
||||||
}
|
|
||||||
else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
|
|
||||||
sum -= 5;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hasTermFlag(EdgePageWordFlags.Subjects)) {
|
|
||||||
sum -= 10;
|
|
||||||
}
|
|
||||||
if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
|
|
||||||
sum -= 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
|
|
||||||
sum -= 5;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
|
|
||||||
sum -= 5;
|
|
||||||
}
|
|
||||||
|
|
||||||
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
|
||||||
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
|
||||||
|
|
||||||
sum -= tfIdf / 10.;
|
|
||||||
sum -= Integer.bitCount(positionBits) / 3.;
|
|
||||||
|
|
||||||
return sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int positions() { return WordMetadata.decodePositions(encodedWordMetadata); }
|
|
||||||
public boolean isSpecial() { return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic); }
|
|
||||||
public boolean isRegular() {
|
|
||||||
return !keyword.contains(":")
|
|
||||||
&& !hasTermFlag(EdgePageWordFlags.Synthetic);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,26 +0,0 @@
|
|||||||
package nu.marginalia.index.client.model.results;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.ToString;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
@AllArgsConstructor @Getter @ToString
|
|
||||||
public class EdgeSearchResults {
|
|
||||||
public final List<EdgeSearchResultItem> results;
|
|
||||||
|
|
||||||
public EdgeSearchResults() {
|
|
||||||
results = new ArrayList<>();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return results.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Stream<EdgeSearchResultItem> stream() {
|
|
||||||
return results.stream();
|
|
||||||
}
|
|
||||||
}
|
|
@ -8,15 +8,19 @@ import nu.marginalia.model.id.EdgeId;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
/** Represents a document matching a search query */
|
||||||
@AllArgsConstructor @Getter
|
@AllArgsConstructor @Getter
|
||||||
public class EdgeSearchResultItem {
|
public class SearchResultItem {
|
||||||
|
/** Encoded ID that contains both the URL id and its ranking */
|
||||||
public final long combinedId;
|
public final long combinedId;
|
||||||
|
|
||||||
public final List<EdgeSearchResultKeywordScore> scores;
|
/** How did the subqueries match against the document ? */
|
||||||
|
public final List<SearchResultKeywordScore> scores;
|
||||||
|
|
||||||
|
/** How many other potential results existed in the same domain */
|
||||||
public int resultsFromDomain;
|
public int resultsFromDomain;
|
||||||
|
|
||||||
public EdgeSearchResultItem(long val) {
|
public SearchResultItem(long val) {
|
||||||
this.combinedId = val;
|
this.combinedId = val;
|
||||||
this.scores = new ArrayList<>(16);
|
this.scores = new ArrayList<>(16);
|
||||||
}
|
}
|
||||||
@ -62,7 +66,7 @@ public class EdgeSearchResultItem {
|
|||||||
return false;
|
return false;
|
||||||
if (other == this)
|
if (other == this)
|
||||||
return true;
|
return true;
|
||||||
if (other instanceof EdgeSearchResultItem o) {
|
if (other instanceof SearchResultItem o) {
|
||||||
return o.getUrlIdInt() == getUrlIdInt();
|
return o.getUrlIdInt() == getUrlIdInt();
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
@ -0,0 +1,145 @@
|
|||||||
|
package nu.marginalia.index.client.model.results;
|
||||||
|
|
||||||
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
|
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
||||||
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
|
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
|
public final class SearchResultKeywordScore {
|
||||||
|
public final int subquery;
|
||||||
|
public final String keyword;
|
||||||
|
private final long encodedWordMetadata;
|
||||||
|
private final long encodedDocMetadata;
|
||||||
|
private final boolean hasPriorityTerms;
|
||||||
|
|
||||||
|
public SearchResultKeywordScore(int subquery,
|
||||||
|
String keyword,
|
||||||
|
long encodedWordMetadata,
|
||||||
|
long encodedDocMetadata,
|
||||||
|
boolean hasPriorityTerms) {
|
||||||
|
this.subquery = subquery;
|
||||||
|
this.keyword = keyword;
|
||||||
|
this.encodedWordMetadata = encodedWordMetadata;
|
||||||
|
this.encodedDocMetadata = encodedDocMetadata;
|
||||||
|
this.hasPriorityTerms = hasPriorityTerms;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean hasTermFlag(EdgePageWordFlags flag) {
|
||||||
|
return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit());
|
||||||
|
}
|
||||||
|
|
||||||
|
public double documentValue() {
|
||||||
|
long sum = 0;
|
||||||
|
|
||||||
|
sum += DocumentMetadata.decodeQuality(encodedDocMetadata) / 5.;
|
||||||
|
|
||||||
|
sum += DocumentMetadata.decodeTopology(encodedDocMetadata);
|
||||||
|
|
||||||
|
if (DocumentMetadata.hasFlags(encodedDocMetadata, EdgePageDocumentFlags.Simple.asBit())) {
|
||||||
|
sum += 20;
|
||||||
|
}
|
||||||
|
|
||||||
|
int rank = DocumentMetadata.decodeRank(encodedDocMetadata) - 13;
|
||||||
|
if (rank < 0)
|
||||||
|
sum += rank / 2;
|
||||||
|
else
|
||||||
|
sum += rank / 4;
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double termValue() {
|
||||||
|
double sum = 0;
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.Title)) {
|
||||||
|
sum -= 15;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.Site)) {
|
||||||
|
sum -= 10;
|
||||||
|
} else if (hasTermFlag(EdgePageWordFlags.SiteAdjacent)) {
|
||||||
|
sum -= 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.Subjects)) {
|
||||||
|
sum -= 10;
|
||||||
|
}
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.NamesWords)) {
|
||||||
|
sum -= 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.UrlDomain)) {
|
||||||
|
sum -= 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasTermFlag(EdgePageWordFlags.UrlPath)) {
|
||||||
|
sum -= 5;
|
||||||
|
}
|
||||||
|
|
||||||
|
double tfIdf = WordMetadata.decodeTfidf(encodedWordMetadata);
|
||||||
|
int positionBits = WordMetadata.decodePositions(encodedWordMetadata);
|
||||||
|
|
||||||
|
sum -= tfIdf / 10.;
|
||||||
|
sum -= Integer.bitCount(positionBits) / 3.;
|
||||||
|
|
||||||
|
return sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int subquery() {
|
||||||
|
return subquery;
|
||||||
|
}
|
||||||
|
public int positions() {
|
||||||
|
return WordMetadata.decodePositions(encodedWordMetadata);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isKeywordSpecial() {
|
||||||
|
return keyword.contains(":") || hasTermFlag(EdgePageWordFlags.Synthetic);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isKeywordRegular() {
|
||||||
|
return !keyword.contains(":")
|
||||||
|
&& !hasTermFlag(EdgePageWordFlags.Synthetic);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long encodedWordMetadata() {
|
||||||
|
return encodedWordMetadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long encodedDocMetadata() {
|
||||||
|
return encodedDocMetadata;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasPriorityTerms() {
|
||||||
|
return hasPriorityTerms;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (obj == this) return true;
|
||||||
|
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||||
|
var that = (SearchResultKeywordScore) obj;
|
||||||
|
return this.subquery == that.subquery &&
|
||||||
|
Objects.equals(this.keyword, that.keyword) &&
|
||||||
|
this.encodedWordMetadata == that.encodedWordMetadata &&
|
||||||
|
this.encodedDocMetadata == that.encodedDocMetadata &&
|
||||||
|
this.hasPriorityTerms == that.hasPriorityTerms;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(subquery, keyword, encodedWordMetadata, encodedDocMetadata, hasPriorityTerms);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "SearchResultKeywordScore[" +
|
||||||
|
"set=" + subquery + ", " +
|
||||||
|
"keyword=" + keyword + ", " +
|
||||||
|
"encodedWordMetadata=" + encodedWordMetadata + ", " +
|
||||||
|
"encodedDocMetadata=" + encodedDocMetadata + ", " +
|
||||||
|
"hasPriorityTerms=" + hasPriorityTerms + ']';
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -7,8 +7,8 @@ import lombok.ToString;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@AllArgsConstructor @Getter @ToString
|
@AllArgsConstructor @Getter @ToString
|
||||||
public class EdgeSearchResultSet {
|
public class SearchResultSet {
|
||||||
public List<EdgeSearchResultItem> results;
|
public List<SearchResultItem> results;
|
||||||
|
|
||||||
public int size() {
|
public int size() {
|
||||||
return results.size();
|
return results.size();
|
@ -2,7 +2,7 @@ package nu.marginalia.index.results;
|
|||||||
|
|
||||||
import gnu.trove.map.TLongIntMap;
|
import gnu.trove.map.TLongIntMap;
|
||||||
import gnu.trove.map.hash.TLongIntHashMap;
|
import gnu.trove.map.hash.TLongIntHashMap;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
|
|
||||||
public class IndexResultDomainDeduplicator {
|
public class IndexResultDomainDeduplicator {
|
||||||
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
|
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
|
||||||
@ -21,7 +21,7 @@ public class IndexResultDomainDeduplicator {
|
|||||||
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
|
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean test(EdgeSearchResultItem item) {
|
public boolean test(SearchResultItem item) {
|
||||||
final long key = item.deduplicationKey();
|
final long key = item.deduplicationKey();
|
||||||
if (key == 0)
|
if (key == 0)
|
||||||
return true;
|
return true;
|
||||||
@ -29,7 +29,7 @@ public class IndexResultDomainDeduplicator {
|
|||||||
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
|
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getCount(EdgeSearchResultItem item) {
|
public int getCount(SearchResultItem item) {
|
||||||
final long key = item.deduplicationKey();
|
final long key = item.deduplicationKey();
|
||||||
if (key == 0)
|
if (key == 0)
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -9,9 +9,9 @@ import nu.marginalia.index.svc.SearchTermsService;
|
|||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.index.query.IndexQueryParams;
|
import nu.marginalia.index.query.IndexQueryParams;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -31,7 +31,7 @@ public class IndexResultValuator {
|
|||||||
public IndexResultValuator(SearchTermsService searchTermsSvc,
|
public IndexResultValuator(SearchTermsService searchTermsSvc,
|
||||||
IndexMetadataService metadataService,
|
IndexMetadataService metadataService,
|
||||||
TLongList results,
|
TLongList results,
|
||||||
List<EdgeSearchSubquery> subqueries,
|
List<SearchSubquery> subqueries,
|
||||||
IndexQueryParams queryParams) {
|
IndexQueryParams queryParams) {
|
||||||
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
||||||
this.queryParams = queryParams;
|
this.queryParams = queryParams;
|
||||||
@ -71,9 +71,9 @@ public class IndexResultValuator {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeSearchResultItem evaluateResult(long id) {
|
public SearchResultItem evaluateResult(long id) {
|
||||||
|
|
||||||
EdgeSearchResultItem searchResult = new EdgeSearchResultItem(id);
|
SearchResultItem searchResult = new SearchResultItem(id);
|
||||||
final long urlIdInt = searchResult.getUrlIdInt();
|
final long urlIdInt = searchResult.getUrlIdInt();
|
||||||
|
|
||||||
searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
|
searchResult.setDomainId(metadataService.getDomainId(urlIdInt));
|
||||||
@ -99,7 +99,7 @@ public class IndexResultValuator {
|
|||||||
return searchResult;
|
return searchResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double evaluateSubquery(EdgeSearchResultItem searchResult,
|
private double evaluateSubquery(SearchResultItem searchResult,
|
||||||
long docMetadata,
|
long docMetadata,
|
||||||
int querySetId,
|
int querySetId,
|
||||||
List<String> termList)
|
List<String> termList)
|
||||||
@ -114,7 +114,7 @@ public class IndexResultValuator {
|
|||||||
|
|
||||||
long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt());
|
long metadata = termMetadata.getTermMetadata(termId, searchResult.getUrlIdInt());
|
||||||
|
|
||||||
EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(
|
SearchResultKeywordScore score = new SearchResultKeywordScore(
|
||||||
querySetId,
|
querySetId,
|
||||||
searchTerm,
|
searchTerm,
|
||||||
metadata,
|
metadata,
|
||||||
|
@ -9,10 +9,10 @@ import gnu.trove.set.hash.TLongHashSet;
|
|||||||
import io.prometheus.client.Counter;
|
import io.prometheus.client.Counter;
|
||||||
import io.prometheus.client.Gauge;
|
import io.prometheus.client.Gauge;
|
||||||
import io.prometheus.client.Histogram;
|
import io.prometheus.client.Histogram;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultSet;
|
import nu.marginalia.index.client.model.results.SearchResultSet;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||||
import nu.marginalia.index.index.SearchIndex;
|
import nu.marginalia.index.index.SearchIndex;
|
||||||
import nu.marginalia.index.index.SearchIndexSearchTerms;
|
import nu.marginalia.index.index.SearchIndexSearchTerms;
|
||||||
@ -73,13 +73,13 @@ public class IndexQueryService {
|
|||||||
|
|
||||||
public Object search(Request request, Response response) {
|
public Object search(Request request, Response response) {
|
||||||
String json = request.body();
|
String json = request.body();
|
||||||
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
SearchSpecification specsSet = gson.fromJson(json, SearchSpecification.class);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return wmsa_edge_index_query_time.time(() -> {
|
return wmsa_edge_index_query_time.time(() -> {
|
||||||
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
|
var params = new SearchParameters(specsSet, getSearchSet(specsSet));
|
||||||
|
|
||||||
List<EdgeSearchResultItem> results = executeSearch(params);
|
List<SearchResultItem> results = executeSearch(params);
|
||||||
logger.info(queryMarker, "Index Result Count: {}", results.size());
|
logger.info(queryMarker, "Index Result Count: {}", results.size());
|
||||||
|
|
||||||
wmsa_edge_index_query_cost.set(params.getDataCost());
|
wmsa_edge_index_query_cost.set(params.getDataCost());
|
||||||
@ -87,7 +87,7 @@ public class IndexQueryService {
|
|||||||
wmsa_edge_index_query_timeouts.inc();
|
wmsa_edge_index_query_timeouts.inc();
|
||||||
}
|
}
|
||||||
|
|
||||||
return new EdgeSearchResultSet(results);
|
return new SearchResultSet(results);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
catch (HaltException ex) {
|
catch (HaltException ex) {
|
||||||
@ -103,11 +103,11 @@ public class IndexQueryService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// exists for test access
|
// exists for test access
|
||||||
EdgeSearchResultSet justQuery(EdgeSearchSpecification specsSet) {
|
SearchResultSet justQuery(SearchSpecification specsSet) {
|
||||||
return new EdgeSearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
|
return new SearchResultSet(executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))));
|
||||||
}
|
}
|
||||||
|
|
||||||
private SearchSet getSearchSet(EdgeSearchSpecification specsSet) {
|
private SearchSet getSearchSet(SearchSpecification specsSet) {
|
||||||
if (specsSet.domains != null && !specsSet.domains.isEmpty()) {
|
if (specsSet.domains != null && !specsSet.domains.isEmpty()) {
|
||||||
return new SmallSearchSet(specsSet.domains);
|
return new SmallSearchSet(specsSet.domains);
|
||||||
}
|
}
|
||||||
@ -115,7 +115,7 @@ public class IndexQueryService {
|
|||||||
return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier);
|
return searchSetsService.getSearchSetByName(specsSet.searchSetIdentifier);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<EdgeSearchResultItem> executeSearch(SearchParameters params) {
|
private List<SearchResultItem> executeSearch(SearchParameters params) {
|
||||||
var resultIds = evaluateSubqueries(params);
|
var resultIds = evaluateSubqueries(params);
|
||||||
|
|
||||||
var resultItems = calculateResultScores(params, resultIds);
|
var resultItems = calculateResultScores(params, resultIds);
|
||||||
@ -176,7 +176,7 @@ public class IndexQueryService {
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
private ArrayList<EdgeSearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
|
private ArrayList<SearchResultItem> calculateResultScores(SearchParameters params, TLongList results) {
|
||||||
|
|
||||||
final var evaluator = new IndexResultValuator(
|
final var evaluator = new IndexResultValuator(
|
||||||
searchTermsSvc,
|
searchTermsSvc,
|
||||||
@ -185,7 +185,7 @@ public class IndexQueryService {
|
|||||||
params.subqueries,
|
params.subqueries,
|
||||||
params.queryParams);
|
params.queryParams);
|
||||||
|
|
||||||
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
|
ArrayList<SearchResultItem> items = new ArrayList<>(results.size());
|
||||||
|
|
||||||
// Sorting the result ids results in better paging characteristics
|
// Sorting the result ids results in better paging characteristics
|
||||||
results.sort();
|
results.sort();
|
||||||
@ -206,15 +206,15 @@ public class IndexQueryService {
|
|||||||
return items;
|
return items;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<EdgeSearchResultItem> selectBestResults(SearchParameters params, List<EdgeSearchResultItem> results) {
|
private List<SearchResultItem> selectBestResults(SearchParameters params, List<SearchResultItem> results) {
|
||||||
|
|
||||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||||
|
|
||||||
results.sort(comparingDouble(EdgeSearchResultItem::getScore)
|
results.sort(comparingDouble(SearchResultItem::getScore)
|
||||||
.thenComparingInt(EdgeSearchResultItem::getRanking)
|
.thenComparingInt(SearchResultItem::getRanking)
|
||||||
.thenComparingInt(EdgeSearchResultItem::getUrlIdInt));
|
.thenComparingInt(SearchResultItem::getUrlIdInt));
|
||||||
|
|
||||||
List<EdgeSearchResultItem> resultsList = new ArrayList<>(results.size());
|
List<SearchResultItem> resultsList = new ArrayList<>(results.size());
|
||||||
|
|
||||||
for (var item : results) {
|
for (var item : results) {
|
||||||
if (domainCountFilter.test(item)) {
|
if (domainCountFilter.test(item)) {
|
||||||
@ -245,7 +245,7 @@ class SearchParameters {
|
|||||||
before evaluating them for the best result. */
|
before evaluating them for the best result. */
|
||||||
final int fetchSize;
|
final int fetchSize;
|
||||||
final IndexSearchBudget budget;
|
final IndexSearchBudget budget;
|
||||||
final List<EdgeSearchSubquery> subqueries;
|
final List<SearchSubquery> subqueries;
|
||||||
final IndexQueryParams queryParams;
|
final IndexQueryParams queryParams;
|
||||||
|
|
||||||
final int limitByDomain;
|
final int limitByDomain;
|
||||||
@ -261,7 +261,7 @@ class SearchParameters {
|
|||||||
*/
|
*/
|
||||||
final TLongHashSet consideredUrlIds;
|
final TLongHashSet consideredUrlIds;
|
||||||
|
|
||||||
public SearchParameters(EdgeSearchSpecification specsSet, SearchSet searchSet) {
|
public SearchParameters(SearchSpecification specsSet, SearchSet searchSet) {
|
||||||
var limits = specsSet.queryLimits;
|
var limits = specsSet.queryLimits;
|
||||||
|
|
||||||
this.fetchSize = limits.fetchSize();
|
this.fetchSize = limits.fetchSize();
|
||||||
|
@ -5,7 +5,7 @@ import com.google.inject.Singleton;
|
|||||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||||
import it.unimi.dsi.fastutil.ints.IntList;
|
import it.unimi.dsi.fastutil.ints.IntList;
|
||||||
import nu.marginalia.dict.OffHeapDictionaryHashMap;
|
import nu.marginalia.dict.OffHeapDictionaryHashMap;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.index.index.SearchIndexSearchTerms;
|
import nu.marginalia.index.index.SearchIndexSearchTerms;
|
||||||
import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
|
import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -23,7 +23,7 @@ public class SearchTermsService {
|
|||||||
this.lexicon = lexicon;
|
this.lexicon = lexicon;
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
|
public SearchIndexSearchTerms getSearchTerms(SearchSubquery request) {
|
||||||
final IntList excludes = new IntArrayList();
|
final IntList excludes = new IntArrayList();
|
||||||
final IntList includes = new IntArrayList();
|
final IntList includes = new IntArrayList();
|
||||||
final IntList priority = new IntArrayList();
|
final IntList priority = new IntArrayList();
|
||||||
|
@ -2,10 +2,10 @@ package nu.marginalia.index.svc;
|
|||||||
|
|
||||||
import com.google.inject.Guice;
|
import com.google.inject.Guice;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.index.index.SearchIndex;
|
import nu.marginalia.index.index.SearchIndex;
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||||
@ -79,7 +79,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
searchIndex.switchIndex();
|
searchIndex.switchIndex();
|
||||||
|
|
||||||
var rsp = queryService.justQuery(
|
var rsp = queryService.justQuery(
|
||||||
EdgeSearchSpecification.builder()
|
SearchSpecification.builder()
|
||||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||||
.queryStrategy(QueryStrategy.SENTENCE)
|
.queryStrategy(QueryStrategy.SENTENCE)
|
||||||
.year(SpecificationLimit.none())
|
.year(SpecificationLimit.none())
|
||||||
@ -88,7 +88,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
.rank(SpecificationLimit.none())
|
.rank(SpecificationLimit.none())
|
||||||
.domains(new ArrayList<>())
|
.domains(new ArrayList<>())
|
||||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||||
.subqueries(List.of(new EdgeSearchSubquery(
|
.subqueries(List.of(new SearchSubquery(
|
||||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
|
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
|
||||||
))).build());
|
))).build());
|
||||||
|
|
||||||
@ -96,7 +96,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
|
new int[] { 30, 90, 150, 210, 270, 330, 390, 450, 510 },
|
||||||
rsp.results
|
rsp.results
|
||||||
.stream()
|
.stream()
|
||||||
.mapToInt(EdgeSearchResultItem::getUrlIdInt)
|
.mapToInt(SearchResultItem::getUrlIdInt)
|
||||||
.toArray());
|
.toArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -111,7 +111,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
searchIndex.switchIndex();
|
searchIndex.switchIndex();
|
||||||
|
|
||||||
var rsp = queryService.justQuery(
|
var rsp = queryService.justQuery(
|
||||||
EdgeSearchSpecification.builder()
|
SearchSpecification.builder()
|
||||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||||
.year(SpecificationLimit.none())
|
.year(SpecificationLimit.none())
|
||||||
.quality(SpecificationLimit.none())
|
.quality(SpecificationLimit.none())
|
||||||
@ -119,12 +119,12 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
.rank(SpecificationLimit.none())
|
.rank(SpecificationLimit.none())
|
||||||
.queryStrategy(QueryStrategy.SENTENCE)
|
.queryStrategy(QueryStrategy.SENTENCE)
|
||||||
.domains(List.of(2))
|
.domains(List.of(2))
|
||||||
.subqueries(List.of(new EdgeSearchSubquery(
|
.subqueries(List.of(new SearchSubquery(
|
||||||
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
|
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList()
|
||||||
))).build());
|
))).build());
|
||||||
Assertions.assertArrayEquals(
|
Assertions.assertArrayEquals(
|
||||||
new int[] { 210, 270 },
|
new int[] { 210, 270 },
|
||||||
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
|
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -136,7 +136,7 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
searchIndex.switchIndex();
|
searchIndex.switchIndex();
|
||||||
|
|
||||||
var rsp = queryService.justQuery(
|
var rsp = queryService.justQuery(
|
||||||
EdgeSearchSpecification.builder()
|
SearchSpecification.builder()
|
||||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||||
.quality(SpecificationLimit.none())
|
.quality(SpecificationLimit.none())
|
||||||
.year(SpecificationLimit.equals(1998))
|
.year(SpecificationLimit.equals(1998))
|
||||||
@ -144,14 +144,14 @@ public class IndexQueryServiceIntegrationTest {
|
|||||||
.rank(SpecificationLimit.none())
|
.rank(SpecificationLimit.none())
|
||||||
.queryStrategy(QueryStrategy.SENTENCE)
|
.queryStrategy(QueryStrategy.SENTENCE)
|
||||||
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
.searchSetIdentifier(SearchSetIdentifier.NONE)
|
||||||
.subqueries(List.of(new EdgeSearchSubquery(
|
.subqueries(List.of(new SearchSubquery(
|
||||||
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
|
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()
|
||||||
))
|
))
|
||||||
).build());
|
).build());
|
||||||
|
|
||||||
Assertions.assertArrayEquals(
|
Assertions.assertArrayEquals(
|
||||||
new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 },
|
new int[] { 12, 72, 132, 192, 252, 312, 372, 432, 492, 32 },
|
||||||
rsp.results.stream().mapToInt(EdgeSearchResultItem::getUrlIdInt).toArray());
|
rsp.results.stream().mapToInt(SearchResultItem::getUrlIdInt).toArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.search.command;
|
package nu.marginalia.search.command;
|
||||||
|
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
@ -25,7 +25,7 @@ public enum SearchJsParameter {
|
|||||||
return DEFAULT;
|
return DEFAULT;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addTacitTerms(EdgeSearchSubquery subquery) {
|
public void addTacitTerms(SearchSubquery subquery) {
|
||||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.search.query;
|
package nu.marginalia.search.db;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
@ -9,12 +9,12 @@ import java.util.ArrayList;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
public class NearQueryProcessor {
|
public class DbNearDomainsQuery {
|
||||||
|
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public NearQueryProcessor(HikariDataSource dataSource) {
|
public DbNearDomainsQuery(HikariDataSource dataSource) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
}
|
}
|
||||||
|
|
@ -2,7 +2,7 @@ package nu.marginalia.search.model;
|
|||||||
|
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
|
||||||
|
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
@ -45,7 +45,7 @@ public enum SearchProfile {
|
|||||||
return YOLO;
|
return YOLO;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addTacitTerms(EdgeSearchSubquery subquery) {
|
public void addTacitTerms(SearchSubquery subquery) {
|
||||||
if (this == ACADEMIA) {
|
if (this == ACADEMIA) {
|
||||||
subquery.searchTermsPriority.add("tld:edu");
|
subquery.searchTermsPriority.add("tld:edu");
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.search.model;
|
package nu.marginalia.search.model;
|
||||||
|
|
||||||
import lombok.*;
|
import lombok.*;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
|
||||||
import nu.marginalia.model.crawl.HtmlFeature;
|
import nu.marginalia.model.crawl.HtmlFeature;
|
||||||
@ -36,7 +36,7 @@ public class UrlDetails {
|
|||||||
public int resultsFromSameDomain;
|
public int resultsFromSameDomain;
|
||||||
|
|
||||||
public String positions;
|
public String positions;
|
||||||
public EdgeSearchResultItem resultItem;
|
public SearchResultItem resultItem;
|
||||||
|
|
||||||
public boolean hasMoreResults() {
|
public boolean hasMoreResults() {
|
||||||
return resultsFromSameDomain > 1;
|
return resultsFromSameDomain > 1;
|
||||||
|
@ -3,8 +3,8 @@ package nu.marginalia.search.query;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.LanguageModels;
|
import nu.marginalia.LanguageModels;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.index.query.limit.QueryLimits;
|
import nu.marginalia.index.query.limit.QueryLimits;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
@ -16,6 +16,7 @@ import nu.marginalia.query_parser.QueryPermutation;
|
|||||||
import nu.marginalia.query_parser.QueryVariants;
|
import nu.marginalia.query_parser.QueryVariants;
|
||||||
import nu.marginalia.query_parser.token.Token;
|
import nu.marginalia.query_parser.token.Token;
|
||||||
import nu.marginalia.query_parser.token.TokenType;
|
import nu.marginalia.query_parser.token.TokenType;
|
||||||
|
import nu.marginalia.search.db.DbNearDomainsQuery;
|
||||||
import nu.marginalia.search.model.SearchProfile;
|
import nu.marginalia.search.model.SearchProfile;
|
||||||
import nu.marginalia.search.query.model.SearchQuery;
|
import nu.marginalia.search.query.model.SearchQuery;
|
||||||
import nu.marginalia.search.query.model.UserSearchParameters;
|
import nu.marginalia.search.query.model.UserSearchParameters;
|
||||||
@ -34,7 +35,7 @@ public class QueryFactory {
|
|||||||
private final EnglishDictionary englishDictionary;
|
private final EnglishDictionary englishDictionary;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
private final SearchResultValuator searchResultValuator;
|
private final SearchResultValuator searchResultValuator;
|
||||||
private final NearQueryProcessor nearQueryProcessor;
|
private final DbNearDomainsQuery dbNearDomainsQuery;
|
||||||
|
|
||||||
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
|
private static final int RETAIN_QUERY_VARIANT_COUNT = 5;
|
||||||
private final ThreadLocal<QueryVariants> queryVariants;
|
private final ThreadLocal<QueryVariants> queryVariants;
|
||||||
@ -48,11 +49,11 @@ public class QueryFactory {
|
|||||||
EnglishDictionary englishDictionary,
|
EnglishDictionary englishDictionary,
|
||||||
NGramBloomFilter nGramBloomFilter,
|
NGramBloomFilter nGramBloomFilter,
|
||||||
SearchResultValuator searchResultValuator,
|
SearchResultValuator searchResultValuator,
|
||||||
NearQueryProcessor nearQueryProcessor) {
|
DbNearDomainsQuery dbNearDomainsQuery) {
|
||||||
|
|
||||||
this.englishDictionary = englishDictionary;
|
this.englishDictionary = englishDictionary;
|
||||||
this.searchResultValuator = searchResultValuator;
|
this.searchResultValuator = searchResultValuator;
|
||||||
this.nearQueryProcessor = nearQueryProcessor;
|
this.dbNearDomainsQuery = dbNearDomainsQuery;
|
||||||
|
|
||||||
this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
|
this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary));
|
||||||
}
|
}
|
||||||
@ -67,13 +68,13 @@ public class QueryFactory {
|
|||||||
|
|
||||||
public SearchQuery createQuery(UserSearchParameters params) {
|
public SearchQuery createQuery(UserSearchParameters params) {
|
||||||
final var processedQuery = createQuery(getQueryPermutation(), params);
|
final var processedQuery = createQuery(getQueryPermutation(), params);
|
||||||
final List<EdgeSearchSubquery> subqueries = processedQuery.specs.subqueries;
|
final List<SearchSubquery> subqueries = processedQuery.specs.subqueries;
|
||||||
|
|
||||||
for (var sq : subqueries) {
|
for (var sq : subqueries) {
|
||||||
sq.setValue(searchResultValuator.preEvaluate(sq));
|
sq.setValue(searchResultValuator.preEvaluate(sq));
|
||||||
}
|
}
|
||||||
|
|
||||||
subqueries.sort(Comparator.comparing(EdgeSearchSubquery::getValue));
|
subqueries.sort(Comparator.comparing(SearchSubquery::getValue));
|
||||||
trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT);
|
trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT);
|
||||||
|
|
||||||
return processedQuery;
|
return processedQuery;
|
||||||
@ -84,16 +85,16 @@ public class QueryFactory {
|
|||||||
int limitTotal,
|
int limitTotal,
|
||||||
String... termsInclude)
|
String... termsInclude)
|
||||||
{
|
{
|
||||||
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
List<SearchSubquery> sqs = new ArrayList<>();
|
||||||
|
|
||||||
sqs.add(new EdgeSearchSubquery(
|
sqs.add(new SearchSubquery(
|
||||||
Arrays.asList(termsInclude),
|
Arrays.asList(termsInclude),
|
||||||
Collections.emptyList(),
|
Collections.emptyList(),
|
||||||
Collections.emptyList(),
|
Collections.emptyList(),
|
||||||
Collections.emptyList()
|
Collections.emptyList()
|
||||||
));
|
));
|
||||||
|
|
||||||
var specs = EdgeSearchSpecification.builder()
|
var specs = SearchSpecification.builder()
|
||||||
.subqueries(sqs)
|
.subqueries(sqs)
|
||||||
.domains(Collections.emptyList())
|
.domains(Collections.emptyList())
|
||||||
.searchSetIdentifier(profile.searchSetIdentifier)
|
.searchSetIdentifier(profile.searchSetIdentifier)
|
||||||
@ -170,7 +171,7 @@ public class QueryFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
|
var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery);
|
||||||
List<EdgeSearchSubquery> subqueries = new ArrayList<>();
|
List<SearchSubquery> subqueries = new ArrayList<>();
|
||||||
|
|
||||||
String near = profile.getNearDomain();
|
String near = profile.getNearDomain();
|
||||||
|
|
||||||
@ -219,7 +220,7 @@ public class QueryFactory {
|
|||||||
searchTermsAdvice.clear();
|
searchTermsAdvice.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
EdgeSearchSubquery subquery = new EdgeSearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
|
SearchSubquery subquery = new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority);
|
||||||
|
|
||||||
params.profile().addTacitTerms(subquery);
|
params.profile().addTacitTerms(subquery);
|
||||||
params.jsSetting().addTacitTerms(subquery);
|
params.jsSetting().addTacitTerms(subquery);
|
||||||
@ -231,7 +232,7 @@ public class QueryFactory {
|
|||||||
|
|
||||||
if (near != null) {
|
if (near != null) {
|
||||||
if (domain == null) {
|
if (domain == null) {
|
||||||
domains = nearQueryProcessor.getRelatedDomains(near, problems::add);
|
domains = dbNearDomainsQuery.getRelatedDomains(near, problems::add);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -242,7 +243,7 @@ public class QueryFactory {
|
|||||||
domainLimit = 2;
|
domainLimit = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
|
var specsBuilder = SearchSpecification.builder()
|
||||||
.subqueries(subqueries)
|
.subqueries(subqueries)
|
||||||
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
|
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
|
||||||
.humanQuery(query)
|
.humanQuery(query)
|
||||||
@ -254,7 +255,7 @@ public class QueryFactory {
|
|||||||
.queryStrategy(queryStrategy)
|
.queryStrategy(queryStrategy)
|
||||||
.searchSetIdentifier(profile.searchSetIdentifier);
|
.searchSetIdentifier(profile.searchSetIdentifier);
|
||||||
|
|
||||||
EdgeSearchSpecification specs = specsBuilder.build();
|
SearchSpecification specs = specsBuilder.build();
|
||||||
|
|
||||||
return new SearchQuery(specs, searchTermsHuman, domain);
|
return new SearchQuery(specs, searchTermsHuman, domain);
|
||||||
}
|
}
|
||||||
|
@ -1,19 +1,19 @@
|
|||||||
package nu.marginalia.search.query.model;
|
package nu.marginalia.search.query.model;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
@AllArgsConstructor
|
@AllArgsConstructor
|
||||||
public class SearchQuery {
|
public class SearchQuery {
|
||||||
public final EdgeSearchSpecification specs;
|
public final SearchSpecification specs;
|
||||||
|
|
||||||
public final Set<String> problems = new TreeSet<>();
|
public final Set<String> problems = new TreeSet<>();
|
||||||
public final List<String> searchTermsHuman;
|
public final List<String> searchTermsHuman;
|
||||||
public String domain;
|
public String domain;
|
||||||
|
|
||||||
public SearchQuery(EdgeSearchSpecification justSpecs) {
|
public SearchQuery(SearchSpecification justSpecs) {
|
||||||
searchTermsHuman = new ArrayList<>();
|
searchTermsHuman = new ArrayList<>();
|
||||||
specs = justSpecs;
|
specs = justSpecs;
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,7 @@ import nu.marginalia.search.db.DbUrlDetailsQuery;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.model.crawl.EdgeDomainIndexingState;
|
||||||
import nu.marginalia.model.id.EdgeIdList;
|
import nu.marginalia.model.id.EdgeIdList;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.search.model.UrlDetails;
|
import nu.marginalia.search.model.UrlDetails;
|
||||||
import nu.marginalia.search.valuation.SearchResultValuator;
|
import nu.marginalia.search.valuation.SearchResultValuator;
|
||||||
import nu.marginalia.util.BrailleBlockPunchCards;
|
import nu.marginalia.util.BrailleBlockPunchCards;
|
||||||
@ -29,11 +29,11 @@ public class SearchResultDecorator {
|
|||||||
this.valuator = valuator;
|
this.valuator = valuator;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<UrlDetails> getAllUrlDetails(List<EdgeSearchResultItem> resultItems) {
|
public List<UrlDetails> getAllUrlDetails(List<SearchResultItem> resultItems) {
|
||||||
TIntObjectHashMap<UrlDetails> detailsById = new TIntObjectHashMap<>(resultItems.size());
|
TIntObjectHashMap<UrlDetails> detailsById = new TIntObjectHashMap<>(resultItems.size());
|
||||||
|
|
||||||
EdgeIdList<EdgeUrl> idList = resultItems.stream()
|
EdgeIdList<EdgeUrl> idList = resultItems.stream()
|
||||||
.mapToInt(EdgeSearchResultItem::getUrlIdInt)
|
.mapToInt(SearchResultItem::getUrlIdInt)
|
||||||
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
|
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
|
||||||
|
|
||||||
List<UrlDetails> ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList);
|
List<UrlDetails> ret = dbUrlDetailsQuery.getUrlDetailsMulti(idList);
|
||||||
@ -72,14 +72,14 @@ public class SearchResultDecorator {
|
|||||||
return retList;
|
return retList;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getPositionsString(EdgeSearchResultItem resultItem) {
|
private String getPositionsString(SearchResultItem resultItem) {
|
||||||
Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8);
|
Int2IntArrayMap positionsPerSet = new Int2IntArrayMap(8);
|
||||||
|
|
||||||
for (var score : resultItem.scores) {
|
for (var score : resultItem.scores) {
|
||||||
if (!score.isRegular()) {
|
if (!score.isKeywordRegular()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
positionsPerSet.merge(score.set(), score.positions(), this::and);
|
positionsPerSet.merge(score.subquery(), score.positions(), this::and);
|
||||||
}
|
}
|
||||||
|
|
||||||
int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0);
|
int bits = positionsPerSet.values().intStream().reduce(this::or).orElse(0);
|
||||||
@ -95,7 +95,7 @@ public class SearchResultDecorator {
|
|||||||
return a | b;
|
return a | b;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double calculateTermScore(EdgeSearchResultItem resultItem, UrlDetails details) {
|
private double calculateTermScore(SearchResultItem resultItem, UrlDetails details) {
|
||||||
|
|
||||||
final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0;
|
final double statePenalty = (details.domainState == EdgeDomainIndexingState.SPECIAL) ? 1.25 : 0;
|
||||||
final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length());
|
final double value = valuator.evaluateTerms(resultItem.scores, details.words, details.title.length());
|
||||||
|
@ -3,7 +3,7 @@ package nu.marginalia.search.svc;
|
|||||||
import com.google.common.base.Strings;
|
import com.google.common.base.Strings;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.search.client.model.ApiSearchResultQueryDetails;
|
import nu.marginalia.search.client.model.ApiSearchResultQueryDetails;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.search.SearchOperator;
|
import nu.marginalia.search.SearchOperator;
|
||||||
@ -62,7 +62,7 @@ public class SearchApiQueryService {
|
|||||||
ApiSearchResult convert(UrlDetails url) {
|
ApiSearchResult convert(UrlDetails url) {
|
||||||
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||||
if (url.resultItem != null) {
|
if (url.resultItem != null) {
|
||||||
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));
|
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(SearchResultKeywordScore::subquery));
|
||||||
|
|
||||||
outer:
|
outer:
|
||||||
for (var entries : bySet.values()) {
|
for (var entries : bySet.values()) {
|
||||||
@ -73,7 +73,7 @@ public class SearchApiQueryService {
|
|||||||
continue outer;
|
continue outer;
|
||||||
|
|
||||||
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
Set<String> flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet());
|
||||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
|
lst.add(new ApiSearchResultQueryDetails(entry.keyword, metadata.tfIdf(), Integer.bitCount(metadata.positions()), flags));
|
||||||
}
|
}
|
||||||
details.add(lst);
|
details.add(lst);
|
||||||
}
|
}
|
||||||
|
@ -3,8 +3,8 @@ package nu.marginalia.search.svc;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.index.client.IndexClient;
|
import nu.marginalia.index.client.IndexClient;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultItem;
|
import nu.marginalia.index.client.model.results.SearchResultItem;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
import nu.marginalia.search.model.PageScoreAdjustment;
|
import nu.marginalia.search.model.PageScoreAdjustment;
|
||||||
import nu.marginalia.search.model.UrlDetails;
|
import nu.marginalia.search.model.UrlDetails;
|
||||||
import nu.marginalia.search.results.SearchResultDecorator;
|
import nu.marginalia.search.results.SearchResultDecorator;
|
||||||
@ -37,7 +37,7 @@ public class SearchQueryIndexService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
|
public List<UrlDetails> executeQuery(Context ctx, SearchQuery processedQuery) {
|
||||||
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
|
final List<SearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
|
||||||
|
|
||||||
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
|
List<UrlDetails> urlDetails = resultDecorator.getAllUrlDetails(results);
|
||||||
|
|
||||||
@ -70,7 +70,7 @@ public class SearchQueryIndexService {
|
|||||||
|
|
||||||
private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
|
private final Pattern titleSplitPattern = Pattern.compile("[:!|./]|(\\s-|-\\s)|\\s{2,}");
|
||||||
|
|
||||||
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, EdgeSearchSpecification specs) {
|
private PageScoreAdjustment adjustScoreBasedOnQuery(UrlDetails p, SearchSpecification specs) {
|
||||||
String titleLC = p.title == null ? "" : p.title.toLowerCase();
|
String titleLC = p.title == null ? "" : p.title.toLowerCase();
|
||||||
String descLC = p.description == null ? "" : p.description.toLowerCase();
|
String descLC = p.description == null ? "" : p.description.toLowerCase();
|
||||||
String urlLC = p.url == null ? "" : p.url.path.toLowerCase();
|
String urlLC = p.url == null ? "" : p.url.path.toLowerCase();
|
||||||
|
@ -5,8 +5,8 @@ import com.google.inject.Singleton;
|
|||||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSubquery;
|
import nu.marginalia.index.client.model.query.SearchSubquery;
|
||||||
import nu.marginalia.language.WordPatterns;
|
import nu.marginalia.language.WordPatterns;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
|
|
||||||
@ -35,7 +35,7 @@ public class SearchResultValuator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public double preEvaluate(EdgeSearchSubquery sq) {
|
public double preEvaluate(SearchSubquery sq) {
|
||||||
final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new);
|
final String[] terms = sq.searchTermsInclude.stream().filter(f -> !f.contains(":")).toArray(String[]::new);
|
||||||
|
|
||||||
double termSum = 0.;
|
double termSum = 0.;
|
||||||
@ -56,8 +56,8 @@ public class SearchResultValuator {
|
|||||||
return termSum / factorSum;
|
return termSum / factorSum;
|
||||||
}
|
}
|
||||||
|
|
||||||
public double evaluateTerms(List<EdgeSearchResultKeywordScore> rawScores, int length, int titleLength) {
|
public double evaluateTerms(List<SearchResultKeywordScore> rawScores, int length, int titleLength) {
|
||||||
int sets = 1 + rawScores.stream().mapToInt(EdgeSearchResultKeywordScore::set).max().orElse(0);
|
int sets = 1 + rawScores.stream().mapToInt(SearchResultKeywordScore::subquery).max().orElse(0);
|
||||||
|
|
||||||
double bestScore = 10;
|
double bestScore = 10;
|
||||||
double bestAllTermsFactor = 1.;
|
double bestAllTermsFactor = 1.;
|
||||||
@ -88,10 +88,10 @@ public class SearchResultValuator {
|
|||||||
return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus;
|
return bestScore * (0.1 + 0.9 * bestAllTermsFactor) * priorityTermBonus;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasPriorityTerm(List<EdgeSearchResultKeywordScore> rawScores) {
|
private boolean hasPriorityTerm(List<SearchResultKeywordScore> rawScores) {
|
||||||
return rawScores.stream()
|
return rawScores.stream()
|
||||||
.findAny()
|
.findAny()
|
||||||
.map(EdgeSearchResultKeywordScore::hasPriorityTerms)
|
.map(SearchResultKeywordScore::hasPriorityTerms)
|
||||||
.orElse(false);
|
.orElse(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -260,11 +260,11 @@ public class SearchResultValuator {
|
|||||||
return f;
|
return f;
|
||||||
}
|
}
|
||||||
|
|
||||||
private double[] getTermWeights(EdgeSearchResultKeywordScore[] scores) {
|
private double[] getTermWeights(SearchResultKeywordScore[] scores) {
|
||||||
double[] weights = new double[scores.length];
|
double[] weights = new double[scores.length];
|
||||||
|
|
||||||
for (int i = 0; i < scores.length; i++) {
|
for (int i = 0; i < scores.length; i++) {
|
||||||
String[] parts = separator.split(scores[i].keyword());
|
String[] parts = separator.split(scores[i].keyword);
|
||||||
double sumScore = 0.;
|
double sumScore = 0.;
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
@ -305,8 +305,8 @@ public class SearchResultValuator {
|
|||||||
return weights;
|
return weights;
|
||||||
}
|
}
|
||||||
|
|
||||||
private SearchResultsKeywordSet createKeywordSet(List<EdgeSearchResultKeywordScore> rawScores, int thisSet) {
|
private SearchResultsKeywordSet createKeywordSet(List<SearchResultKeywordScore> rawScores, int thisSet) {
|
||||||
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.set() == thisSet && !w.keyword().contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
|
SearchResultKeywordScore[] scores = rawScores.stream().filter(w -> w.subquery() == thisSet && !w.keyword.contains(":")).toArray(SearchResultKeywordScore[]::new);
|
||||||
if (scores.length == 0) {
|
if (scores.length == 0) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -322,8 +322,8 @@ public class SearchResultValuator {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private record SearchResultsKeyword(EdgeSearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
|
private record SearchResultsKeyword(SearchResultKeywordScore score, WordMetadata wordMetadata, double weight) {
|
||||||
public SearchResultsKeyword(EdgeSearchResultKeywordScore score, double weight) {
|
public SearchResultsKeyword(SearchResultKeywordScore score, double weight) {
|
||||||
this(score, new WordMetadata(score.encodedWordMetadata()), weight);
|
this(score, new WordMetadata(score.encodedWordMetadata()), weight);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ package nu.marginalia.search.query;
|
|||||||
import nu.marginalia.WmsaHome;
|
import nu.marginalia.WmsaHome;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
import nu.marginalia.language.statistics.EnglishDictionary;
|
import nu.marginalia.language.statistics.EnglishDictionary;
|
||||||
import nu.marginalia.index.client.model.query.EdgeSearchSpecification;
|
import nu.marginalia.index.client.model.query.SearchSpecification;
|
||||||
import nu.marginalia.language.statistics.NGramBloomFilter;
|
import nu.marginalia.language.statistics.NGramBloomFilter;
|
||||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||||
import nu.marginalia.search.command.SearchJsParameter;
|
import nu.marginalia.search.command.SearchJsParameter;
|
||||||
@ -37,7 +37,7 @@ public class QueryFactoryTest {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeSearchSpecification parseAndGetSpecs(String query) {
|
public SearchSpecification parseAndGetSpecs(String query) {
|
||||||
return queryFactory.createQuery(
|
return queryFactory.createQuery(
|
||||||
new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT)
|
new UserSearchParameters(query, SearchProfile.CORPO, SearchJsParameter.DEFAULT)
|
||||||
).specs;
|
).specs;
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
package nu.marginalia.search.valuation;
|
package nu.marginalia.search.valuation;
|
||||||
|
|
||||||
import nu.marginalia.index.client.model.results.EdgeSearchResultKeywordScore;
|
import nu.marginalia.index.client.model.results.SearchResultKeywordScore;
|
||||||
import nu.marginalia.language.statistics.TermFrequencyDict;
|
import nu.marginalia.language.statistics.TermFrequencyDict;
|
||||||
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
import nu.marginalia.model.crawl.EdgePageDocumentFlags;
|
||||||
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
import nu.marginalia.model.crawl.EdgePageWordFlags;
|
||||||
@ -31,29 +31,29 @@ class SearchResultValuatorTest {
|
|||||||
valuator = new SearchResultValuator(dict);
|
valuator = new SearchResultValuator(dict);
|
||||||
|
|
||||||
}
|
}
|
||||||
List<EdgeSearchResultKeywordScore> titleOnlyLowCountSet = List.of(
|
List<SearchResultKeywordScore> titleOnlyLowCountSet = List.of(
|
||||||
new EdgeSearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore(0, "bob",
|
||||||
wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)),
|
wordMetadata(32, Set.of(1), EnumSet.of(EdgePageWordFlags.Title)),
|
||||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||||
false)
|
false)
|
||||||
);
|
);
|
||||||
List<EdgeSearchResultKeywordScore> highCountNoTitleSet = List.of(
|
List<SearchResultKeywordScore> highCountNoTitleSet = List.of(
|
||||||
new EdgeSearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore(0, "bob",
|
||||||
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
|
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
|
||||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||||
false)
|
false)
|
||||||
);
|
);
|
||||||
|
|
||||||
List<EdgeSearchResultKeywordScore> highCountSubjectSet = List.of(
|
List<SearchResultKeywordScore> highCountSubjectSet = List.of(
|
||||||
new EdgeSearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore(0, "bob",
|
||||||
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)),
|
wordMetadata(129, Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(EdgePageWordFlags.TfIdfHigh, EdgePageWordFlags.Subjects)),
|
||||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||||
false)
|
false)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
||||||
List<EdgeSearchResultKeywordScore> first = List.of(
|
List<SearchResultKeywordScore> first = List.of(
|
||||||
new EdgeSearchResultKeywordScore(0, "bob",
|
new SearchResultKeywordScore(0, "bob",
|
||||||
wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
|
wordMetadata(202, Set.of(1,3,4,6,7,9,10,11), EnumSet.of(EdgePageWordFlags.TfIdfHigh)),
|
||||||
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
docMetadata(0, 2010, 0, 5, EnumSet.noneOf(EdgePageDocumentFlags.class)),
|
||||||
false)
|
false)
|
||||||
|
Loading…
Reference in New Issue
Block a user