mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(*) Refactor termCoherences and rename them to phrase constraints.
This commit is contained in:
parent
b2a3cac351
commit
03d5dec24c
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
@ -46,18 +46,21 @@ public class IndexProtobufCodec {
|
||||
}
|
||||
|
||||
public static SearchQuery convertRpcQuery(RpcQuery query) {
|
||||
List<SearchCoherenceConstraint> coherences = new ArrayList<>();
|
||||
List<SearchPhraseConstraint> phraeConstraints = new ArrayList<>();
|
||||
|
||||
for (int j = 0; j < query.getCoherencesCount(); j++) {
|
||||
var coh = query.getCoherences(j);
|
||||
if (coh.getType() == RpcCoherences.TYPE.OPTIONAL) {
|
||||
coherences.add(new SearchCoherenceConstraint(false, List.copyOf(coh.getCoherencesList())));
|
||||
for (int j = 0; j < query.getPhrasesCount(); j++) {
|
||||
var coh = query.getPhrases(j);
|
||||
if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else if (coh.getType() == RpcCoherences.TYPE.MANDATORY) {
|
||||
coherences.add(new SearchCoherenceConstraint(true, List.copyOf(coh.getCoherencesList())));
|
||||
else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else if (coh.getType() == RpcPhrases.TYPE.FULL) {
|
||||
phraeConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("Unknown coherence type: " + coh.getType());
|
||||
throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
|
||||
}
|
||||
}
|
||||
|
||||
@ -67,7 +70,7 @@ public class IndexProtobufCodec {
|
||||
query.getExcludeList(),
|
||||
query.getAdviceList(),
|
||||
query.getPriorityList(),
|
||||
coherences
|
||||
phraeConstraints
|
||||
);
|
||||
}
|
||||
|
||||
@ -80,11 +83,21 @@ public class IndexProtobufCodec {
|
||||
.addAllExclude(searchQuery.getSearchTermsExclude())
|
||||
.addAllPriority(searchQuery.getSearchTermsPriority());
|
||||
|
||||
for (var coherences : searchQuery.searchTermCoherences) {
|
||||
subqueryBuilder.addCoherencesBuilder()
|
||||
.addAllCoherences(coherences.terms())
|
||||
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
|
||||
.build();
|
||||
for (var constraint : searchQuery.phraseConstraints) {
|
||||
switch (constraint) {
|
||||
case SearchPhraseConstraint.Optional(List<String> terms) ->
|
||||
subqueryBuilder.addPhrasesBuilder()
|
||||
.addAllTerms(terms)
|
||||
.setType(RpcPhrases.TYPE.OPTIONAL);
|
||||
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
|
||||
subqueryBuilder.addPhrasesBuilder()
|
||||
.addAllTerms(terms)
|
||||
.setType(RpcPhrases.TYPE.MANDATORY);
|
||||
case SearchPhraseConstraint.Full(List<String> terms) ->
|
||||
subqueryBuilder.addPhrasesBuilder()
|
||||
.addAllTerms(terms)
|
||||
.setType(RpcPhrases.TYPE.FULL);
|
||||
}
|
||||
}
|
||||
|
||||
return subqueryBuilder.build();
|
||||
|
@ -1,71 +0,0 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
|
||||
|
||||
public int size() {
|
||||
return terms.size();
|
||||
}
|
||||
|
||||
/** Create a new SearchCoherenceConstraint with the given terms and the mandatory flag set.
|
||||
* Stop words are replaced with empty strings.
|
||||
*/
|
||||
public static SearchCoherenceConstraint mandatory(String... terms) {
|
||||
return new SearchCoherenceConstraint(true, trimStopWords(terms));
|
||||
}
|
||||
/** Create a new SearchCoherenceConstraint with the given terms and the mandatory flag set.
|
||||
* Stop words are replaced with empty strings.
|
||||
*/
|
||||
public static SearchCoherenceConstraint mandatory(List<String> terms) {
|
||||
return new SearchCoherenceConstraint(true, trimStopWords(terms));
|
||||
}
|
||||
/** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
|
||||
* Stop words are replaced with empty strings.
|
||||
*/
|
||||
public static SearchCoherenceConstraint optional(String... terms) {
|
||||
return new SearchCoherenceConstraint(false, trimStopWords(terms));
|
||||
}
|
||||
/** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
|
||||
* Stop words are replaced with empty strings.
|
||||
*/
|
||||
public static SearchCoherenceConstraint optional(List<String> terms) {
|
||||
return new SearchCoherenceConstraint(false, trimStopWords(terms));
|
||||
}
|
||||
|
||||
private static List<String> trimStopWords(List<String> terms) {
|
||||
List<String> ret = new ArrayList<>(terms.size());
|
||||
for (var term : terms) {
|
||||
if (WordPatterns.isStopWord(term)) {
|
||||
ret.add("");
|
||||
} else {
|
||||
ret.add(term);
|
||||
}
|
||||
}
|
||||
return List.copyOf(ret);
|
||||
}
|
||||
|
||||
private static List<String> trimStopWords(String... terms) {
|
||||
List<String> ret = new ArrayList<>(terms.length);
|
||||
for (var term : terms) {
|
||||
if (WordPatterns.isStopWord(term)) {
|
||||
ret.add("");
|
||||
} else {
|
||||
ret.add(term);
|
||||
}
|
||||
}
|
||||
|
||||
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
|
||||
ret.removeFirst();
|
||||
}
|
||||
while (!ret.isEmpty() && "".equals(ret.getLast())) {
|
||||
ret.removeLast();
|
||||
}
|
||||
|
||||
return List.copyOf(ret);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,85 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import nu.marginalia.language.WordPatterns;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public sealed interface SearchPhraseConstraint {
|
||||
|
||||
record Mandatory(List<String> terms) implements SearchPhraseConstraint {
|
||||
public Mandatory(String... terms) {
|
||||
this(List.of(terms));
|
||||
}
|
||||
}
|
||||
|
||||
record Optional(List<String> terms) implements SearchPhraseConstraint {
|
||||
public Optional(String... terms) {
|
||||
this(List.of(terms));
|
||||
}
|
||||
}
|
||||
|
||||
record Full(List<String> terms) implements SearchPhraseConstraint {
|
||||
public Full(String... terms) {
|
||||
this(List.of(terms));
|
||||
}
|
||||
}
|
||||
|
||||
List<String> terms();
|
||||
default int size() {
|
||||
return terms().size();
|
||||
}
|
||||
|
||||
static SearchPhraseConstraint mandatory(String... terms) {
|
||||
return new Mandatory(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint mandatory(List<String> terms) {
|
||||
return new Mandatory(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint optional(String... terms) {
|
||||
return new Optional(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint optional(List<String> terms) {
|
||||
return new Optional(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint full(String... terms) {
|
||||
return new Full(trimStopWords(terms));
|
||||
}
|
||||
static SearchPhraseConstraint full(List<String> terms) {
|
||||
return new Full(trimStopWords(terms));
|
||||
}
|
||||
|
||||
|
||||
private static List<String> trimStopWords(List<String> terms) {
|
||||
List<String> ret = new ArrayList<>(terms.size());
|
||||
for (var term : terms) {
|
||||
if (WordPatterns.isStopWord(term)) {
|
||||
ret.add("");
|
||||
} else {
|
||||
ret.add(term);
|
||||
}
|
||||
}
|
||||
return List.copyOf(ret);
|
||||
}
|
||||
|
||||
private static List<String> trimStopWords(String... terms) {
|
||||
List<String> ret = new ArrayList<>(terms.length);
|
||||
for (var term : terms) {
|
||||
if (WordPatterns.isStopWord(term)) {
|
||||
ret.add("");
|
||||
} else {
|
||||
ret.add(term);
|
||||
}
|
||||
}
|
||||
|
||||
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
|
||||
ret.removeFirst();
|
||||
}
|
||||
while (!ret.isEmpty() && "".equals(ret.getLast())) {
|
||||
ret.removeLast();
|
||||
}
|
||||
|
||||
return List.copyOf(ret);
|
||||
}
|
||||
|
||||
}
|
@ -31,7 +31,7 @@ public class SearchQuery {
|
||||
public final List<String> searchTermsPriority;
|
||||
|
||||
/** Terms that we require to be in the same sentence */
|
||||
public final List<SearchCoherenceConstraint> searchTermCoherences;
|
||||
public final List<SearchPhraseConstraint> phraseConstraints;
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
private double value = 0;
|
||||
@ -46,7 +46,7 @@ public class SearchQuery {
|
||||
this.searchTermsExclude = new ArrayList<>();
|
||||
this.searchTermsAdvice = new ArrayList<>();
|
||||
this.searchTermsPriority = new ArrayList<>();
|
||||
this.searchTermCoherences = new ArrayList<>();
|
||||
this.phraseConstraints = new ArrayList<>();
|
||||
}
|
||||
|
||||
public SearchQuery(String compiledQuery,
|
||||
@ -54,13 +54,13 @@ public class SearchQuery {
|
||||
List<String> searchTermsExclude,
|
||||
List<String> searchTermsAdvice,
|
||||
List<String> searchTermsPriority,
|
||||
List<SearchCoherenceConstraint> searchTermCoherences) {
|
||||
List<SearchPhraseConstraint> phraseConstraints) {
|
||||
this.compiledQuery = compiledQuery;
|
||||
this.searchTermsInclude = searchTermsInclude;
|
||||
this.searchTermsExclude = searchTermsExclude;
|
||||
this.searchTermsAdvice = searchTermsAdvice;
|
||||
this.searchTermsPriority = searchTermsPriority;
|
||||
this.searchTermCoherences = searchTermCoherences;
|
||||
this.phraseConstraints = phraseConstraints;
|
||||
}
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
@ -80,7 +80,7 @@ public class SearchQuery {
|
||||
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
@ -91,7 +91,7 @@ public class SearchQuery {
|
||||
public final List<String> searchTermsExclude = new ArrayList<>();
|
||||
public final List<String> searchTermsAdvice = new ArrayList<>();
|
||||
public final List<String> searchTermsPriority = new ArrayList<>();
|
||||
public final List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
|
||||
public final List<SearchPhraseConstraint> searchPhraseConstraints = new ArrayList<>();
|
||||
|
||||
private SearchQueryBuilder() {
|
||||
}
|
||||
@ -121,13 +121,13 @@ public class SearchQuery {
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQueryBuilder coherenceConstraint(SearchCoherenceConstraint constraint) {
|
||||
searchTermCoherences.add(constraint);
|
||||
public SearchQueryBuilder phraseConstraint(SearchPhraseConstraint constraint) {
|
||||
searchPhraseConstraints.add(constraint);
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchQuery build() {
|
||||
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
|
||||
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
|
||||
}
|
||||
|
||||
/** If there are no ranking terms, promote the advice terms to ranking terms */
|
||||
|
@ -176,17 +176,18 @@ message RpcQuery {
|
||||
repeated string exclude = 2; // These terms must be absent
|
||||
repeated string advice = 3; // These terms must be present, but do not affect ranking
|
||||
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
|
||||
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
|
||||
repeated RpcPhrases phrases = 5; // Groups of terms that must exist in proximity of each other
|
||||
string compiledQuery = 6; // Compiled query in infix notation
|
||||
}
|
||||
|
||||
/* Defines a group of search terms that must exist in close proximity within the document */
|
||||
message RpcCoherences {
|
||||
repeated string coherences = 1;
|
||||
/* Defines a group of search terms that must exist in the specified order within the document */
|
||||
message RpcPhrases {
|
||||
repeated string terms = 1;
|
||||
TYPE type = 2;
|
||||
|
||||
enum TYPE {
|
||||
OPTIONAL = 0;
|
||||
MANDATORY = 1;
|
||||
FULL = 2;
|
||||
};
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.index.client;
|
||||
|
||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class IndexProtobufCodecTest {
|
||||
@Test
|
||||
@ -43,8 +43,8 @@ class IndexProtobufCodecTest {
|
||||
List.of("e", "f"),
|
||||
List.of("g", "h"),
|
||||
List.of(
|
||||
new SearchCoherenceConstraint(true, List.of("i", "j")),
|
||||
new SearchCoherenceConstraint(false, List.of("k")))
|
||||
SearchPhraseConstraint.mandatory(List.of("i", "j")),
|
||||
SearchPhraseConstraint.optional(List.of("k")))
|
||||
),
|
||||
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
|
||||
);
|
||||
|
@ -73,7 +73,7 @@ public class QueryFactory {
|
||||
|
||||
if (parts.length > 1) {
|
||||
// Require that the terms appear in sequence
|
||||
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.mandatory(parts));
|
||||
queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));
|
||||
|
||||
// Construct a regular query from the parts in the quoted string
|
||||
queryBuilder.include(parts);
|
||||
@ -126,12 +126,15 @@ public class QueryFactory {
|
||||
|
||||
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
|
||||
|
||||
// Query expansion may produce suggestions for coherence constraints,
|
||||
// Query expansion may produce suggestions for phrase constraints,
|
||||
// add these to the query
|
||||
for (var coh : expansion.extraCoherences()) {
|
||||
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.optional(coh));
|
||||
for (var coh : expansion.optionalPharseConstraints()) {
|
||||
queryBuilder.phraseConstraint(SearchPhraseConstraint.optional(coh));
|
||||
}
|
||||
|
||||
// add a pseudo-constraint for the full query
|
||||
queryBuilder.phraseConstraint(SearchPhraseConstraint.full(expansion.fullPhraseConstraint()));
|
||||
|
||||
queryBuilder.compiledQuery(expansion.compiledQuery());
|
||||
|
||||
var specsBuilder = SearchSpecification.builder()
|
||||
|
@ -44,11 +44,17 @@ public class QueryExpansion {
|
||||
strategy.expand(graph);
|
||||
}
|
||||
|
||||
List<List<String>> coherences = createSegments(graph);
|
||||
List<List<String>> optionalPhraseConstraints = createSegments(graph);
|
||||
|
||||
// also create a segmentation that is just the entire query
|
||||
List<String> fullPhraseConstraint = new ArrayList<> ();
|
||||
for (var qw : graph) {
|
||||
fullPhraseConstraint.add(qw.word());
|
||||
}
|
||||
|
||||
var compiled = QWordPathsRenderer.render(graph);
|
||||
|
||||
return new Expansion(compiled, coherences);
|
||||
return new Expansion(compiled, optionalPhraseConstraints, fullPhraseConstraint);
|
||||
}
|
||||
|
||||
private static final Pattern dashPattern = Pattern.compile("-");
|
||||
@ -144,36 +150,28 @@ public class QueryExpansion {
|
||||
}
|
||||
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
|
||||
|
||||
List<List<String>> coherences = new ArrayList<>();
|
||||
Set<List<String>> constraints = new HashSet<>();
|
||||
|
||||
if (!allSegments.isEmpty()) {
|
||||
Set<NgramLexicon.SentenceSegment> bestSegmentation =
|
||||
findBestSegmentation(allSegments);
|
||||
|
||||
Set<NgramLexicon.SentenceSegment> bestSegmentation =
|
||||
findBestSegmentation(allSegments);
|
||||
for (var segment : bestSegmentation) {
|
||||
|
||||
for (var segment : bestSegmentation) {
|
||||
int start = segment.start();
|
||||
int end = segment.start() + segment.length();
|
||||
|
||||
int start = segment.start();
|
||||
int end = segment.start() + segment.length();
|
||||
|
||||
List<String> components = new ArrayList<>(end - start);
|
||||
for (int i = start; i < end; i++) {
|
||||
components.add(nodes.get(i).word());
|
||||
}
|
||||
coherences.add(components);
|
||||
|
||||
// Create an n-gram search term for the segment
|
||||
String word = String.join("_", components);
|
||||
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
||||
List<String> components = new ArrayList<>(end - start);
|
||||
for (int i = start; i < end; i++) {
|
||||
components.add(nodes.get(i).word());
|
||||
}
|
||||
constraints.add(components);
|
||||
|
||||
// Create an n-gram search term for the segment
|
||||
String word = String.join("_", components);
|
||||
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
|
||||
}
|
||||
|
||||
// also create a segmentation that is just the entire query
|
||||
coherences.add(nodes.stream()
|
||||
.map(QWord::word)
|
||||
.collect(Collectors.toList()));
|
||||
|
||||
return coherences;
|
||||
return new ArrayList<>(constraints);
|
||||
}
|
||||
|
||||
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
|
||||
@ -216,5 +214,5 @@ public class QueryExpansion {
|
||||
void expand(QWordGraph graph);
|
||||
}
|
||||
|
||||
public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {}
|
||||
public record Expansion(String compiledQuery, List<List<String>> optionalPharseConstraints, List<String> fullPhraseConstraint) {}
|
||||
}
|
||||
|
@ -1,17 +1,17 @@
|
||||
package nu.marginalia.query.svc;
|
||||
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||
import nu.marginalia.segmentation.NgramLexicon;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
@ -143,7 +143,7 @@ public class QueryFactoryTest {
|
||||
var specs = parseAndGetSpecs("\"tde shining\"");
|
||||
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
|
||||
assertEquals(List.of("tde_shining"), specs.query.searchTermsPriority);
|
||||
assertEquals(List.of(new SearchCoherenceConstraint(true, List.of("tde", "shining"))), specs.query.searchTermCoherences);
|
||||
assertEquals(List.of(new SearchPhraseConstraint.Mandatory(List.of("tde", "shining"))), specs.query.phraseConstraints);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7,9 +7,6 @@ import it.unimi.dsi.fastutil.longs.LongList;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
|
||||
|
||||
public final class SearchTerms {
|
||||
@ -17,9 +14,6 @@ public final class SearchTerms {
|
||||
private final LongList excludes;
|
||||
private final LongList priority;
|
||||
|
||||
private final List<LongList> coherencesMandatory;
|
||||
private final List<LongList> coherencesOptional;
|
||||
|
||||
public static final LongArraySet stopWords = new LongArraySet(
|
||||
new long[] {
|
||||
getWordId("a"),
|
||||
@ -36,9 +30,6 @@ public final class SearchTerms {
|
||||
this.excludes = new LongArrayList();
|
||||
this.priority = new LongArrayList();
|
||||
|
||||
this.coherencesMandatory = new ArrayList<>();
|
||||
this.coherencesOptional = new ArrayList<>();
|
||||
|
||||
this.advice = new LongArrayList();
|
||||
this.compiledQueryIds = compiledQueryIds;
|
||||
|
||||
@ -46,21 +37,6 @@ public final class SearchTerms {
|
||||
advice.add(getWordId(word));
|
||||
}
|
||||
|
||||
for (var coherence : query.searchTermCoherences) {
|
||||
LongList parts = new LongArrayList(coherence.size());
|
||||
|
||||
for (var word : coherence.terms()) {
|
||||
parts.add(getWordId(word));
|
||||
}
|
||||
|
||||
if (coherence.mandatory()) {
|
||||
coherencesMandatory.add(parts);
|
||||
}
|
||||
else {
|
||||
coherencesOptional.add(parts);
|
||||
}
|
||||
}
|
||||
|
||||
for (var word : query.searchTermsExclude) {
|
||||
excludes.add(getWordId(word));
|
||||
}
|
||||
@ -91,12 +67,6 @@ public final class SearchTerms {
|
||||
return priority;
|
||||
}
|
||||
|
||||
public List<LongList> coherencesMandatory() {
|
||||
return coherencesMandatory;
|
||||
}
|
||||
public List<LongList> coherencesOptional() {
|
||||
return coherencesOptional;
|
||||
}
|
||||
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
|
||||
|
||||
}
|
||||
|
@ -10,6 +10,7 @@ import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
|
||||
import nu.marginalia.api.searchquery.*;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
@ -18,8 +19,8 @@ import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.model.SearchTermsUtil;
|
||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||
@ -97,7 +98,7 @@ public class IndexResultRankingService {
|
||||
}
|
||||
|
||||
// Ignore documents that don't match the mandatory constraints
|
||||
if (!searchTerms.coherences.testMandatory(positions)) {
|
||||
if (!searchTerms.phraseConstraints.testMandatory(positions)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -295,14 +296,26 @@ public class IndexResultRankingService {
|
||||
|
||||
var idsAll = new TermIdList(termIdsList);
|
||||
|
||||
var constraints = new ArrayList<TermCoherenceGroupList.TermCoherenceGroup>();
|
||||
for (var coherence : searchQuery.searchTermCoherences) {
|
||||
constraints.add(new TermCoherenceGroupList.TermCoherenceGroup(coherence, idsAll));
|
||||
var constraintsMandatory = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
var constraintsFull = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
var constraintsOptional = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
|
||||
|
||||
for (var constraint : searchQuery.phraseConstraints) {
|
||||
switch (constraint) {
|
||||
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
|
||||
constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
|
||||
case SearchPhraseConstraint.Optional(List<String> terms) ->
|
||||
constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
|
||||
case SearchPhraseConstraint.Full(List<String> terms) ->
|
||||
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
|
||||
}
|
||||
}
|
||||
|
||||
assert constraintsFull.size() == 1 : "Exactly one full constraint group is required";
|
||||
|
||||
return new QuerySearchTerms(termToId,
|
||||
idsAll,
|
||||
new TermCoherenceGroupList(constraints)
|
||||
new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -13,8 +13,8 @@ import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.QueryParams;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
@ -103,7 +103,7 @@ public class IndexResultScoreCalculator {
|
||||
docSize,
|
||||
spans,
|
||||
positions,
|
||||
searchTerms.coherences,
|
||||
searchTerms.phraseConstraints,
|
||||
rankingContext);
|
||||
|
||||
return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
|
||||
@ -155,7 +155,7 @@ public class IndexResultScoreCalculator {
|
||||
int length,
|
||||
DocumentSpans spans,
|
||||
CodedSequence[] positions,
|
||||
TermCoherenceGroupList coherences,
|
||||
PhraseConstraintGroupList constraintGroups,
|
||||
ResultRankingContext ctx)
|
||||
{
|
||||
if (length < 0) {
|
||||
@ -192,7 +192,7 @@ public class IndexResultScoreCalculator {
|
||||
|
||||
VerbatimMatches verbatimMatches = new VerbatimMatches();
|
||||
|
||||
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, coherences, positions, spans);
|
||||
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, constraintGroups, positions, spans);
|
||||
|
||||
float[] weightedCounts = new float[compiledQuery.size()];
|
||||
float keywordMinDistFac = 0;
|
||||
@ -373,19 +373,19 @@ public class IndexResultScoreCalculator {
|
||||
}
|
||||
|
||||
private float findVerbatimMatches(VerbatimMatches verbatimMatches,
|
||||
TermCoherenceGroupList coherences,
|
||||
PhraseConstraintGroupList constraints,
|
||||
CodedSequence[] positions,
|
||||
DocumentSpans spans) {
|
||||
|
||||
// Calculate a bonus for phrase constraints when large ones exist
|
||||
int largestOptional = coherences.largestOptional();
|
||||
int largestOptional = constraints.largestOptional();
|
||||
if (largestOptional < 2) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
float verbatimMatchScore = 0.f;
|
||||
|
||||
for (var optionalGroup : coherences.getOptionalGroups()) {
|
||||
for (var optionalGroup : constraints.getOptionalGroups()) {
|
||||
int groupSize = optionalGroup.size;
|
||||
float sizeScalingFactor = groupSize / (float) largestOptional;
|
||||
|
||||
@ -400,8 +400,8 @@ public class IndexResultScoreCalculator {
|
||||
}
|
||||
}
|
||||
|
||||
if (coherences.numOptional() > 0) {
|
||||
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
|
||||
if (constraints.numOptional() > 0) {
|
||||
verbatimMatchScore += (float) Math.pow(constraints.countOptional(positions) / (double) constraints.numOptional(), 2);
|
||||
}
|
||||
|
||||
return verbatimMatchScore;
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.index.results.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntIterator;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||
import nu.marginalia.index.forward.spans.DocumentSpan;
|
||||
import nu.marginalia.index.model.SearchTermsUtil;
|
||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||
@ -16,28 +15,32 @@ import java.util.List;
|
||||
/**
|
||||
* wordIds that we require to be in the same sentence
|
||||
*/
|
||||
public class TermCoherenceGroupList {
|
||||
List<TermCoherenceGroup> mandatoryGroups = new ArrayList<>();
|
||||
List<TermCoherenceGroup> optionalGroups = new ArrayList<>();
|
||||
public class PhraseConstraintGroupList {
|
||||
List<PhraseConstraintGroup> mandatoryGroups = new ArrayList<>();
|
||||
List<PhraseConstraintGroup> optionalGroups = new ArrayList<>();
|
||||
PhraseConstraintGroup fullGroup;
|
||||
|
||||
public TermCoherenceGroupList(List<TermCoherenceGroup> groups) {
|
||||
for (var group : groups) {
|
||||
if (group.mandatory) {
|
||||
mandatoryGroups.add(group);
|
||||
} else {
|
||||
optionalGroups.add(group);
|
||||
}
|
||||
}
|
||||
public PhraseConstraintGroupList(
|
||||
PhraseConstraintGroup fullGroup,
|
||||
List<PhraseConstraintGroup> mandatoryGroups,
|
||||
List<PhraseConstraintGroup> optionalGroups) {
|
||||
this.mandatoryGroups.addAll(mandatoryGroups);
|
||||
this.optionalGroups.addAll(optionalGroups);
|
||||
this.fullGroup = fullGroup;
|
||||
}
|
||||
|
||||
public List<TermCoherenceGroup> getOptionalGroups() {
|
||||
public List<PhraseConstraintGroup> getOptionalGroups() {
|
||||
return Collections.unmodifiableList(optionalGroups);
|
||||
}
|
||||
|
||||
public PhraseConstraintGroup getFullGroup() {
|
||||
return fullGroup;
|
||||
}
|
||||
|
||||
public boolean testMandatory(CodedSequence[] positions) {
|
||||
|
||||
for (var coherenceSet : mandatoryGroups) {
|
||||
if (!coherenceSet.test(positions)) {
|
||||
for (var constraint : mandatoryGroups) {
|
||||
if (!constraint.test(positions)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -48,9 +51,9 @@ public class TermCoherenceGroupList {
|
||||
public int testOptional(CodedSequence[] positions) {
|
||||
|
||||
int best = 0;
|
||||
for (var coherenceSet : optionalGroups) {
|
||||
if (coherenceSet.test(positions)) {
|
||||
best = Math.max(coherenceSet.size, best);
|
||||
for (var constraint : optionalGroups) {
|
||||
if (constraint.test(positions)) {
|
||||
best = Math.max(constraint.size, best);
|
||||
}
|
||||
}
|
||||
return best;
|
||||
@ -59,8 +62,8 @@ public class TermCoherenceGroupList {
|
||||
public int countOptional(CodedSequence[] positions) {
|
||||
|
||||
int ct = 0;
|
||||
for (var coherenceSet : optionalGroups) {
|
||||
if (coherenceSet.test(positions)) {
|
||||
for (var constraint : optionalGroups) {
|
||||
if (constraint.test(positions)) {
|
||||
ct++;
|
||||
}
|
||||
}
|
||||
@ -70,17 +73,17 @@ public class TermCoherenceGroupList {
|
||||
public int testOptional(CodedSequence[] positions, DocumentSpan span) {
|
||||
|
||||
int best = 0;
|
||||
for (var coherenceSet : optionalGroups) {
|
||||
if (coherenceSet.test(span, positions)) {
|
||||
best = Math.max(coherenceSet.size, best);
|
||||
for (var constraint : optionalGroups) {
|
||||
if (constraint.test(span, positions)) {
|
||||
best = Math.max(constraint.size, best);
|
||||
}
|
||||
}
|
||||
return best;
|
||||
}
|
||||
|
||||
public boolean allOptionalInSpan(CodedSequence[] positions, DocumentSpan span) {
|
||||
for (var coherenceSet : optionalGroups) {
|
||||
if (!coherenceSet.test(span, positions)) {
|
||||
for (var constraint : optionalGroups) {
|
||||
if (!constraint.test(span, positions)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@ -91,36 +94,48 @@ public class TermCoherenceGroupList {
|
||||
return optionalGroups.size();
|
||||
}
|
||||
public int largestOptional() {
|
||||
int best = 0;
|
||||
for (var coherenceSet : optionalGroups) {
|
||||
best = Math.max(coherenceSet.size, best);
|
||||
}
|
||||
return best;
|
||||
return fullGroup.size;
|
||||
}
|
||||
|
||||
|
||||
public static final class TermCoherenceGroup {
|
||||
public static final class PhraseConstraintGroup {
|
||||
private final int[] offsets;
|
||||
private final BitSet present;
|
||||
private final BitSet termIdsMask;
|
||||
|
||||
public final int size;
|
||||
public final boolean mandatory;
|
||||
public TermCoherenceGroup(SearchCoherenceConstraint cons, TermIdList termIdsAll) {
|
||||
offsets = new int[cons.size()];
|
||||
present = new BitSet(cons.size());
|
||||
mandatory = cons.mandatory();
|
||||
size = cons.size();
|
||||
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
|
||||
offsets = new int[terms.size()];
|
||||
present = new BitSet(terms.size());
|
||||
size = terms.size();
|
||||
|
||||
termIdsMask = new BitSet(termIdsAll.size());
|
||||
|
||||
int i = 0;
|
||||
for (String term : cons.terms()) {
|
||||
if (!term.isEmpty()) {
|
||||
present.set(i);
|
||||
long termId = SearchTermsUtil.getWordId(term);
|
||||
offsets[i++] = termIdsAll.indexOf(termId);
|
||||
for (String term : terms) {
|
||||
if (term.isEmpty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
present.set(i);
|
||||
long termId = SearchTermsUtil.getWordId(term);
|
||||
|
||||
int idx = termIdsAll.indexOf(termId);
|
||||
if (idx < 0) {
|
||||
offsets[i++] = -1;
|
||||
}
|
||||
else {
|
||||
offsets[i++] = idx;
|
||||
termIdsMask.set(idx);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if the term with index termIdx in the query is in the group */
|
||||
public boolean containsTerm(int termIdx) {
|
||||
return termIdsMask.get(termIdx);
|
||||
}
|
||||
|
||||
public boolean test(CodedSequence[] positions) {
|
||||
IntIterator[] sequences = new IntIterator[present.cardinality()];
|
||||
|
@ -7,14 +7,14 @@ public class QuerySearchTerms {
|
||||
private final TObjectLongHashMap<String> termToId;
|
||||
public final TermIdList termIdsAll;
|
||||
|
||||
public final TermCoherenceGroupList coherences;
|
||||
public final PhraseConstraintGroupList phraseConstraints;
|
||||
|
||||
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
|
||||
TermIdList termIdsAll,
|
||||
TermCoherenceGroupList coherences) {
|
||||
PhraseConstraintGroupList phraseConstraints) {
|
||||
this.termToId = termToId;
|
||||
this.termIdsAll = termIdsAll;
|
||||
this.coherences = coherences;
|
||||
this.phraseConstraints = phraseConstraints;
|
||||
}
|
||||
|
||||
public long getIdForTerm(String searchTerm) {
|
||||
|
@ -4,7 +4,7 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
@ -175,7 +175,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(SearchCoherenceConstraint.mandatory(List.of("missing", "hello")))
|
||||
List.of(SearchPhraseConstraint.mandatory(List.of("missing", "hello")))
|
||||
)));
|
||||
|
||||
executeSearch(queryMissingCoherence)
|
||||
@ -443,7 +443,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(SearchCoherenceConstraint.mandatory(List.of(includes)))
|
||||
List.of(SearchPhraseConstraint.mandatory(List.of(includes)))
|
||||
);
|
||||
}
|
||||
private MockDataDocument d(int domainId, int ordinal) {
|
||||
|
@ -79,9 +79,9 @@
|
||||
<tr> <th title="terms that must be present">Search Terms Exclude</th><td>{{#each specs.query.searchTermsExclude}} {{.}} {{/each}}</td> </tr>
|
||||
<tr> <th title="mandatory terms, no effect on ranking">Search Terms Advice</th><td>{{#each specs.query.searchTermsAdvice}} {{.}} {{/each}}</td> </tr>
|
||||
<tr> <th title="not mandatory, effects ranking">Search Terms Priority</th><td>{{#each specs.query.searchTermsPriority}} {{.}} {{/each}}</td> </tr>
|
||||
{{#each specs.query.searchTermCoherences}}
|
||||
{{#each specs.query.phraseConstraints}}
|
||||
<tr>
|
||||
<th title="terms must appear close by">Coherence Requirement</th>
|
||||
<th title="terms must appear close by">Phrase Constraints</th>
|
||||
<td>
|
||||
{{#each .}}
|
||||
{{.}}
|
||||
|
Loading…
Reference in New Issue
Block a user