(*) Refactor termCoherences and rename them to phrase constraints.

This commit is contained in:
Viktor Lofgren 2024-08-15 11:02:19 +02:00
parent b2a3cac351
commit 03d5dec24c
16 changed files with 259 additions and 232 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.api.searchquery; package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -46,18 +46,21 @@ public class IndexProtobufCodec {
} }
public static SearchQuery convertRpcQuery(RpcQuery query) { public static SearchQuery convertRpcQuery(RpcQuery query) {
List<SearchCoherenceConstraint> coherences = new ArrayList<>(); List<SearchPhraseConstraint> phraseConstraints = new ArrayList<>();
for (int j = 0; j < query.getCoherencesCount(); j++) { for (int j = 0; j < query.getPhrasesCount(); j++) {
var coh = query.getCoherences(j); var coh = query.getPhrases(j);
if (coh.getType() == RpcCoherences.TYPE.OPTIONAL) { if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
coherences.add(new SearchCoherenceConstraint(false, List.copyOf(coh.getCoherencesList()))); phraseConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
} }
else if (coh.getType() == RpcCoherences.TYPE.MANDATORY) { else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
coherences.add(new SearchCoherenceConstraint(true, List.copyOf(coh.getCoherencesList()))); phraseConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
}
else if (coh.getType() == RpcPhrases.TYPE.FULL) {
phraseConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
} }
else { else {
throw new IllegalArgumentException("Unknown coherence type: " + coh.getType()); throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
} }
} }
@ -67,7 +70,7 @@ public class IndexProtobufCodec {
query.getExcludeList(), query.getExcludeList(),
query.getAdviceList(), query.getAdviceList(),
query.getPriorityList(), query.getPriorityList(),
coherences phraseConstraints
); );
} }
@ -80,11 +83,21 @@ public class IndexProtobufCodec {
.addAllExclude(searchQuery.getSearchTermsExclude()) .addAllExclude(searchQuery.getSearchTermsExclude())
.addAllPriority(searchQuery.getSearchTermsPriority()); .addAllPriority(searchQuery.getSearchTermsPriority());
for (var coherences : searchQuery.searchTermCoherences) { for (var constraint : searchQuery.phraseConstraints) {
subqueryBuilder.addCoherencesBuilder() switch (constraint) {
.addAllCoherences(coherences.terms()) case SearchPhraseConstraint.Optional(List<String> terms) ->
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL) subqueryBuilder.addPhrasesBuilder()
.build(); .addAllTerms(terms)
.setType(RpcPhrases.TYPE.OPTIONAL);
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.MANDATORY);
case SearchPhraseConstraint.Full(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.FULL);
}
} }
return subqueryBuilder.build(); return subqueryBuilder.build();

View File

@ -1,71 +0,0 @@
package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.language.WordPatterns;
import java.util.ArrayList;
import java.util.List;
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {
public int size() {
return terms.size();
}
/** Create a new SearchCoherenceConstraint with the given terms, and the given mandatory flag.
* Stop words are replaced with empty strings.
*/
public static SearchCoherenceConstraint mandatory(String... terms) {
return new SearchCoherenceConstraint(true, trimStopWords(terms));
}
/** Create a new SearchCoherenceConstraint with the given terms, and the given mandatory flag.
* Stop words are replaced with empty strings.
*/
public static SearchCoherenceConstraint mandatory(List<String> terms) {
return new SearchCoherenceConstraint(true, trimStopWords(terms));
}
/** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
* Stop words are replaced with empty strings.
*/
public static SearchCoherenceConstraint optional(String... terms) {
return new SearchCoherenceConstraint(false, trimStopWords(terms));
}
/** Create a new SearchCoherenceConstraint with the given terms, without the mandatory flag.
* Stop words are replaced with empty strings.
*/
public static SearchCoherenceConstraint optional(List<String> terms) {
return new SearchCoherenceConstraint(false, trimStopWords(terms));
}
private static List<String> trimStopWords(List<String> terms) {
List<String> ret = new ArrayList<>(terms.size());
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
return List.copyOf(ret);
}
private static List<String> trimStopWords(String... terms) {
List<String> ret = new ArrayList<>(terms.length);
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
ret.removeFirst();
}
while (!ret.isEmpty() && "".equals(ret.getLast())) {
ret.removeLast();
}
return List.copyOf(ret);
}
}

View File

@ -0,0 +1,85 @@
package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.language.WordPatterns;
import java.util.ArrayList;
import java.util.List;
public sealed interface SearchPhraseConstraint {
record Mandatory(List<String> terms) implements SearchPhraseConstraint {
public Mandatory(String... terms) {
this(List.of(terms));
}
}
record Optional(List<String> terms) implements SearchPhraseConstraint {
public Optional(String... terms) {
this(List.of(terms));
}
}
record Full(List<String> terms) implements SearchPhraseConstraint {
public Full(String... terms) {
this(List.of(terms));
}
}
List<String> terms();
default int size() {
return terms().size();
}
static SearchPhraseConstraint mandatory(String... terms) {
return new Mandatory(trimStopWords(terms));
}
static SearchPhraseConstraint mandatory(List<String> terms) {
return new Mandatory(trimStopWords(terms));
}
static SearchPhraseConstraint optional(String... terms) {
return new Optional(trimStopWords(terms));
}
static SearchPhraseConstraint optional(List<String> terms) {
return new Optional(trimStopWords(terms));
}
static SearchPhraseConstraint full(String... terms) {
return new Full(trimStopWords(terms));
}
static SearchPhraseConstraint full(List<String> terms) {
return new Full(trimStopWords(terms));
}
private static List<String> trimStopWords(List<String> terms) {
List<String> ret = new ArrayList<>(terms.size());
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
return List.copyOf(ret);
}
private static List<String> trimStopWords(String... terms) {
List<String> ret = new ArrayList<>(terms.length);
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
ret.removeFirst();
}
while (!ret.isEmpty() && "".equals(ret.getLast())) {
ret.removeLast();
}
return List.copyOf(ret);
}
}

View File

@ -31,7 +31,7 @@ public class SearchQuery {
public final List<String> searchTermsPriority; public final List<String> searchTermsPriority;
/** Terms that we require to be in the same sentence */ /** Terms that we require to be in the same sentence */
public final List<SearchCoherenceConstraint> searchTermCoherences; public final List<SearchPhraseConstraint> phraseConstraints;
@Deprecated // why does this exist? @Deprecated // why does this exist?
private double value = 0; private double value = 0;
@ -46,7 +46,7 @@ public class SearchQuery {
this.searchTermsExclude = new ArrayList<>(); this.searchTermsExclude = new ArrayList<>();
this.searchTermsAdvice = new ArrayList<>(); this.searchTermsAdvice = new ArrayList<>();
this.searchTermsPriority = new ArrayList<>(); this.searchTermsPriority = new ArrayList<>();
this.searchTermCoherences = new ArrayList<>(); this.phraseConstraints = new ArrayList<>();
} }
public SearchQuery(String compiledQuery, public SearchQuery(String compiledQuery,
@ -54,13 +54,13 @@ public class SearchQuery {
List<String> searchTermsExclude, List<String> searchTermsExclude,
List<String> searchTermsAdvice, List<String> searchTermsAdvice,
List<String> searchTermsPriority, List<String> searchTermsPriority,
List<SearchCoherenceConstraint> searchTermCoherences) { List<SearchPhraseConstraint> phraseConstraints) {
this.compiledQuery = compiledQuery; this.compiledQuery = compiledQuery;
this.searchTermsInclude = searchTermsInclude; this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude; this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice; this.searchTermsAdvice = searchTermsAdvice;
this.searchTermsPriority = searchTermsPriority; this.searchTermsPriority = searchTermsPriority;
this.searchTermCoherences = searchTermCoherences; this.phraseConstraints = phraseConstraints;
} }
@Deprecated // why does this exist? @Deprecated // why does this exist?
@ -80,7 +80,7 @@ public class SearchQuery {
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", "))); if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString(); return sb.toString();
} }
@ -91,7 +91,7 @@ public class SearchQuery {
public final List<String> searchTermsExclude = new ArrayList<>(); public final List<String> searchTermsExclude = new ArrayList<>();
public final List<String> searchTermsAdvice = new ArrayList<>(); public final List<String> searchTermsAdvice = new ArrayList<>();
public final List<String> searchTermsPriority = new ArrayList<>(); public final List<String> searchTermsPriority = new ArrayList<>();
public final List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>(); public final List<SearchPhraseConstraint> searchPhraseConstraints = new ArrayList<>();
private SearchQueryBuilder() { private SearchQueryBuilder() {
} }
@ -121,13 +121,13 @@ public class SearchQuery {
return this; return this;
} }
public SearchQueryBuilder coherenceConstraint(SearchCoherenceConstraint constraint) { public SearchQueryBuilder phraseConstraint(SearchPhraseConstraint constraint) {
searchTermCoherences.add(constraint); searchPhraseConstraints.add(constraint);
return this; return this;
} }
public SearchQuery build() { public SearchQuery build() {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
} }
/** If there are no ranking terms, promote the advice terms to ranking terms */ /** If there are no ranking terms, promote the advice terms to ranking terms */

View File

@ -176,17 +176,18 @@ message RpcQuery {
repeated string exclude = 2; // These terms must be absent repeated string exclude = 2; // These terms must be absent
repeated string advice = 3; // These terms must be present, but do not affect ranking repeated string advice = 3; // These terms must be present, but do not affect ranking
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other repeated RpcPhrases phrases = 5; // Groups of terms that must exist in proximity of each other
string compiledQuery = 6; // Compiled query in infix notation string compiledQuery = 6; // Compiled query in infix notation
} }
/* Defines a group of search terms that must exist in close proximity within the document */ /* Defines a group of search terms that must exist in the specified order within the document */
message RpcCoherences { message RpcPhrases {
repeated string coherences = 1; repeated string terms = 1;
TYPE type = 2; TYPE type = 2;
enum TYPE { enum TYPE {
OPTIONAL = 0; OPTIONAL = 0;
MANDATORY = 1; MANDATORY = 1;
FULL = 2;
}; };
} }

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.client; package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec; import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryLimits;
@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
import java.util.List; import java.util.List;
import java.util.function.Function; import java.util.function.Function;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals;
class IndexProtobufCodecTest { class IndexProtobufCodecTest {
@Test @Test
@ -43,8 +43,8 @@ class IndexProtobufCodecTest {
List.of("e", "f"), List.of("e", "f"),
List.of("g", "h"), List.of("g", "h"),
List.of( List.of(
new SearchCoherenceConstraint(true, List.of("i", "j")), SearchPhraseConstraint.mandatory(List.of("i", "j")),
new SearchCoherenceConstraint(false, List.of("k"))) SearchPhraseConstraint.optional(List.of("k")))
), ),
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s)) s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
); );

View File

@ -73,7 +73,7 @@ public class QueryFactory {
if (parts.length > 1) { if (parts.length > 1) {
// Require that the terms appear in sequence // Require that the terms appear in sequence
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.mandatory(parts)); queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));
// Construct a regular query from the parts in the quoted string // Construct a regular query from the parts in the quoted string
queryBuilder.include(parts); queryBuilder.include(parts);
@ -126,12 +126,15 @@ public class QueryFactory {
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude); var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
// Query expansion may produce suggestions for coherence constraints, // Query expansion may produce suggestions for phrase constraints,
// add these to the query // add these to the query
for (var coh : expansion.extraCoherences()) { for (var coh : expansion.optionalPhraseConstraints()) {
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.optional(coh)); queryBuilder.phraseConstraint(SearchPhraseConstraint.optional(coh));
} }
// add a pseudo-constraint for the full query
queryBuilder.phraseConstraint(SearchPhraseConstraint.full(expansion.fullPhraseConstraint()));
queryBuilder.compiledQuery(expansion.compiledQuery()); queryBuilder.compiledQuery(expansion.compiledQuery());
var specsBuilder = SearchSpecification.builder() var specsBuilder = SearchSpecification.builder()

View File

@ -44,11 +44,17 @@ public class QueryExpansion {
strategy.expand(graph); strategy.expand(graph);
} }
List<List<String>> coherences = createSegments(graph); List<List<String>> optionalPhraseConstraints = createSegments(graph);
// also create a segmentation that is just the entire query
List<String> fullPhraseConstraint = new ArrayList<> ();
for (var qw : graph) {
fullPhraseConstraint.add(qw.word());
}
var compiled = QWordPathsRenderer.render(graph); var compiled = QWordPathsRenderer.render(graph);
return new Expansion(compiled, coherences); return new Expansion(compiled, optionalPhraseConstraints, fullPhraseConstraint);
} }
private static final Pattern dashPattern = Pattern.compile("-"); private static final Pattern dashPattern = Pattern.compile("-");
@ -144,36 +150,28 @@ public class QueryExpansion {
} }
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start)); allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
List<List<String>> coherences = new ArrayList<>(); Set<List<String>> constraints = new HashSet<>();
if (!allSegments.isEmpty()) { Set<NgramLexicon.SentenceSegment> bestSegmentation =
findBestSegmentation(allSegments);
Set<NgramLexicon.SentenceSegment> bestSegmentation = for (var segment : bestSegmentation) {
findBestSegmentation(allSegments);
for (var segment : bestSegmentation) { int start = segment.start();
int end = segment.start() + segment.length();
int start = segment.start(); List<String> components = new ArrayList<>(end - start);
int end = segment.start() + segment.length(); for (int i = start; i < end; i++) {
components.add(nodes.get(i).word());
List<String> components = new ArrayList<>(end - start);
for (int i = start; i < end; i++) {
components.add(nodes.get(i).word());
}
coherences.add(components);
// Create an n-gram search term for the segment
String word = String.join("_", components);
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
} }
constraints.add(components);
// Create an n-gram search term for the segment
String word = String.join("_", components);
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
} }
// also create a segmentation that is just the entire query return new ArrayList<>(constraints);
coherences.add(nodes.stream()
.map(QWord::word)
.collect(Collectors.toList()));
return coherences;
} }
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) { private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
@ -216,5 +214,5 @@ public class QueryExpansion {
void expand(QWordGraph graph); void expand(QWordGraph graph);
} }
public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {} public record Expansion(String compiledQuery, List<List<String>> optionalPhraseConstraints, List<String> fullPhraseConstraint) {}
} }

View File

@ -1,17 +1,17 @@
package nu.marginalia.query.svc; package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.QueryFactory; import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -143,7 +143,7 @@ public class QueryFactoryTest {
var specs = parseAndGetSpecs("\"tde shining\""); var specs = parseAndGetSpecs("\"tde shining\"");
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery); assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
assertEquals(List.of("tde_shining"), specs.query.searchTermsPriority); assertEquals(List.of("tde_shining"), specs.query.searchTermsPriority);
assertEquals(List.of(new SearchCoherenceConstraint(true, List.of("tde", "shining"))), specs.query.searchTermCoherences); assertEquals(List.of(new SearchPhraseConstraint.Mandatory(List.of("tde", "shining"))), specs.query.phraseConstraints);
} }
} }

View File

@ -7,9 +7,6 @@ import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.model.SearchTermsUtil.getWordId; import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
public final class SearchTerms { public final class SearchTerms {
@ -17,9 +14,6 @@ public final class SearchTerms {
private final LongList excludes; private final LongList excludes;
private final LongList priority; private final LongList priority;
private final List<LongList> coherencesMandatory;
private final List<LongList> coherencesOptional;
public static final LongArraySet stopWords = new LongArraySet( public static final LongArraySet stopWords = new LongArraySet(
new long[] { new long[] {
getWordId("a"), getWordId("a"),
@ -36,9 +30,6 @@ public final class SearchTerms {
this.excludes = new LongArrayList(); this.excludes = new LongArrayList();
this.priority = new LongArrayList(); this.priority = new LongArrayList();
this.coherencesMandatory = new ArrayList<>();
this.coherencesOptional = new ArrayList<>();
this.advice = new LongArrayList(); this.advice = new LongArrayList();
this.compiledQueryIds = compiledQueryIds; this.compiledQueryIds = compiledQueryIds;
@ -46,21 +37,6 @@ public final class SearchTerms {
advice.add(getWordId(word)); advice.add(getWordId(word));
} }
for (var coherence : query.searchTermCoherences) {
LongList parts = new LongArrayList(coherence.size());
for (var word : coherence.terms()) {
parts.add(getWordId(word));
}
if (coherence.mandatory()) {
coherencesMandatory.add(parts);
}
else {
coherencesOptional.add(parts);
}
}
for (var word : query.searchTermsExclude) { for (var word : query.searchTermsExclude) {
excludes.add(getWordId(word)); excludes.add(getWordId(word));
} }
@ -91,12 +67,6 @@ public final class SearchTerms {
return priority; return priority;
} }
public List<LongList> coherencesMandatory() {
return coherencesMandatory;
}
public List<LongList> coherencesOptional() {
return coherencesOptional;
}
public CompiledQueryLong compiledQuery() { return compiledQueryIds; } public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
} }

View File

@ -10,6 +10,7 @@ import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@ -18,8 +19,8 @@ import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms; import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList; import nu.marginalia.index.results.model.ids.TermMetadataList;
@ -97,7 +98,7 @@ public class IndexResultRankingService {
} }
// Ignore documents that don't match the mandatory constraints // Ignore documents that don't match the mandatory constraints
if (!searchTerms.coherences.testMandatory(positions)) { if (!searchTerms.phraseConstraints.testMandatory(positions)) {
continue; continue;
} }
@ -295,14 +296,26 @@ public class IndexResultRankingService {
var idsAll = new TermIdList(termIdsList); var idsAll = new TermIdList(termIdsList);
var constraints = new ArrayList<TermCoherenceGroupList.TermCoherenceGroup>(); var constraintsMandatory = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
for (var coherence : searchQuery.searchTermCoherences) { var constraintsFull = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
constraints.add(new TermCoherenceGroupList.TermCoherenceGroup(coherence, idsAll)); var constraintsOptional = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
for (var constraint : searchQuery.phraseConstraints) {
switch (constraint) {
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
case SearchPhraseConstraint.Optional(List<String> terms) ->
constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
case SearchPhraseConstraint.Full(List<String> terms) ->
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
}
} }
assert constraintsFull.size() == 1 : "Exactly one full constraint group is required";
return new QuerySearchTerms(termToId, return new QuerySearchTerms(termToId,
idsAll, idsAll,
new TermCoherenceGroupList(constraints) new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional)
); );
} }
} }

View File

@ -13,8 +13,8 @@ import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms; import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
@ -103,7 +103,7 @@ public class IndexResultScoreCalculator {
docSize, docSize,
spans, spans,
positions, positions,
searchTerms.coherences, searchTerms.phraseConstraints,
rankingContext); rankingContext);
return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score); return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
@ -155,7 +155,7 @@ public class IndexResultScoreCalculator {
int length, int length,
DocumentSpans spans, DocumentSpans spans,
CodedSequence[] positions, CodedSequence[] positions,
TermCoherenceGroupList coherences, PhraseConstraintGroupList constraintGroups,
ResultRankingContext ctx) ResultRankingContext ctx)
{ {
if (length < 0) { if (length < 0) {
@ -192,7 +192,7 @@ public class IndexResultScoreCalculator {
VerbatimMatches verbatimMatches = new VerbatimMatches(); VerbatimMatches verbatimMatches = new VerbatimMatches();
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, coherences, positions, spans); float verbatimMatchScore = findVerbatimMatches(verbatimMatches, constraintGroups, positions, spans);
float[] weightedCounts = new float[compiledQuery.size()]; float[] weightedCounts = new float[compiledQuery.size()];
float keywordMinDistFac = 0; float keywordMinDistFac = 0;
@ -373,19 +373,19 @@ public class IndexResultScoreCalculator {
} }
private float findVerbatimMatches(VerbatimMatches verbatimMatches, private float findVerbatimMatches(VerbatimMatches verbatimMatches,
TermCoherenceGroupList coherences, PhraseConstraintGroupList constraints,
CodedSequence[] positions, CodedSequence[] positions,
DocumentSpans spans) { DocumentSpans spans) {
// Calculate a bonus for keyword coherences when large ones exist // Calculate a bonus for keyword coherences when large ones exist
int largestOptional = coherences.largestOptional(); int largestOptional = constraints.largestOptional();
if (largestOptional < 2) { if (largestOptional < 2) {
return 0; return 0;
} }
float verbatimMatchScore = 0.f; float verbatimMatchScore = 0.f;
for (var optionalGroup : coherences.getOptionalGroups()) { for (var optionalGroup : constraints.getOptionalGroups()) {
int groupSize = optionalGroup.size; int groupSize = optionalGroup.size;
float sizeScalingFactor = groupSize / (float) largestOptional; float sizeScalingFactor = groupSize / (float) largestOptional;
@ -400,8 +400,8 @@ public class IndexResultScoreCalculator {
} }
} }
if (coherences.numOptional() > 0) { if (constraints.numOptional() > 0) {
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2); verbatimMatchScore += (float) Math.pow(constraints.countOptional(positions) / (double) constraints.numOptional(), 2);
} }
return verbatimMatchScore; return verbatimMatchScore;

View File

@ -1,7 +1,6 @@
package nu.marginalia.index.results.model; package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.forward.spans.DocumentSpan; import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.index.results.model.ids.TermIdList;
@ -16,28 +15,32 @@ import java.util.List;
/** /**
* wordIds that we require to be in the same sentence * wordIds that we require to be in the same sentence
*/ */
public class TermCoherenceGroupList { public class PhraseConstraintGroupList {
List<TermCoherenceGroup> mandatoryGroups = new ArrayList<>(); List<PhraseConstraintGroup> mandatoryGroups = new ArrayList<>();
List<TermCoherenceGroup> optionalGroups = new ArrayList<>(); List<PhraseConstraintGroup> optionalGroups = new ArrayList<>();
PhraseConstraintGroup fullGroup;
public TermCoherenceGroupList(List<TermCoherenceGroup> groups) { public PhraseConstraintGroupList(
for (var group : groups) { PhraseConstraintGroup fullGroup,
if (group.mandatory) { List<PhraseConstraintGroup> mandatoryGroups,
mandatoryGroups.add(group); List<PhraseConstraintGroup> optionalGroups) {
} else { this.mandatoryGroups.addAll(mandatoryGroups);
optionalGroups.add(group); this.optionalGroups.addAll(optionalGroups);
} this.fullGroup = fullGroup;
}
} }
public List<TermCoherenceGroup> getOptionalGroups() { public List<PhraseConstraintGroup> getOptionalGroups() {
return Collections.unmodifiableList(optionalGroups); return Collections.unmodifiableList(optionalGroups);
} }
public PhraseConstraintGroup getFullGroup() {
return fullGroup;
}
public boolean testMandatory(CodedSequence[] positions) { public boolean testMandatory(CodedSequence[] positions) {
for (var coherenceSet : mandatoryGroups) { for (var constraint : mandatoryGroups) {
if (!coherenceSet.test(positions)) { if (!constraint.test(positions)) {
return false; return false;
} }
} }
@ -48,9 +51,9 @@ public class TermCoherenceGroupList {
public int testOptional(CodedSequence[] positions) { public int testOptional(CodedSequence[] positions) {
int best = 0; int best = 0;
for (var coherenceSet : optionalGroups) { for (var constraint : optionalGroups) {
if (coherenceSet.test(positions)) { if (constraint.test(positions)) {
best = Math.max(coherenceSet.size, best); best = Math.max(constraint.size, best);
} }
} }
return best; return best;
@ -59,8 +62,8 @@ public class TermCoherenceGroupList {
public int countOptional(CodedSequence[] positions) { public int countOptional(CodedSequence[] positions) {
int ct = 0; int ct = 0;
for (var coherenceSet : optionalGroups) { for (var constraint : optionalGroups) {
if (coherenceSet.test(positions)) { if (constraint.test(positions)) {
ct++; ct++;
} }
} }
@ -70,17 +73,17 @@ public class TermCoherenceGroupList {
public int testOptional(CodedSequence[] positions, DocumentSpan span) { public int testOptional(CodedSequence[] positions, DocumentSpan span) {
int best = 0; int best = 0;
for (var coherenceSet : optionalGroups) { for (var constraint : optionalGroups) {
if (coherenceSet.test(span, positions)) { if (constraint.test(span, positions)) {
best = Math.max(coherenceSet.size, best); best = Math.max(constraint.size, best);
} }
} }
return best; return best;
} }
public boolean allOptionalInSpan(CodedSequence[] positions, DocumentSpan span) { public boolean allOptionalInSpan(CodedSequence[] positions, DocumentSpan span) {
for (var coherenceSet : optionalGroups) { for (var constraint : optionalGroups) {
if (!coherenceSet.test(span, positions)) { if (!constraint.test(span, positions)) {
return false; return false;
} }
} }
@ -91,36 +94,48 @@ public class TermCoherenceGroupList {
return optionalGroups.size(); return optionalGroups.size();
} }
public int largestOptional() { public int largestOptional() {
int best = 0; return fullGroup.size;
for (var coherenceSet : optionalGroups) {
best = Math.max(coherenceSet.size, best);
}
return best;
} }
public static final class TermCoherenceGroup { public static final class PhraseConstraintGroup {
private final int[] offsets; private final int[] offsets;
private final BitSet present; private final BitSet present;
private final BitSet termIdsMask;
public final int size; public final int size;
public final boolean mandatory; public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
public TermCoherenceGroup(SearchCoherenceConstraint cons, TermIdList termIdsAll) { offsets = new int[terms.size()];
offsets = new int[cons.size()]; present = new BitSet(terms.size());
present = new BitSet(cons.size()); size = terms.size();
mandatory = cons.mandatory();
size = cons.size(); termIdsMask = new BitSet(termIdsAll.size());
int i = 0; int i = 0;
for (String term : cons.terms()) { for (String term : terms) {
if (!term.isEmpty()) { if (term.isEmpty()) {
present.set(i); continue;
long termId = SearchTermsUtil.getWordId(term); }
offsets[i++] = termIdsAll.indexOf(termId);
present.set(i);
long termId = SearchTermsUtil.getWordId(term);
int idx = termIdsAll.indexOf(termId);
if (idx < 0) {
offsets[i++] = -1;
}
else {
offsets[i++] = idx;
termIdsMask.set(idx);
} }
} }
} }
/** Returns true if the term with index termIdx in the query is in the group */
public boolean containsTerm(int termIdx) {
return termIdsMask.get(termIdx);
}
public boolean test(CodedSequence[] positions) { public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()]; IntIterator[] sequences = new IntIterator[present.cardinality()];

View File

@ -7,14 +7,14 @@ public class QuerySearchTerms {
private final TObjectLongHashMap<String> termToId; private final TObjectLongHashMap<String> termToId;
public final TermIdList termIdsAll; public final TermIdList termIdsAll;
public final TermCoherenceGroupList coherences; public final PhraseConstraintGroupList phraseConstraints;
public QuerySearchTerms(TObjectLongHashMap<String> termToId, public QuerySearchTerms(TObjectLongHashMap<String> termToId,
TermIdList termIdsAll, TermIdList termIdsAll,
TermCoherenceGroupList coherences) { PhraseConstraintGroupList phraseConstraints) {
this.termToId = termToId; this.termToId = termToId;
this.termIdsAll = termIdsAll; this.termIdsAll = termIdsAll;
this.coherences = coherences; this.phraseConstraints = phraseConstraints;
} }
public long getIdForTerm(String searchTerm) { public long getIdForTerm(String searchTerm) {

View File

@ -4,7 +4,7 @@ import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -175,7 +175,7 @@ public class IndexQueryServiceIntegrationTest {
List.of(), List.of(),
List.of(), List.of(),
List.of(), List.of(),
List.of(SearchCoherenceConstraint.mandatory(List.of("missing", "hello"))) List.of(SearchPhraseConstraint.mandatory(List.of("missing", "hello")))
))); )));
executeSearch(queryMissingCoherence) executeSearch(queryMissingCoherence)
@ -443,7 +443,7 @@ public class IndexQueryServiceIntegrationTest {
List.of(), List.of(),
List.of(), List.of(),
List.of(), List.of(),
List.of(SearchCoherenceConstraint.mandatory(List.of(includes))) List.of(SearchPhraseConstraint.mandatory(List.of(includes)))
); );
} }
private MockDataDocument d(int domainId, int ordinal) { private MockDataDocument d(int domainId, int ordinal) {

View File

@ -79,9 +79,9 @@
<tr> <th title="terms that must be present">Search Terms Exclude</th><td>{{#each specs.query.searchTermsExclude}} {{.}} {{/each}}</td> </tr> <tr> <th title="terms that must be present">Search Terms Exclude</th><td>{{#each specs.query.searchTermsExclude}} {{.}} {{/each}}</td> </tr>
<tr> <th title="mandatory terms, no effect on ranking">Search Terms Advice</th><td>{{#each specs.query.searchTermsAdvice}} {{.}} {{/each}}</td> </tr> <tr> <th title="mandatory terms, no effect on ranking">Search Terms Advice</th><td>{{#each specs.query.searchTermsAdvice}} {{.}} {{/each}}</td> </tr>
<tr> <th title="not mandatory, effects ranking">Search Terms Priority</th><td>{{#each specs.query.searchTermsPriority}} {{.}} {{/each}}</td> </tr> <tr> <th title="not mandatory, effects ranking">Search Terms Priority</th><td>{{#each specs.query.searchTermsPriority}} {{.}} {{/each}}</td> </tr>
{{#each specs.query.searchTermCoherences}} {{#each specs.query.phraseConstraints}}
<tr> <tr>
<th title="terms must appear close by">Coherence Requirement</th> <th title="terms must appear close by">Phrase Constraints</th>
<td> <td>
{{#each .}} {{#each .}}
{{.}} {{.}}