(*) Refactor termCoherences and rename them to phrase constraints.

This commit is contained in:
Viktor Lofgren 2024-08-15 11:02:19 +02:00
parent b2a3cac351
commit 03d5dec24c
16 changed files with 259 additions and 232 deletions

View File

@ -1,6 +1,6 @@
package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -46,18 +46,21 @@ public class IndexProtobufCodec {
}
public static SearchQuery convertRpcQuery(RpcQuery query) {
List<SearchCoherenceConstraint> coherences = new ArrayList<>();
List<SearchPhraseConstraint> phraseConstraints = new ArrayList<>();
for (int j = 0; j < query.getCoherencesCount(); j++) {
var coh = query.getCoherences(j);
if (coh.getType() == RpcCoherences.TYPE.OPTIONAL) {
coherences.add(new SearchCoherenceConstraint(false, List.copyOf(coh.getCoherencesList())));
for (int j = 0; j < query.getPhrasesCount(); j++) {
var coh = query.getPhrases(j);
if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) {
phraseConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList())));
}
else if (coh.getType() == RpcCoherences.TYPE.MANDATORY) {
coherences.add(new SearchCoherenceConstraint(true, List.copyOf(coh.getCoherencesList())));
else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) {
phraseConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList())));
}
else if (coh.getType() == RpcPhrases.TYPE.FULL) {
phraseConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList())));
}
else {
throw new IllegalArgumentException("Unknown coherence type: " + coh.getType());
throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType());
}
}
@ -67,7 +70,7 @@ public class IndexProtobufCodec {
query.getExcludeList(),
query.getAdviceList(),
query.getPriorityList(),
coherences
phraseConstraints
);
}
@ -80,11 +83,21 @@ public class IndexProtobufCodec {
.addAllExclude(searchQuery.getSearchTermsExclude())
.addAllPriority(searchQuery.getSearchTermsPriority());
for (var coherences : searchQuery.searchTermCoherences) {
subqueryBuilder.addCoherencesBuilder()
.addAllCoherences(coherences.terms())
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL)
.build();
for (var constraint : searchQuery.phraseConstraints) {
switch (constraint) {
case SearchPhraseConstraint.Optional(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.OPTIONAL);
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.MANDATORY);
case SearchPhraseConstraint.Full(List<String> terms) ->
subqueryBuilder.addPhrasesBuilder()
.addAllTerms(terms)
.setType(RpcPhrases.TYPE.FULL);
}
}
return subqueryBuilder.build();

View File

@ -1,71 +0,0 @@
package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.language.WordPatterns;
import java.util.ArrayList;
import java.util.List;
/** A group of search terms required to appear in close proximity (the same sentence)
 * within a document.
 *
 * @param mandatory if true, documents failing the constraint are rejected outright;
 *                  if false, the constraint only affects ranking
 * @param terms     the terms, in order; stop words are represented by empty strings,
 *                  which act as positional wildcards
 */
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {

    /** Number of term slots in the constraint, including blanked-out stop words. */
    public int size() {
        return terms.size();
    }

    /** Create a mandatory constraint from the given terms.
     * Stop words are replaced with empty strings.
     */
    public static SearchCoherenceConstraint mandatory(String... terms) {
        return new SearchCoherenceConstraint(true, trimStopWords(terms));
    }

    /** Create a mandatory constraint from the given terms.
     * Stop words are replaced with empty strings.
     */
    public static SearchCoherenceConstraint mandatory(List<String> terms) {
        return new SearchCoherenceConstraint(true, trimStopWords(terms));
    }

    /** Create an optional (ranking-only) constraint from the given terms.
     * Stop words are replaced with empty strings.
     */
    public static SearchCoherenceConstraint optional(String... terms) {
        return new SearchCoherenceConstraint(false, trimStopWords(terms));
    }

    /** Create an optional (ranking-only) constraint from the given terms.
     * Stop words are replaced with empty strings.
     */
    public static SearchCoherenceConstraint optional(List<String> terms) {
        return new SearchCoherenceConstraint(false, trimStopWords(terms));
    }

    /** Replace stop words with empty-string wildcards.
     * NOTE(review): unlike the varargs overload below, this version does not drop
     * leading/trailing empties — confirm whether the asymmetry is intentional. */
    private static List<String> trimStopWords(List<String> terms) {
        List<String> ret = new ArrayList<>(terms.size());
        for (var term : terms) {
            if (WordPatterns.isStopWord(term)) {
                ret.add("");
            } else {
                ret.add(term);
            }
        }
        return List.copyOf(ret);
    }

    /** Replace stop words with empty-string wildcards, then drop leading and trailing
     * empties so the constraint does not begin or end with a wildcard. */
    private static List<String> trimStopWords(String... terms) {
        List<String> ret = new ArrayList<>(terms.length);
        for (var term : terms) {
            if (WordPatterns.isStopWord(term)) {
                ret.add("");
            } else {
                ret.add(term);
            }
        }
        while (!ret.isEmpty() && "".equals(ret.getFirst())) {
            ret.removeFirst();
        }
        while (!ret.isEmpty() && "".equals(ret.getLast())) {
            ret.removeLast();
        }
        return List.copyOf(ret);
    }
}

View File

@ -0,0 +1,85 @@
package nu.marginalia.api.searchquery.model.query;
import nu.marginalia.language.WordPatterns;
import java.util.ArrayList;
import java.util.List;
public sealed interface SearchPhraseConstraint {
record Mandatory(List<String> terms) implements SearchPhraseConstraint {
public Mandatory(String... terms) {
this(List.of(terms));
}
}
record Optional(List<String> terms) implements SearchPhraseConstraint {
public Optional(String... terms) {
this(List.of(terms));
}
}
record Full(List<String> terms) implements SearchPhraseConstraint {
public Full(String... terms) {
this(List.of(terms));
}
}
List<String> terms();
default int size() {
return terms().size();
}
static SearchPhraseConstraint mandatory(String... terms) {
return new Mandatory(trimStopWords(terms));
}
static SearchPhraseConstraint mandatory(List<String> terms) {
return new Mandatory(trimStopWords(terms));
}
static SearchPhraseConstraint optional(String... terms) {
return new Optional(trimStopWords(terms));
}
static SearchPhraseConstraint optional(List<String> terms) {
return new Optional(trimStopWords(terms));
}
static SearchPhraseConstraint full(String... terms) {
return new Full(trimStopWords(terms));
}
static SearchPhraseConstraint full(List<String> terms) {
return new Full(trimStopWords(terms));
}
private static List<String> trimStopWords(List<String> terms) {
List<String> ret = new ArrayList<>(terms.size());
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
return List.copyOf(ret);
}
private static List<String> trimStopWords(String... terms) {
List<String> ret = new ArrayList<>(terms.length);
for (var term : terms) {
if (WordPatterns.isStopWord(term)) {
ret.add("");
} else {
ret.add(term);
}
}
while (!ret.isEmpty() && "".equals(ret.getFirst())) {
ret.removeFirst();
}
while (!ret.isEmpty() && "".equals(ret.getLast())) {
ret.removeLast();
}
return List.copyOf(ret);
}
}

View File

@ -31,7 +31,7 @@ public class SearchQuery {
public final List<String> searchTermsPriority;
/** Groups of terms that must appear in close proximity, in the specified order */
public final List<SearchCoherenceConstraint> searchTermCoherences;
public final List<SearchPhraseConstraint> phraseConstraints;
@Deprecated // why does this exist?
private double value = 0;
@ -46,7 +46,7 @@ public class SearchQuery {
this.searchTermsExclude = new ArrayList<>();
this.searchTermsAdvice = new ArrayList<>();
this.searchTermsPriority = new ArrayList<>();
this.searchTermCoherences = new ArrayList<>();
this.phraseConstraints = new ArrayList<>();
}
public SearchQuery(String compiledQuery,
@ -54,13 +54,13 @@ public class SearchQuery {
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority,
List<SearchCoherenceConstraint> searchTermCoherences) {
List<SearchPhraseConstraint> phraseConstraints) {
this.compiledQuery = compiledQuery;
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
this.searchTermsAdvice = searchTermsAdvice;
this.searchTermsPriority = searchTermsPriority;
this.searchTermCoherences = searchTermCoherences;
this.phraseConstraints = phraseConstraints;
}
@Deprecated // why does this exist?
@ -80,7 +80,7 @@ public class SearchQuery {
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString();
}
@ -91,7 +91,7 @@ public class SearchQuery {
public final List<String> searchTermsExclude = new ArrayList<>();
public final List<String> searchTermsAdvice = new ArrayList<>();
public final List<String> searchTermsPriority = new ArrayList<>();
public final List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
public final List<SearchPhraseConstraint> searchPhraseConstraints = new ArrayList<>();
private SearchQueryBuilder() {
}
@ -121,13 +121,13 @@ public class SearchQuery {
return this;
}
public SearchQueryBuilder coherenceConstraint(SearchCoherenceConstraint constraint) {
searchTermCoherences.add(constraint);
public SearchQueryBuilder phraseConstraint(SearchPhraseConstraint constraint) {
searchPhraseConstraints.add(constraint);
return this;
}
public SearchQuery build() {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
}
/** If there are no ranking terms, promote the advice terms to ranking terms */

View File

@ -176,17 +176,18 @@ message RpcQuery {
repeated string exclude = 2; // These terms must be absent
repeated string advice = 3; // These terms must be present, but do not affect ranking
repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present
repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other
repeated RpcPhrases phrases = 5; // Groups of terms that must exist in proximity of each other
string compiledQuery = 6; // Compiled query in infix notation
}
/* Defines a group of search terms that must exist in close proximity within the document */
message RpcCoherences {
repeated string coherences = 1;
/* Defines a group of search terms that must exist in the specified order within the document */
message RpcPhrases {
repeated string terms = 1;
TYPE type = 2;
enum TYPE {
OPTIONAL = 0;
MANDATORY = 1;
FULL = 2;
};
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
@ -11,7 +11,7 @@ import org.junit.jupiter.api.Test;
import java.util.List;
import java.util.function.Function;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class IndexProtobufCodecTest {
@Test
@ -43,8 +43,8 @@ class IndexProtobufCodecTest {
List.of("e", "f"),
List.of("g", "h"),
List.of(
new SearchCoherenceConstraint(true, List.of("i", "j")),
new SearchCoherenceConstraint(false, List.of("k")))
SearchPhraseConstraint.mandatory(List.of("i", "j")),
SearchPhraseConstraint.optional(List.of("k")))
),
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
);

View File

@ -73,7 +73,7 @@ public class QueryFactory {
if (parts.length > 1) {
// Require that the terms appear in sequence
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.mandatory(parts));
queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts));
// Construct a regular query from the parts in the quoted string
queryBuilder.include(parts);
@ -126,12 +126,15 @@ public class QueryFactory {
var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude);
// Query expansion may produce suggestions for coherence constraints,
// Query expansion may produce suggestions for phrase constraints,
// add these to the query
for (var coh : expansion.extraCoherences()) {
queryBuilder.coherenceConstraint(SearchCoherenceConstraint.optional(coh));
for (var coh : expansion.optionalPharseConstraints()) {
queryBuilder.phraseConstraint(SearchPhraseConstraint.optional(coh));
}
// add a pseudo-constraint for the full query
queryBuilder.phraseConstraint(SearchPhraseConstraint.full(expansion.fullPhraseConstraint()));
queryBuilder.compiledQuery(expansion.compiledQuery());
var specsBuilder = SearchSpecification.builder()

View File

@ -44,11 +44,17 @@ public class QueryExpansion {
strategy.expand(graph);
}
List<List<String>> coherences = createSegments(graph);
List<List<String>> optionalPhraseConstraints = createSegments(graph);
// also create a segmentation that is just the entire query
List<String> fullPhraseConstraint = new ArrayList<> ();
for (var qw : graph) {
fullPhraseConstraint.add(qw.word());
}
var compiled = QWordPathsRenderer.render(graph);
return new Expansion(compiled, coherences);
return new Expansion(compiled, optionalPhraseConstraints, fullPhraseConstraint);
}
private static final Pattern dashPattern = Pattern.compile("-");
@ -144,36 +150,28 @@ public class QueryExpansion {
}
allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start));
List<List<String>> coherences = new ArrayList<>();
Set<List<String>> constraints = new HashSet<>();
if (!allSegments.isEmpty()) {
Set<NgramLexicon.SentenceSegment> bestSegmentation =
findBestSegmentation(allSegments);
Set<NgramLexicon.SentenceSegment> bestSegmentation =
findBestSegmentation(allSegments);
for (var segment : bestSegmentation) {
for (var segment : bestSegmentation) {
int start = segment.start();
int end = segment.start() + segment.length();
int start = segment.start();
int end = segment.start() + segment.length();
List<String> components = new ArrayList<>(end - start);
for (int i = start; i < end; i++) {
components.add(nodes.get(i).word());
}
coherences.add(components);
// Create an n-gram search term for the segment
String word = String.join("_", components);
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
List<String> components = new ArrayList<>(end - start);
for (int i = start; i < end; i++) {
components.add(nodes.get(i).word());
}
constraints.add(components);
// Create an n-gram search term for the segment
String word = String.join("_", components);
graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word);
}
// also create a segmentation that is just the entire query
coherences.add(nodes.stream()
.map(QWord::word)
.collect(Collectors.toList()));
return coherences;
return new ArrayList<>(constraints);
}
private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) {
@ -216,5 +214,5 @@ public class QueryExpansion {
void expand(QWordGraph graph);
}
public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {}
public record Expansion(String compiledQuery, List<List<String>> optionalPharseConstraints, List<String> fullPhraseConstraint) {}
}

View File

@ -1,17 +1,17 @@
package nu.marginalia.query.svc;
import nu.marginalia.WmsaHome;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.segmentation.NgramLexicon;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
@ -143,7 +143,7 @@ public class QueryFactoryTest {
var specs = parseAndGetSpecs("\"tde shining\"");
assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery);
assertEquals(List.of("tde_shining"), specs.query.searchTermsPriority);
assertEquals(List.of(new SearchCoherenceConstraint(true, List.of("tde", "shining"))), specs.query.searchTermCoherences);
assertEquals(List.of(new SearchPhraseConstraint.Mandatory(List.of("tde", "shining"))), specs.query.phraseConstraints);
}
}

View File

@ -7,9 +7,6 @@ import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
public final class SearchTerms {
@ -17,9 +14,6 @@ public final class SearchTerms {
private final LongList excludes;
private final LongList priority;
private final List<LongList> coherencesMandatory;
private final List<LongList> coherencesOptional;
public static final LongArraySet stopWords = new LongArraySet(
new long[] {
getWordId("a"),
@ -36,9 +30,6 @@ public final class SearchTerms {
this.excludes = new LongArrayList();
this.priority = new LongArrayList();
this.coherencesMandatory = new ArrayList<>();
this.coherencesOptional = new ArrayList<>();
this.advice = new LongArrayList();
this.compiledQueryIds = compiledQueryIds;
@ -46,21 +37,6 @@ public final class SearchTerms {
advice.add(getWordId(word));
}
for (var coherence : query.searchTermCoherences) {
LongList parts = new LongArrayList(coherence.size());
for (var word : coherence.terms()) {
parts.add(getWordId(word));
}
if (coherence.mandatory()) {
coherencesMandatory.add(parts);
}
else {
coherencesOptional.add(parts);
}
}
for (var word : query.searchTermsExclude) {
excludes.add(getWordId(word));
}
@ -91,12 +67,6 @@ public final class SearchTerms {
return priority;
}
public List<LongList> coherencesMandatory() {
return coherencesMandatory;
}
public List<LongList> coherencesOptional() {
return coherencesOptional;
}
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
}

View File

@ -10,6 +10,7 @@ import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import nu.marginalia.api.searchquery.*;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
@ -18,8 +19,8 @@ import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
@ -97,7 +98,7 @@ public class IndexResultRankingService {
}
// Ignore documents that don't match the mandatory constraints
if (!searchTerms.coherences.testMandatory(positions)) {
if (!searchTerms.phraseConstraints.testMandatory(positions)) {
continue;
}
@ -295,14 +296,26 @@ public class IndexResultRankingService {
var idsAll = new TermIdList(termIdsList);
var constraints = new ArrayList<TermCoherenceGroupList.TermCoherenceGroup>();
for (var coherence : searchQuery.searchTermCoherences) {
constraints.add(new TermCoherenceGroupList.TermCoherenceGroup(coherence, idsAll));
var constraintsMandatory = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
var constraintsFull = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
var constraintsOptional = new ArrayList<PhraseConstraintGroupList.PhraseConstraintGroup>();
for (var constraint : searchQuery.phraseConstraints) {
switch (constraint) {
case SearchPhraseConstraint.Mandatory(List<String> terms) ->
constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
case SearchPhraseConstraint.Optional(List<String> terms) ->
constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
case SearchPhraseConstraint.Full(List<String> terms) ->
constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll));
}
}
assert constraintsFull.size() == 1 : "Exactly one full constraint group is required";
return new QuerySearchTerms(termToId,
idsAll,
new TermCoherenceGroupList(constraints)
new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional)
);
}
}

View File

@ -13,8 +13,8 @@ import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.PhraseConstraintGroupList;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
@ -103,7 +103,7 @@ public class IndexResultScoreCalculator {
docSize,
spans,
positions,
searchTerms.coherences,
searchTerms.phraseConstraints,
rankingContext);
return new SearchResultItem(combinedId, docMetadata, htmlFeatures, score);
@ -155,7 +155,7 @@ public class IndexResultScoreCalculator {
int length,
DocumentSpans spans,
CodedSequence[] positions,
TermCoherenceGroupList coherences,
PhraseConstraintGroupList constraintGroups,
ResultRankingContext ctx)
{
if (length < 0) {
@ -192,7 +192,7 @@ public class IndexResultScoreCalculator {
VerbatimMatches verbatimMatches = new VerbatimMatches();
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, coherences, positions, spans);
float verbatimMatchScore = findVerbatimMatches(verbatimMatches, constraintGroups, positions, spans);
float[] weightedCounts = new float[compiledQuery.size()];
float keywordMinDistFac = 0;
@ -373,19 +373,19 @@ public class IndexResultScoreCalculator {
}
private float findVerbatimMatches(VerbatimMatches verbatimMatches,
TermCoherenceGroupList coherences,
PhraseConstraintGroupList constraints,
CodedSequence[] positions,
DocumentSpans spans) {
// Calculate a bonus for phrase constraint matches when large ones exist
int largestOptional = coherences.largestOptional();
int largestOptional = constraints.largestOptional();
if (largestOptional < 2) {
return 0;
}
float verbatimMatchScore = 0.f;
for (var optionalGroup : coherences.getOptionalGroups()) {
for (var optionalGroup : constraints.getOptionalGroups()) {
int groupSize = optionalGroup.size;
float sizeScalingFactor = groupSize / (float) largestOptional;
@ -400,8 +400,8 @@ public class IndexResultScoreCalculator {
}
}
if (coherences.numOptional() > 0) {
verbatimMatchScore += (float) Math.pow(coherences.countOptional(positions) / (double) coherences.numOptional(), 2);
if (constraints.numOptional() > 0) {
verbatimMatchScore += (float) Math.pow(constraints.countOptional(positions) / (double) constraints.numOptional(), 2);
}
return verbatimMatchScore;

View File

@ -1,7 +1,6 @@
package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList;
@ -16,28 +15,32 @@ import java.util.List;
/**
 * Groups of wordIds with phrase (positional) constraints within the document
 */
public class TermCoherenceGroupList {
List<TermCoherenceGroup> mandatoryGroups = new ArrayList<>();
List<TermCoherenceGroup> optionalGroups = new ArrayList<>();
public class PhraseConstraintGroupList {
List<PhraseConstraintGroup> mandatoryGroups = new ArrayList<>();
List<PhraseConstraintGroup> optionalGroups = new ArrayList<>();
PhraseConstraintGroup fullGroup;
public TermCoherenceGroupList(List<TermCoherenceGroup> groups) {
for (var group : groups) {
if (group.mandatory) {
mandatoryGroups.add(group);
} else {
optionalGroups.add(group);
}
}
public PhraseConstraintGroupList(
PhraseConstraintGroup fullGroup,
List<PhraseConstraintGroup> mandatoryGroups,
List<PhraseConstraintGroup> optionalGroups) {
this.mandatoryGroups.addAll(mandatoryGroups);
this.optionalGroups.addAll(optionalGroups);
this.fullGroup = fullGroup;
}
public List<TermCoherenceGroup> getOptionalGroups() {
public List<PhraseConstraintGroup> getOptionalGroups() {
return Collections.unmodifiableList(optionalGroups);
}
public PhraseConstraintGroup getFullGroup() {
return fullGroup;
}
public boolean testMandatory(CodedSequence[] positions) {
for (var coherenceSet : mandatoryGroups) {
if (!coherenceSet.test(positions)) {
for (var constraint : mandatoryGroups) {
if (!constraint.test(positions)) {
return false;
}
}
@ -48,9 +51,9 @@ public class TermCoherenceGroupList {
public int testOptional(CodedSequence[] positions) {
int best = 0;
for (var coherenceSet : optionalGroups) {
if (coherenceSet.test(positions)) {
best = Math.max(coherenceSet.size, best);
for (var constraint : optionalGroups) {
if (constraint.test(positions)) {
best = Math.max(constraint.size, best);
}
}
return best;
@ -59,8 +62,8 @@ public class TermCoherenceGroupList {
public int countOptional(CodedSequence[] positions) {
int ct = 0;
for (var coherenceSet : optionalGroups) {
if (coherenceSet.test(positions)) {
for (var constraint : optionalGroups) {
if (constraint.test(positions)) {
ct++;
}
}
@ -70,17 +73,17 @@ public class TermCoherenceGroupList {
public int testOptional(CodedSequence[] positions, DocumentSpan span) {
int best = 0;
for (var coherenceSet : optionalGroups) {
if (coherenceSet.test(span, positions)) {
best = Math.max(coherenceSet.size, best);
for (var constraint : optionalGroups) {
if (constraint.test(span, positions)) {
best = Math.max(constraint.size, best);
}
}
return best;
}
public boolean allOptionalInSpan(CodedSequence[] positions, DocumentSpan span) {
for (var coherenceSet : optionalGroups) {
if (!coherenceSet.test(span, positions)) {
for (var constraint : optionalGroups) {
if (!constraint.test(span, positions)) {
return false;
}
}
@ -91,36 +94,48 @@ public class TermCoherenceGroupList {
return optionalGroups.size();
}
public int largestOptional() {
int best = 0;
for (var coherenceSet : optionalGroups) {
best = Math.max(coherenceSet.size, best);
}
return best;
return fullGroup.size;
}
public static final class TermCoherenceGroup {
public static final class PhraseConstraintGroup {
private final int[] offsets;
private final BitSet present;
private final BitSet termIdsMask;
public final int size;
public final boolean mandatory;
public TermCoherenceGroup(SearchCoherenceConstraint cons, TermIdList termIdsAll) {
offsets = new int[cons.size()];
present = new BitSet(cons.size());
mandatory = cons.mandatory();
size = cons.size();
public PhraseConstraintGroup(List<String> terms, TermIdList termIdsAll) {
offsets = new int[terms.size()];
present = new BitSet(terms.size());
size = terms.size();
termIdsMask = new BitSet(termIdsAll.size());
int i = 0;
for (String term : cons.terms()) {
if (!term.isEmpty()) {
present.set(i);
long termId = SearchTermsUtil.getWordId(term);
offsets[i++] = termIdsAll.indexOf(termId);
for (String term : terms) {
if (term.isEmpty()) {
continue;
}
present.set(i);
long termId = SearchTermsUtil.getWordId(term);
int idx = termIdsAll.indexOf(termId);
if (idx < 0) {
offsets[i++] = -1;
}
else {
offsets[i++] = idx;
termIdsMask.set(idx);
}
}
}
/** Returns true if the term with index termIdx in the query is in the group */
public boolean containsTerm(int termIdx) {
return termIdsMask.get(termIdx);
}
public boolean test(CodedSequence[] positions) {
IntIterator[] sequences = new IntIterator[present.cardinality()];

View File

@ -7,14 +7,14 @@ public class QuerySearchTerms {
private final TObjectLongHashMap<String> termToId;
public final TermIdList termIdsAll;
public final TermCoherenceGroupList coherences;
public final PhraseConstraintGroupList phraseConstraints;
public QuerySearchTerms(TObjectLongHashMap<String> termToId,
TermIdList termIdsAll,
TermCoherenceGroupList coherences) {
PhraseConstraintGroupList phraseConstraints) {
this.termToId = termToId;
this.termIdsAll = termIdsAll;
this.coherences = coherences;
this.phraseConstraints = phraseConstraints;
}
public long getIdForTerm(String searchTerm) {

View File

@ -4,7 +4,7 @@ import com.google.inject.Guice;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -175,7 +175,7 @@ public class IndexQueryServiceIntegrationTest {
List.of(),
List.of(),
List.of(),
List.of(SearchCoherenceConstraint.mandatory(List.of("missing", "hello")))
List.of(SearchPhraseConstraint.mandatory(List.of("missing", "hello")))
)));
executeSearch(queryMissingCoherence)
@ -443,7 +443,7 @@ public class IndexQueryServiceIntegrationTest {
List.of(),
List.of(),
List.of(),
List.of(SearchCoherenceConstraint.mandatory(List.of(includes)))
List.of(SearchPhraseConstraint.mandatory(List.of(includes)))
);
}
private MockDataDocument d(int domainId, int ordinal) {

View File

@ -79,9 +79,9 @@
<tr> <th title="terms that must be absent">Search Terms Exclude</th><td>{{#each specs.query.searchTermsExclude}} {{.}} {{/each}}</td> </tr>
<tr> <th title="mandatory terms, no effect on ranking">Search Terms Advice</th><td>{{#each specs.query.searchTermsAdvice}} {{.}} {{/each}}</td> </tr>
<tr> <th title="not mandatory, effects ranking">Search Terms Priority</th><td>{{#each specs.query.searchTermsPriority}} {{.}} {{/each}}</td> </tr>
{{#each specs.query.searchTermCoherences}}
{{#each specs.query.phraseConstraints}}
<tr>
<th title="terms must appear close by">Coherence Requirement</th>
<th title="terms must appear close by">Phrase Constraints</th>
<td>
{{#each .}}
{{.}}