(index) Partial re-implementation of position constraints

This commit is contained in:
Viktor Lofgren 2024-06-24 15:55:54 +02:00
parent 5461634616
commit 9d00243d7f
16 changed files with 173 additions and 61 deletions

View File

@ -1,5 +1,6 @@
package nu.marginalia.api.searchquery;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -48,11 +49,19 @@ public class IndexProtobufCodec {
}
public static SearchQuery convertRpcQuery(RpcQuery query) {
List<List<String>> coherences = new ArrayList<>();
List<SearchCoherenceConstraint> coherences = new ArrayList<>();
for (int j = 0; j < query.getCoherencesCount(); j++) {
var coh = query.getCoherences(j);
coherences.add(new ArrayList<>(coh.getCoherencesList()));
if (coh.getType() == RpcCoherences.TYPE.OPTIONAL) {
coherences.add(new SearchCoherenceConstraint(false, List.copyOf(coh.getCoherencesList())));
}
else if (coh.getType() == RpcCoherences.TYPE.MANDATORY) {
coherences.add(new SearchCoherenceConstraint(true, List.copyOf(coh.getCoherencesList())));
}
else {
throw new IllegalArgumentException("Unknown coherence type: " + coh.getType());
}
}
return new SearchQuery(
@ -75,7 +84,9 @@ public class IndexProtobufCodec {
.addAllPriority(searchQuery.getSearchTermsPriority());
for (var coherences : searchQuery.searchTermCoherences) {
subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences);
subqueryBuilder.addCoherencesBuilder()
.addAllCoherences(coherences.terms())
.setType(coherences.mandatory() ? RpcCoherences.TYPE.MANDATORY : RpcCoherences.TYPE.OPTIONAL);
}
return subqueryBuilder.build();

View File

@ -0,0 +1,23 @@
package nu.marginalia.api.searchquery.model.query;
import java.util.List;
/**
 * A group of search terms that are tested together as a coherence (position) constraint.
 *
 * @param mandatory whether the constraint is flagged as mandatory ({@code true}) or
 *                  optional ({@code false})
 * @param terms     the terms making up the group, in order; always stored as an
 *                  unmodifiable snapshot
 */
public record SearchCoherenceConstraint(boolean mandatory, List<String> terms) {

    /**
     * Takes an unmodifiable snapshot of the term list so that later changes to the
     * caller's list cannot alter this constraint, regardless of whether the instance was
     * created through a factory method or directly through the constructor.
     *
     * @throws NullPointerException if {@code terms} is null or contains a null element
     */
    public SearchCoherenceConstraint {
        terms = List.copyOf(terms);
    }

    /** Creates a mandatory constraint over the given terms. */
    public static SearchCoherenceConstraint mandatory(String... terms) {
        return new SearchCoherenceConstraint(true, List.of(terms));
    }

    /** Creates a mandatory constraint over a snapshot of the given terms. */
    public static SearchCoherenceConstraint mandatory(List<String> terms) {
        return new SearchCoherenceConstraint(true, terms);
    }

    /** Creates an optional constraint over the given terms. */
    public static SearchCoherenceConstraint optional(String... terms) {
        return new SearchCoherenceConstraint(false, List.of(terms));
    }

    /** Creates an optional constraint over a snapshot of the given terms. */
    public static SearchCoherenceConstraint optional(List<String> terms) {
        return new SearchCoherenceConstraint(false, terms);
    }

    /** Returns the number of terms in the group. */
    public int size() {
        return terms.size();
    }
}

View File

@ -31,7 +31,7 @@ public class SearchQuery {
public final List<String> searchTermsPriority;
/** Terms that we require to be in the same sentence */
public final List<List<String>> searchTermCoherences;
public final List<SearchCoherenceConstraint> searchTermCoherences;
@Deprecated // why does this exist?
private double value = 0;
@ -54,7 +54,7 @@ public class SearchQuery {
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
List<String> searchTermsPriority,
List<List<String>> searchTermCoherences) {
List<SearchCoherenceConstraint> searchTermCoherences) {
this.compiledQuery = compiledQuery;
this.searchTermsInclude = searchTermsInclude;
this.searchTermsExclude = searchTermsExclude;
@ -80,7 +80,7 @@ public class SearchQuery {
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString();
}
@ -91,7 +91,7 @@ public class SearchQuery {
private List<String> searchTermsExclude = new ArrayList<>();
private List<String> searchTermsAdvice = new ArrayList<>();
private List<String> searchTermsPriority = new ArrayList<>();
private List<List<String>> searchTermCoherences = new ArrayList<>();
private List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
private SearchQueryBuilder(String compiledQuery) {
this.compiledQuery = compiledQuery;
@ -117,8 +117,8 @@ public class SearchQuery {
return this;
}
public SearchQueryBuilder coherences(String... coherences) {
searchTermCoherences.add(List.of(coherences));
/**
 * Appends a coherence constraint to the query being built.
 *
 * @param constraint the constraint to add; it is stored as given
 * @return this builder, to allow call chaining
 */
public SearchQueryBuilder coherences(SearchCoherenceConstraint constraint) {
searchTermCoherences.add(constraint);
return this;
}

View File

@ -184,4 +184,10 @@ message RpcQuery {
/* Defines a group of search terms that are expected to exist in close proximity within the document.
 * The `type` field says whether the group is flagged MANDATORY or OPTIONAL. */
message RpcCoherences {
repeated string coherences = 1; // the terms that make up the group
TYPE type = 2; // when the sender does not set this, the zero enum value (OPTIONAL) is what is read
enum TYPE {
OPTIONAL = 0; // NOTE(review): presumably a ranking hint only; confirm against the index service
MANDATORY = 1; // NOTE(review): presumably documents failing the group are rejected; confirm against the index service
};
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.index.client;
import nu.marginalia.api.searchquery.IndexProtobufCodec;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
@ -41,7 +42,9 @@ class IndexProtobufCodecTest {
List.of("c", "d"),
List.of("e", "f"),
List.of("g", "h"),
List.of(List.of("i", "j"), List.of("k"))
List.of(
new SearchCoherenceConstraint(true, List.of("i", "j")),
new SearchCoherenceConstraint(false, List.of("k")))
),
s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s))
);

View File

@ -2,16 +2,13 @@ package nu.marginalia.functions.searchquery.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.*;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.functions.searchquery.query_parser.QueryExpansion;
import nu.marginalia.functions.searchquery.query_parser.token.QueryToken;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.language.WordPatterns;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.functions.searchquery.query_parser.QueryParser;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
@ -60,7 +57,7 @@ public class QueryFactory {
List<String> searchTermsInclude = new ArrayList<>();
List<String> searchTermsAdvice = new ArrayList<>();
List<String> searchTermsPriority = new ArrayList<>();
List<List<String>> searchTermCoherences = new ArrayList<>();
List<SearchCoherenceConstraint> searchTermCoherences = new ArrayList<>();
SpecificationLimit qualityLimit = SpecificationLimit.none();
SpecificationLimit year = SpecificationLimit.none();
@ -88,7 +85,7 @@ public class QueryFactory {
searchTermsAdvice.add(str);
// Require that the terms appear in the same sentence
searchTermCoherences.add(Arrays.asList(parts));
searchTermCoherences.add(SearchCoherenceConstraint.mandatory(parts));
// Require that each term exists in the document
// (needed for ranking)
@ -140,7 +137,12 @@ public class QueryFactory {
}
var expansion = queryExpansion.expandQuery(searchTermsInclude);
searchTermCoherences.addAll(expansion.extraCoherences());
// Query expansion may produce suggestions for coherence constraints,
// add these to the query
for (var coh : expansion.extraCoherences()) {
searchTermCoherences.add(SearchCoherenceConstraint.optional(coh));
}
var searchQuery = new SearchQuery(
expansion.compiledQuery(),

View File

@ -23,6 +23,7 @@ public class IndexClient {
private static final Logger logger = LoggerFactory.getLogger(IndexClient.class);
private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool;
private static final ExecutorService executor = Executors.newFixedThreadPool(32);
@Inject
public IndexClient(GrpcChannelPoolFactory channelPoolFactory) {
this.channelPool = channelPoolFactory.createMulti(

View File

@ -162,6 +162,8 @@ public class ReverseIndexReader {
var offsets = reader.queryData(docIds, 1);
for (int i = 0; i < docIds.length; i++) {
if (offsets[i] == 0)
continue;
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
}
return ret;

View File

@ -1,5 +1,8 @@
package nu.marginalia.index.positions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
@ -8,6 +11,7 @@ import java.nio.file.StandardOpenOption;
public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;
private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class);
public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);

View File

@ -71,6 +71,8 @@ public class SearchParameters {
this.budget = new IndexSearchBudget(limits.timeoutMs() / 2);
this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery());
System.out.println(query);
this.limitByDomain = limits.resultsByDomain();
this.limitTotal = limits.resultsTotal();

View File

@ -15,7 +15,9 @@ public final class SearchTerms {
private final LongList advice;
private final LongList excludes;
private final LongList priority;
private final List<LongList> coherences;
private final List<LongList> coherencesMandatory;
private final List<LongList> coherencesOptional;
private final CompiledQueryLong compiledQueryIds;
@ -24,7 +26,10 @@ public final class SearchTerms {
{
this.excludes = new LongArrayList();
this.priority = new LongArrayList();
this.coherences = new ArrayList<>();
this.coherencesMandatory = new ArrayList<>();
this.coherencesOptional = new ArrayList<>();
this.advice = new LongArrayList();
this.compiledQueryIds = compiledQueryIds;
@ -35,11 +40,16 @@ public final class SearchTerms {
for (var coherence : query.searchTermCoherences) {
LongList parts = new LongArrayList(coherence.size());
for (var word : coherence) {
for (var word : coherence.terms()) {
parts.add(getWordId(word));
}
coherences.add(parts);
if (coherence.mandatory()) {
coherencesMandatory.add(parts);
}
else {
coherencesOptional.add(parts);
}
}
for (var word : query.searchTermsExclude) {
@ -72,10 +82,12 @@ public final class SearchTerms {
return priority;
}
public List<LongList> coherences() {
return coherences;
/**
 * Returns the word-id groups built from the query's coherence constraints that were
 * flagged as mandatory. The returned list is the internal list itself, not a copy.
 */
public List<LongList> coherencesMandatory() {
return coherencesMandatory;
}
/**
 * Returns the word-id groups built from the query's coherence constraints that were
 * not flagged as mandatory. The returned list is the internal list itself, not a copy.
 */
public List<LongList> coherencesOptional() {
return coherencesOptional;
}
public CompiledQueryLong compiledQuery() { return compiledQueryIds; }
}

View File

@ -15,6 +15,7 @@ import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.results.model.ids.TermIdList;
import java.lang.foreign.Arena;
import java.util.ArrayList;
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
@ -77,12 +78,15 @@ public class IndexMetadataService {
}
}
var constraints = new ArrayList<TermCoherenceGroup>();
for (var coherence : searchQuery.searchTermCoherences) {
constraints.add(new TermCoherenceGroup(coherence, termIdsList));
}
return new QuerySearchTerms(termToId,
new TermIdList(termIdsList),
new TermIdList(termIdsPrio),
new TermCoherenceGroupList(
searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList()
)
new TermCoherenceGroupList(constraints)
);
}

View File

@ -50,11 +50,8 @@ public class IndexResultValuationContext {
long[] wordFlags,
GammaCodedSequence[] positions)
{
// FIXME: Reconsider coherence logic with the new position data
// if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
// return null;
if (!searchTerms.coherences.testMandatory(positions))
return null;
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
@ -72,7 +69,6 @@ public class IndexResultValuationContext {
return null;
}
long docId = UrlIdCodec.removeRank(combinedId);
long docMetadata = index.getDocumentMetadata(docId);
int htmlFeatures = index.getHtmlFeatures(docId);

View File

@ -1,23 +1,36 @@
package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;
import java.util.Collections;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
/**
* wordIds that we require to be in the same sentence
*/
public record TermCoherenceGroupList(List<TermCoherenceGroup> words) {
public class TermCoherenceGroupList {
List<TermCoherenceGroup> mandatoryGroups = new ArrayList<>();
List<TermCoherenceGroup> optionalGroups = new ArrayList<>();
public TermCoherenceGroupList(List<TermCoherenceGroup> words) {
this.words = Collections.unmodifiableList(words);
/**
 * Sorts the supplied groups into the mandatory and optional buckets according to
 * each group's {@code mandatory} flag, preserving their relative order.
 *
 * @param groups the coherence groups to partition
 */
public TermCoherenceGroupList(List<TermCoherenceGroup> groups) {
    for (TermCoherenceGroup candidate : groups) {
        List<TermCoherenceGroup> bucket = candidate.mandatory ? mandatoryGroups : optionalGroups;
        bucket.add(candidate);
    }
}
public boolean test(TermMetadataForCombinedDocumentIds documents, long combinedId) {
for (var coherenceSet : words()) {
if (!coherenceSet.test(documents, combinedId)) {
public boolean testMandatory(GammaCodedSequence[] positions) {
for (var coherenceSet : mandatoryGroups) {
if (!coherenceSet.test(positions)) {
return false;
}
}
@ -25,30 +38,59 @@ public record TermCoherenceGroupList(List<TermCoherenceGroup> words) {
return true;
}
/**
 * Evaluates the optional coherence groups against a document's term positions.
 *
 * @param positions the position sequences that each group's {@code test} is run against
 * @return the size of the largest optional group whose test passes, or 0 if none pass
 */
public int testOptional(GammaCodedSequence[] positions) {
    int best = 0;
    // Only the optional groups are scored here; the mandatory ones are handled
    // by testMandatory().
    for (var coherenceSet : optionalGroups) {
        if (coherenceSet.test(positions)) {
            best = Math.max(coherenceSet.size, best);
        }
    }
    return best;
}
public static final class TermCoherenceGroup {
private final long[] words;
private final int[] offsets;
private final BitSet present;
public TermCoherenceGroup(long[] words) {
this.words = words;
}
public final int size;
public final boolean mandatory;
public TermCoherenceGroup(SearchCoherenceConstraint cons, LongList termIdsAll) {
offsets = new int[cons.size()];
present = new BitSet(cons.size());
mandatory = cons.mandatory();
size = cons.size();
public TermCoherenceGroup(List<String> coh) {
this(coh.stream().mapToLong(SearchTermsUtil::getWordId).toArray());
}
public boolean test(TermMetadataForCombinedDocumentIds documents, long combinedId) {
long overlap = 0xFF_FFFF_FFFF_FFFFL;
for (var word : words) {
long meta = documents.getTermMetadata(word, combinedId);
// if the word is not present in the document, we omit it from the coherence check
if (meta != 0L) {
overlap &= meta;
int i = 0;
for (String term : cons.terms()) {
if (!term.isEmpty()) {
present.set(i);
long termId = SearchTermsUtil.getWordId(term);
offsets[i++] = termIdsAll.indexOf(termId);
}
}
}
return WordMetadata.decodePositions(overlap) != 0L;
/**
 * Tests this group against a document's term positions.
 *
 * @param positions position sequences for the document; presumably indexed the same way
 *                  as the term id list given to the constructor (TODO confirm with caller)
 * @return the result of intersecting the group's offset-adjusted position sequences;
 *         {@code false} if any term of the group was not found among the query's term ids
 *         or has no position sequence
 */
public boolean test(GammaCodedSequence[] positions) {
    IntIterator[] sequences = new IntIterator[present.cardinality()];

    for (int oi = 0, si = 0; oi < offsets.length; oi++) {
        if (!present.get(oi)) {
            continue;
        }

        // offsets[oi] is the index of this term in the query's term id list,
        // or negative when the term was not found there
        int offset = offsets[oi];
        if (offset < 0)
            return false;

        // Look the sequence up by the term's index in the term list, not by its
        // index within this group
        GammaCodedSequence termPositions = positions[offset];
        if (termPositions == null) {
            // NOTE(review): assumes a missing sequence means the term has no position
            // data for this document, so the group cannot match; confirm
            return false;
        }

        // Create iterators that are offset by their relative position in the
        // sequence. This is done by subtracting the index from the offset,
        // so that when we intersect them, an overlap means that the terms are
        // in the correct order. Note the offset is negative!
        sequences[si++] = termPositions.offsetIterator(-oi);
    }

    return SequenceOperations.intersectSequences(sequences);
}
}
}

View File

@ -3,6 +3,7 @@ package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -174,7 +175,7 @@ public class IndexQueryServiceIntegrationTest {
List.of(),
List.of(),
List.of(),
List.of(List.of("missing", "hello"))
List.of(SearchCoherenceConstraint.mandatory(List.of("missing", "hello")))
)));
executeSearch(queryMissingCoherence)
@ -466,7 +467,7 @@ public class IndexQueryServiceIntegrationTest {
List.of(),
List.of(),
List.of(),
List.of(List.of(includes))
List.of(SearchCoherenceConstraint.mandatory(List.of(includes)))
);
}
private MockDataDocument d(int domainId, int ordinal) {

View File

@ -133,6 +133,9 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
/** Return the number of items in the sequence */
public int valueCount() {
if (startPos == startLimit)
return 0;
// if the first byte is zero, the sequence is empty and we can skip decoding
if (0 == raw.get(startPos))
return 0;