mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
(index) Clean up index code
This commit is contained in:
parent
9415539b38
commit
9a045a0588
@ -6,12 +6,13 @@ import io.grpc.stub.StreamObserver;
|
|||||||
import io.prometheus.client.Counter;
|
import io.prometheus.client.Counter;
|
||||||
import io.prometheus.client.Gauge;
|
import io.prometheus.client.Gauge;
|
||||||
import io.prometheus.client.Histogram;
|
import io.prometheus.client.Histogram;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.api.searchquery.*;
|
import nu.marginalia.api.searchquery.*;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
||||||
import nu.marginalia.api.searchquery.model.results.*;
|
import nu.marginalia.api.searchquery.model.results.*;
|
||||||
import nu.marginalia.index.index.IndexQueryService;
|
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
import nu.marginalia.index.model.SearchTerms;
|
import nu.marginalia.index.model.SearchTerms;
|
||||||
@ -79,7 +80,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
private final StatefulIndex index;
|
private final StatefulIndex index;
|
||||||
private final SearchSetsService searchSetsService;
|
private final SearchSetsService searchSetsService;
|
||||||
|
|
||||||
private final IndexQueryService indexQueryService;
|
|
||||||
private final IndexResultValuatorService resultValuator;
|
private final IndexResultValuatorService resultValuator;
|
||||||
|
|
||||||
private final String nodeName;
|
private final String nodeName;
|
||||||
@ -90,7 +90,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
|
public IndexGrpcService(ServiceConfiguration serviceConfiguration,
|
||||||
StatefulIndex index,
|
StatefulIndex index,
|
||||||
SearchSetsService searchSetsService,
|
SearchSetsService searchSetsService,
|
||||||
IndexQueryService indexQueryService,
|
|
||||||
IndexResultValuatorService resultValuator)
|
IndexResultValuatorService resultValuator)
|
||||||
{
|
{
|
||||||
var nodeId = serviceConfiguration.node();
|
var nodeId = serviceConfiguration.node();
|
||||||
@ -98,7 +97,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
this.index = index;
|
this.index = index;
|
||||||
this.searchSetsService = searchSetsService;
|
this.searchSetsService = searchSetsService;
|
||||||
this.resultValuator = resultValuator;
|
this.resultValuator = resultValuator;
|
||||||
this.indexQueryService = indexQueryService;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// GRPC endpoint
|
// GRPC endpoint
|
||||||
@ -222,13 +220,14 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/** This class is responsible for executing a search query. It uses a thread pool to
|
/** This class is responsible for executing a search query. It uses a thread pool to
|
||||||
* execute the subqueries in parallel, and then uses another thread pool to rank the
|
* execute the subqueries and their valuation in parallel. The results are then combined
|
||||||
* results in parallel. The results are then combined into a bounded priority queue,
|
* into a bounded priority queue, and finally the best results are returned.
|
||||||
* and finally the best results are returned.
|
|
||||||
*/
|
*/
|
||||||
private class QueryExecution {
|
private class QueryExecution {
|
||||||
private static final Executor workerPool = Executors.newWorkStealingPool(indexValuationThreads*4);
|
private static final Executor workerPool = Executors.newWorkStealingPool(indexValuationThreads*4);
|
||||||
|
|
||||||
|
/** The queue where the results from the index lookup threads are placed,
|
||||||
|
* pending ranking by the result ranker threads */
|
||||||
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
|
private final ArrayBlockingQueue<CombinedDocIdList> resultCandidateQueue
|
||||||
= new ArrayBlockingQueue<>(8);
|
= new ArrayBlockingQueue<>(8);
|
||||||
|
|
||||||
@ -306,11 +305,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
|
|
||||||
public void run() {
|
public void run() {
|
||||||
try {
|
try {
|
||||||
indexQueryService.evaluateSubquery(
|
executeSearch();
|
||||||
query,
|
|
||||||
budget,
|
|
||||||
this::drain
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
synchronized (remainingIndexTasks) {
|
synchronized (remainingIndexTasks) {
|
||||||
@ -321,7 +316,31 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void drain(CombinedDocIdList resultIds) {
|
private void executeSearch() {
|
||||||
|
final LongArrayList results = new LongArrayList(512);
|
||||||
|
|
||||||
|
// These queries are different indices for one subquery
|
||||||
|
final LongQueryBuffer buffer = new LongQueryBuffer(512);
|
||||||
|
|
||||||
|
while (query.hasMore() && budget.hasTimeLeft())
|
||||||
|
{
|
||||||
|
buffer.reset();
|
||||||
|
query.getMoreResults(buffer);
|
||||||
|
|
||||||
|
results.addElements(0, buffer.data, 0, buffer.end);
|
||||||
|
|
||||||
|
if (results.size() < 512) {
|
||||||
|
enqueueResults(new CombinedDocIdList(results));
|
||||||
|
results.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!results.isEmpty()) {
|
||||||
|
enqueueResults(new CombinedDocIdList(results));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void enqueueResults(CombinedDocIdList resultIds) {
|
||||||
long remainingTime = budget.timeLeft();
|
long remainingTime = budget.timeLeft();
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@ -353,30 +372,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
|
|
||||||
public void run() {
|
public void run() {
|
||||||
try {
|
try {
|
||||||
while (parameters.budget.timeLeft() > 0) {
|
while (parameters.budget.timeLeft() > 0 && execute());
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
|
|
||||||
CombinedDocIdList resultIds = resultCandidateQueue.poll(
|
|
||||||
Math.clamp(parameters.budget.timeLeft(), 1, 5),
|
|
||||||
TimeUnit.MILLISECONDS);
|
|
||||||
|
|
||||||
if (resultIds == null) {
|
|
||||||
if (remainingIndexTasks.get() == 0
|
|
||||||
&& resultCandidateQueue.isEmpty())
|
|
||||||
break;
|
|
||||||
else
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
|
catch (InterruptedException e) {
|
||||||
stallTime.addAndGet(System.currentTimeMillis() - start);
|
|
||||||
|
|
||||||
var bestResults = resultValuator.rankResults(parameters, rankingContext, resultIds);
|
|
||||||
|
|
||||||
resultHeap.addAll(bestResults);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception e) {
|
|
||||||
logger.warn("Interrupted while waiting to poll resultIds from queue", e);
|
logger.warn("Interrupted while waiting to poll resultIds from queue", e);
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
@ -386,6 +384,31 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean execute() throws InterruptedException {
|
||||||
|
long start = System.currentTimeMillis();
|
||||||
|
|
||||||
|
// Do a relatively short poll to ensure we terminate in a timely manner
|
||||||
|
// in the event all work is done
|
||||||
|
final long pollTime = Math.clamp(parameters.budget.timeLeft(), 1, 5);
|
||||||
|
CombinedDocIdList resultIds = resultCandidateQueue.poll(pollTime, TimeUnit.MILLISECONDS);
|
||||||
|
|
||||||
|
if (resultIds == null) {
|
||||||
|
// check if we are done and can terminate
|
||||||
|
if (remainingIndexTasks.get() == 0 && resultCandidateQueue.isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
stallTime.addAndGet(System.currentTimeMillis() - start);
|
||||||
|
|
||||||
|
resultHeap.addAll(
|
||||||
|
resultValuator.rankResults(parameters, rankingContext, resultIds)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true; // keep going
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,18 @@ import org.jetbrains.annotations.NotNull;
|
|||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
/** A priority queue for search results. This class is not thread-safe,
|
||||||
|
* in general, except for concurrent use of the addAll method.
|
||||||
|
* <p></p>
|
||||||
|
* The class implements a subset of the Collection interface, and
|
||||||
|
* is intended to be used as a priority queue for search results,
|
||||||
|
* with a maximum size.
|
||||||
|
* <p></p>
|
||||||
|
* Since the expected use case is to add a large number of items
|
||||||
|
* and then iterate over the items, the class is optimized for
|
||||||
|
* this scenario, and does not implement other mutating methods
|
||||||
|
* than addAll().
|
||||||
|
*/
|
||||||
public class ResultPriorityQueue implements Iterable<SearchResultItem>,
|
public class ResultPriorityQueue implements Iterable<SearchResultItem>,
|
||||||
Collection<SearchResultItem> {
|
Collection<SearchResultItem> {
|
||||||
private final int limit;
|
private final int limit;
|
||||||
@ -34,16 +46,12 @@ public class ResultPriorityQueue implements Iterable<SearchResultItem>,
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean add(SearchResultItem searchResultItem) {
|
public boolean add(SearchResultItem searchResultItem) {
|
||||||
throw new UnsupportedOperationException("Use addAll instead ya dingus");
|
throw new UnsupportedOperationException("Use addAll instead");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean remove(Object o) {
|
public boolean remove(Object o) {
|
||||||
if (o instanceof SearchResultItem sri) {
|
throw new UnsupportedOperationException();
|
||||||
idsInSet.remove(sri.getDocumentId());
|
|
||||||
return idsInSet.remove(sri.getDocumentId());
|
|
||||||
}
|
|
||||||
throw new IllegalArgumentException("Object is not a SearchResultItem");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -77,17 +85,18 @@ public class ResultPriorityQueue implements Iterable<SearchResultItem>,
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean removeAll(@NotNull Collection<?> c) {
|
public boolean removeAll(@NotNull Collection<?> c) {
|
||||||
return backingList.removeAll(c);
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean retainAll(@NotNull Collection<?> c) {
|
public boolean retainAll(@NotNull Collection<?> c) {
|
||||||
return backingList.retainAll(c);
|
throw new UnsupportedOperationException();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void clear() {
|
public void clear() {
|
||||||
|
backingList.clear();
|
||||||
|
idsInSet.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
public int size() {
|
public int size() {
|
||||||
|
@ -5,7 +5,6 @@ import nu.marginalia.index.forward.ForwardIndexReader;
|
|||||||
import nu.marginalia.index.model.QueryParams;
|
import nu.marginalia.index.model.QueryParams;
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
import nu.marginalia.index.query.IndexQuery;
|
||||||
import nu.marginalia.index.query.IndexQueryBuilder;
|
import nu.marginalia.index.query.IndexQueryBuilder;
|
||||||
import nu.marginalia.index.query.IndexQueryPriority;
|
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
@ -16,7 +15,6 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
/** A reader for the combined forward and reverse indexes */
|
/** A reader for the combined forward and reverse indexes */
|
||||||
@ -42,22 +40,15 @@ public class CombinedIndexReader {
|
|||||||
|
|
||||||
|
|
||||||
/** Creates a query builder for terms in the priority index */
|
/** Creates a query builder for terms in the priority index */
|
||||||
public IndexQueryBuilder findPriorityWord(IndexQueryPriority priority,
|
public IndexQueryBuilder findPriorityWord(long wordId) {
|
||||||
long wordId,
|
return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId)))
|
||||||
int fetchSizeMultiplier) {
|
|
||||||
return newQueryBuilder(new IndexQuery(
|
|
||||||
List.of(reverseIndexPriorityReader.documents(wordId)),
|
|
||||||
priority,
|
|
||||||
fetchSizeMultiplier))
|
|
||||||
.withSourceTerms(wordId);
|
.withSourceTerms(wordId);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Creates a query builder for terms in the full index */
|
/** Creates a query builder for terms in the full index */
|
||||||
public IndexQueryBuilder findFullWord(IndexQueryPriority priority, long wordId, int fetchSizeMultiplier) {
|
public IndexQueryBuilder findFullWord(long wordId) {
|
||||||
return newQueryBuilder(
|
return newQueryBuilder(
|
||||||
new IndexQuery(List.of(reverseIndexFullReader.documents(wordId)),
|
new IndexQuery(reverseIndexFullReader.documents(wordId)))
|
||||||
priority,
|
|
||||||
fetchSizeMultiplier))
|
|
||||||
.withSourceTerms(wordId);
|
.withSourceTerms(wordId);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,64 +0,0 @@
|
|||||||
package nu.marginalia.index.index;
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
|
||||||
import com.google.inject.Singleton;
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
|
||||||
import nu.marginalia.index.model.QueryParams;
|
|
||||||
import nu.marginalia.index.model.SearchTerms;
|
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
|
||||||
import nu.marginalia.index.query.IndexSearchBudget;
|
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
|
||||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import org.slf4j.Marker;
|
|
||||||
import org.slf4j.MarkerFactory;
|
|
||||||
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
|
|
||||||
@Singleton
|
|
||||||
public class IndexQueryService {
|
|
||||||
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
|
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(IndexQueryService.class);
|
|
||||||
private final StatefulIndex index;
|
|
||||||
|
|
||||||
@Inject
|
|
||||||
public IndexQueryService(StatefulIndex index) {
|
|
||||||
this.index = index;
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Execute subqueries and return a list of document ids. The index is queried for each subquery,
|
|
||||||
* at different priority depths until timeout is reached or the results are all visited.
|
|
||||||
* Then the results are combined.
|
|
||||||
* */
|
|
||||||
public void evaluateSubquery(IndexQuery query,
|
|
||||||
IndexSearchBudget timeout,
|
|
||||||
Consumer<CombinedDocIdList> drain)
|
|
||||||
{
|
|
||||||
final LongArrayList results = new LongArrayList(512);
|
|
||||||
|
|
||||||
// These queries are different indices for one subquery
|
|
||||||
final LongQueryBuffer buffer = new LongQueryBuffer(512);
|
|
||||||
|
|
||||||
while (query.hasMore() && timeout.hasTimeLeft())
|
|
||||||
{
|
|
||||||
buffer.reset();
|
|
||||||
query.getMoreResults(buffer);
|
|
||||||
|
|
||||||
results.addElements(0, buffer.data, 0, buffer.end);
|
|
||||||
|
|
||||||
if (results.size() < 512) {
|
|
||||||
drain.accept(new CombinedDocIdList(results));
|
|
||||||
results.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!results.isEmpty()) {
|
|
||||||
drain.accept(new CombinedDocIdList(results));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -8,8 +8,6 @@ import nu.marginalia.index.model.QueryParams;
|
|||||||
import nu.marginalia.index.IndexFactory;
|
import nu.marginalia.index.IndexFactory;
|
||||||
import nu.marginalia.index.model.SearchTerms;
|
import nu.marginalia.index.model.SearchTerms;
|
||||||
import nu.marginalia.index.query.*;
|
import nu.marginalia.index.query.*;
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepFromPredicate;
|
|
||||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
|
||||||
import nu.marginalia.service.control.ServiceEventLog;
|
import nu.marginalia.service.control.ServiceEventLog;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
@ -22,7 +20,6 @@ import java.util.List;
|
|||||||
import java.util.concurrent.locks.Lock;
|
import java.util.concurrent.locks.Lock;
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
import java.util.concurrent.locks.ReadWriteLock;
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||||
import java.util.function.LongPredicate;
|
|
||||||
|
|
||||||
/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
|
/** This class delegates SearchIndexReader and deals with the stateful nature of the index,
|
||||||
* i.e. it may be possible to reconstruct the index and load a new set of data.
|
* i.e. it may be possible to reconstruct the index and load a new set of data.
|
||||||
@ -122,19 +119,13 @@ public class StatefulIndex {
|
|||||||
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
|
||||||
List<IndexQuery> queries = new ArrayList<>(10);
|
List<IndexQuery> queries = new ArrayList<>(10);
|
||||||
|
|
||||||
// Fetch more results than specified for short queries, as the query itself is cheap and the
|
// To ensure that good results are discovered, create separate query heads for the priority index that
|
||||||
// priority index may contain a considerable amount of less interesting results
|
// filter for documents that contain pairs of two search terms
|
||||||
final int fetchSizeMultiplier;
|
|
||||||
if (orderedIncludes.length == 1) fetchSizeMultiplier = 4;
|
|
||||||
else fetchSizeMultiplier = 1;
|
|
||||||
|
|
||||||
// To ensure that good results are processed first, create query heads for the priority index that filter for terms
|
|
||||||
// that contain pairs of two search terms
|
|
||||||
if (orderedIncludesPrio.length > 1) {
|
if (orderedIncludesPrio.length > 1) {
|
||||||
for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) {
|
for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) {
|
||||||
for (int j = i + 1; j < orderedIncludesPrio.length; j++) {
|
for (int j = i + 1; j < orderedIncludesPrio.length; j++) {
|
||||||
var entrySource = combinedIndexReader
|
var entrySource = combinedIndexReader
|
||||||
.findPriorityWord(IndexQueryPriority.BEST, orderedIncludesPrio[i], fetchSizeMultiplier)
|
.findPriorityWord(orderedIncludesPrio[i])
|
||||||
.alsoPrio(orderedIncludesPrio[j]);
|
.alsoPrio(orderedIncludesPrio[j]);
|
||||||
queryHeads.add(entrySource);
|
queryHeads.add(entrySource);
|
||||||
}
|
}
|
||||||
@ -143,18 +134,20 @@ public class StatefulIndex {
|
|||||||
|
|
||||||
// Next consider entries that appear only once in the priority index
|
// Next consider entries that appear only once in the priority index
|
||||||
for (var wordId : orderedIncludesPrio) {
|
for (var wordId : orderedIncludesPrio) {
|
||||||
queryHeads.add(combinedIndexReader.findPriorityWord(IndexQueryPriority.GOOD, wordId, fetchSizeMultiplier));
|
queryHeads.add(combinedIndexReader.findPriorityWord(wordId));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Finally consider terms in the full index, but only do this for sufficiently long queries
|
// Finally consider terms in the full index
|
||||||
// as short queries tend to be too underspecified to produce anything other than CPU warmth
|
queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0]));
|
||||||
queryHeads.add(combinedIndexReader.findFullWord(IndexQueryPriority.FALLBACK, orderedIncludes[0], fetchSizeMultiplier));
|
|
||||||
|
|
||||||
for (var query : queryHeads) {
|
for (var query : queryHeads) {
|
||||||
if (query == null) {
|
if (query == null) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Note that we can add all includes as filters, even though
|
||||||
|
// they may not be present in the query head, as the query builder
|
||||||
|
// will ignore redundant include filters:
|
||||||
for (long orderedInclude : orderedIncludes) {
|
for (long orderedInclude : orderedIncludes) {
|
||||||
query = query.alsoFull(orderedInclude);
|
query = query.alsoFull(orderedInclude);
|
||||||
}
|
}
|
||||||
@ -163,7 +156,7 @@ public class StatefulIndex {
|
|||||||
query = query.notFull(term);
|
query = query.notFull(term);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run these last, as they'll worst-case cause as many page faults as there are
|
// Run these filter steps last, as they'll worst-case cause as many page faults as there are
|
||||||
// items in the buffer
|
// items in the buffer
|
||||||
queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
|
queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build());
|
||||||
}
|
}
|
||||||
|
@ -4,8 +4,20 @@ import nu.marginalia.index.searchset.SearchSet;
|
|||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||||
|
|
||||||
/** QueryParams is a set of parameters for a query.
|
import java.util.Objects;
|
||||||
*
|
|
||||||
|
/**
|
||||||
|
* QueryParams is a set of parameters for a query.
|
||||||
|
*/
|
||||||
|
public final class QueryParams {
|
||||||
|
private final SpecificationLimit qualityLimit;
|
||||||
|
private final SpecificationLimit year;
|
||||||
|
private final SpecificationLimit size;
|
||||||
|
private final SpecificationLimit rank;
|
||||||
|
private final SearchSet searchSet;
|
||||||
|
private final QueryStrategy queryStrategy;
|
||||||
|
|
||||||
|
/**
|
||||||
* @param qualityLimit The quality limit.
|
* @param qualityLimit The quality limit.
|
||||||
* @param year The year limit.
|
* @param year The year limit.
|
||||||
* @param size The size limit. Eliminates results from domains that do not satisfy the size criteria.
|
* @param size The size limit. Eliminates results from domains that do not satisfy the size criteria.
|
||||||
@ -14,13 +26,73 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
|
|||||||
* @param queryStrategy The query strategy. May impose additional constraints on the query, such as requiring
|
* @param queryStrategy The query strategy. May impose additional constraints on the query, such as requiring
|
||||||
* the keywords to appear in the title, or in the domain.
|
* the keywords to appear in the title, or in the domain.
|
||||||
*/
|
*/
|
||||||
public record QueryParams(SpecificationLimit qualityLimit,
|
public QueryParams(SpecificationLimit qualityLimit,
|
||||||
SpecificationLimit year,
|
SpecificationLimit year,
|
||||||
SpecificationLimit size,
|
SpecificationLimit size,
|
||||||
SpecificationLimit rank,
|
SpecificationLimit rank,
|
||||||
SearchSet searchSet,
|
SearchSet searchSet,
|
||||||
QueryStrategy queryStrategy
|
QueryStrategy queryStrategy
|
||||||
)
|
) {
|
||||||
{
|
this.qualityLimit = qualityLimit;
|
||||||
|
this.year = year;
|
||||||
|
this.size = size;
|
||||||
|
this.rank = rank;
|
||||||
|
this.searchSet = searchSet;
|
||||||
|
this.queryStrategy = queryStrategy;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SpecificationLimit qualityLimit() {
|
||||||
|
return qualityLimit;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SpecificationLimit year() {
|
||||||
|
return year;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SpecificationLimit size() {
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SpecificationLimit rank() {
|
||||||
|
return rank;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SearchSet searchSet() {
|
||||||
|
return searchSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
public QueryStrategy queryStrategy() {
|
||||||
|
return queryStrategy;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (obj == this) return true;
|
||||||
|
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||||
|
var that = (QueryParams) obj;
|
||||||
|
return Objects.equals(this.qualityLimit, that.qualityLimit) &&
|
||||||
|
Objects.equals(this.year, that.year) &&
|
||||||
|
Objects.equals(this.size, that.size) &&
|
||||||
|
Objects.equals(this.rank, that.rank) &&
|
||||||
|
Objects.equals(this.searchSet, that.searchSet) &&
|
||||||
|
Objects.equals(this.queryStrategy, that.queryStrategy);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(qualityLimit, year, size, rank, searchSet, queryStrategy);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "QueryParams[" +
|
||||||
|
"qualityLimit=" + qualityLimit + ", " +
|
||||||
|
"year=" + year + ", " +
|
||||||
|
"size=" + size + ", " +
|
||||||
|
"rank=" + rank + ", " +
|
||||||
|
"searchSet=" + searchSet + ", " +
|
||||||
|
"queryStrategy=" + queryStrategy + ']';
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,10 @@
|
|||||||
package nu.marginalia.index.model;
|
package nu.marginalia.index.model;
|
||||||
|
|
||||||
import gnu.trove.set.hash.TLongHashSet;
|
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
||||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
|
||||||
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
import nu.marginalia.api.searchquery.IndexProtobufCodec;
|
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
|
||||||
import nu.marginalia.index.query.IndexQuery;
|
|
||||||
import nu.marginalia.index.query.IndexSearchBudget;
|
import nu.marginalia.index.query.IndexSearchBudget;
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.index.searchset.SearchSet;
|
import nu.marginalia.index.searchset.SearchSet;
|
||||||
@ -85,20 +82,8 @@ public class SearchParameters {
|
|||||||
rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
|
rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters());
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasTimeLeft() {
|
|
||||||
return budget.hasTimeLeft();
|
|
||||||
}
|
|
||||||
|
|
||||||
public long getDataCost() {
|
public long getDataCost() {
|
||||||
return dataCost;
|
return dataCost;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static class CachedObjects {
|
|
||||||
private static final ThreadLocal<TLongHashSet> consideredCache = ThreadLocal.withInitial(() -> new TLongHashSet(4096));
|
|
||||||
private static TLongHashSet getConsideredUrlsMap() {
|
|
||||||
var ret = consideredCache.get();
|
|
||||||
ret.clear();
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -8,16 +8,28 @@ import nu.marginalia.api.searchquery.model.query.SearchSubquery;
|
|||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
|
import static nu.marginalia.index.model.SearchTermsUtil.getWordId;
|
||||||
|
|
||||||
public record SearchTerms(
|
public final class SearchTerms {
|
||||||
|
private final LongList includes;
|
||||||
|
private final LongList excludes;
|
||||||
|
private final LongList priority;
|
||||||
|
private final List<LongList> coherences;
|
||||||
|
|
||||||
|
public SearchTerms(
|
||||||
LongList includes,
|
LongList includes,
|
||||||
LongList excludes,
|
LongList excludes,
|
||||||
LongList priority,
|
LongList priority,
|
||||||
List<LongList> coherences
|
List<LongList> coherences
|
||||||
)
|
) {
|
||||||
{
|
this.includes = includes;
|
||||||
|
this.excludes = excludes;
|
||||||
|
this.priority = priority;
|
||||||
|
this.coherences = coherences;
|
||||||
|
}
|
||||||
|
|
||||||
public SearchTerms(SearchSubquery subquery) {
|
public SearchTerms(SearchSubquery subquery) {
|
||||||
this(new LongArrayList(),
|
this(new LongArrayList(),
|
||||||
new LongArrayList(),
|
new LongArrayList(),
|
||||||
@ -67,4 +79,46 @@ public record SearchTerms(
|
|||||||
public int size() {
|
public int size() {
|
||||||
return includes.size() + excludes.size() + priority.size();
|
return includes.size() + excludes.size() + priority.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public LongList includes() {
|
||||||
|
return includes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongList excludes() {
|
||||||
|
return excludes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public LongList priority() {
|
||||||
|
return priority;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<LongList> coherences() {
|
||||||
|
return coherences;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (obj == this) return true;
|
||||||
|
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||||
|
var that = (SearchTerms) obj;
|
||||||
|
return Objects.equals(this.includes, that.includes) &&
|
||||||
|
Objects.equals(this.excludes, that.excludes) &&
|
||||||
|
Objects.equals(this.priority, that.priority) &&
|
||||||
|
Objects.equals(this.coherences, that.coherences);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Objects.hash(includes, excludes, priority, coherences);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "SearchTerms[" +
|
||||||
|
"includes=" + includes + ", " +
|
||||||
|
"excludes=" + excludes + ", " +
|
||||||
|
"priority=" + priority + ", " +
|
||||||
|
"coherences=" + coherences + ']';
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -22,7 +22,6 @@ import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentI
|
|||||||
public class IndexMetadataService {
|
public class IndexMetadataService {
|
||||||
private final StatefulIndex index;
|
private final StatefulIndex index;
|
||||||
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public IndexMetadataService(StatefulIndex index) {
|
public IndexMetadataService(StatefulIndex index) {
|
||||||
this.index = index;
|
this.index = index;
|
||||||
|
@ -47,18 +47,12 @@ public class IndexResultValuatorService {
|
|||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
CombinedDocIdList resultIds)
|
CombinedDocIdList resultIds)
|
||||||
{
|
{
|
||||||
final var evaluator = new IndexResultValuationContext(metadataService,
|
final var evaluator = createValuationContext(params, rankingContext, resultIds);
|
||||||
resultValuator,
|
|
||||||
resultIds,
|
|
||||||
statefulIndex,
|
|
||||||
rankingContext,
|
|
||||||
params.subqueries,
|
|
||||||
params.queryParams);
|
|
||||||
|
|
||||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||||
|
|
||||||
for (long docId : resultIds.array()) {
|
for (long id : resultIds.array()) {
|
||||||
var score = evaluator.calculatePreliminaryScore(docId);
|
var score = evaluator.calculatePreliminaryScore(id);
|
||||||
if (score != null) {
|
if (score != null) {
|
||||||
results.add(score);
|
results.add(score);
|
||||||
}
|
}
|
||||||
@ -67,6 +61,19 @@ public class IndexResultValuatorService {
|
|||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private IndexResultValuationContext createValuationContext(SearchParameters params,
|
||||||
|
ResultRankingContext rankingContext,
|
||||||
|
CombinedDocIdList resultIds)
|
||||||
|
{
|
||||||
|
return new IndexResultValuationContext(metadataService,
|
||||||
|
resultValuator,
|
||||||
|
resultIds,
|
||||||
|
statefulIndex,
|
||||||
|
rankingContext,
|
||||||
|
params.subqueries,
|
||||||
|
params.queryParams);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
|
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
|
@ -1,11 +1,17 @@
|
|||||||
package nu.marginalia.index.results.model.ids;
|
package nu.marginalia.index.results.model.ids;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongIterators;
|
||||||
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
import org.roaringbitmap.longlong.Roaring64Bitmap;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.stream.LongStream;
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
|
/** A list of document ids, with their ranking bits still remaining.
|
||||||
|
*
|
||||||
|
* @see nu.marginalia.index.results.model.ids.DocIdList
|
||||||
|
* @see nu.marginalia.model.id.UrlIdCodec
|
||||||
|
* */
|
||||||
public final class CombinedDocIdList {
|
public final class CombinedDocIdList {
|
||||||
private final long[] data;
|
private final long[] data;
|
||||||
|
|
||||||
@ -48,5 +54,6 @@ public final class CombinedDocIdList {
|
|||||||
public void sort() {
|
public void sort() {
|
||||||
Arrays.sort(data);
|
Arrays.sort(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,6 +6,11 @@ import java.util.Arrays;
|
|||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.stream.LongStream;
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
|
/** A list of document ids, with their ranking bits removed.
|
||||||
|
*
|
||||||
|
* @see nu.marginalia.index.results.model.ids.CombinedDocIdList
|
||||||
|
* @see nu.marginalia.model.id.UrlIdCodec
|
||||||
|
* */
|
||||||
public final class DocIdList {
|
public final class DocIdList {
|
||||||
private final long[] array;
|
private final long[] array;
|
||||||
|
|
||||||
|
@ -19,29 +19,15 @@ public class IndexQuery {
|
|||||||
private final List<EntrySource> sources;
|
private final List<EntrySource> sources;
|
||||||
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
|
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
|
||||||
|
|
||||||
public final IndexQueryPriority queryPriority;
|
public IndexQuery(List<EntrySource> sources)
|
||||||
public final int fetchSizeMultiplier;
|
|
||||||
|
|
||||||
public IndexQuery(EntrySource... sources) {
|
|
||||||
this(List.of(sources), IndexQueryPriority.BEST, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an IndexQuery object with the given sources, priority, and fetchSizeMultiplier.
|
|
||||||
*
|
|
||||||
* @param sources List of EntrySource objects representing the sources to query from
|
|
||||||
* @param priority IndexQueryPriority of the query, determining how many results to fetch before stopping
|
|
||||||
* @param fetchSizeMultiplier Affects the fetch size of the query, determining how deep the query should go
|
|
||||||
*/
|
|
||||||
public IndexQuery(List<EntrySource> sources,
|
|
||||||
IndexQueryPriority priority,
|
|
||||||
int fetchSizeMultiplier)
|
|
||||||
{
|
{
|
||||||
this.sources = sources;
|
this.sources = sources;
|
||||||
this.queryPriority = priority;
|
|
||||||
this.fetchSizeMultiplier = fetchSizeMultiplier;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public IndexQuery(EntrySource... sources)
|
||||||
|
{
|
||||||
|
this.sources = List.of(sources);
|
||||||
|
}
|
||||||
/** Adds a filter to the query. The filter will be applied to the results
|
/** Adds a filter to the query. The filter will be applied to the results
|
||||||
* after they are read from the sources.
|
* after they are read from the sources.
|
||||||
*
|
*
|
||||||
|
@ -1,14 +0,0 @@
|
|||||||
package nu.marginalia.index.query;
|
|
||||||
|
|
||||||
/** Designates the presumptive value of an IndexQuery.
|
|
||||||
*/
|
|
||||||
public enum IndexQueryPriority {
|
|
||||||
/** This is likely to produce highly relevant results */
|
|
||||||
BEST,
|
|
||||||
|
|
||||||
/** This may produce relevant results */
|
|
||||||
GOOD,
|
|
||||||
|
|
||||||
/** This is a fallback query, only execute if no higher prioritized query returned any results */
|
|
||||||
FALLBACK
|
|
||||||
}
|
|
@ -6,6 +6,9 @@ It exposes an API for querying the index, and contains the logic
|
|||||||
for ranking search results. It does not parse the query, that is
|
for ranking search results. It does not parse the query, that is
|
||||||
the responsibility of the [search-query](../functions/search-query) module.
|
the responsibility of the [search-query](../functions/search-query) module.
|
||||||
|
|
||||||
|
The central class of the index subsystem is the [IndexGrpcService](java/nu/marginalia/index/IndexGrpcService.java) class,
|
||||||
|
which is a gRPC service that exposes the index to the rest of the system.
|
||||||
|
|
||||||
## Indexes
|
## Indexes
|
||||||
|
|
||||||
There are two indexes with accompanying tools for constructing them.
|
There are two indexes with accompanying tools for constructing them.
|
||||||
|
Loading…
Reference in New Issue
Block a user