mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Cleaning up and adding better error messages.
This commit is contained in:
parent
fbe17b62ed
commit
eaef93f4ae
@ -25,14 +25,14 @@ public class CachingBTreeReader {
|
|||||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||||
}
|
}
|
||||||
|
|
||||||
public Cache prepareCache(BTreeHeader header) {
|
public BTreeCachedIndex prepareCache(BTreeHeader header) {
|
||||||
return new Cache(header);
|
return new BTreeCachedIndex(header);
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @return file offset of entry matching keyRaw, negative if absent
|
* @return file offset of entry matching keyRaw, negative if absent
|
||||||
*/
|
*/
|
||||||
public long findEntry(Cache cache, final long keyRaw) {
|
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
|
||||||
BTreeHeader header = cache.header;
|
BTreeHeader header = cache.header;
|
||||||
|
|
||||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||||
@ -62,7 +62,7 @@ public class CachingBTreeReader {
|
|||||||
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
||||||
}
|
}
|
||||||
|
|
||||||
private long searchIndex(BTreeHeader header, Cache cache, long key) {
|
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
|
||||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||||
long layerOffset = 0;
|
long layerOffset = 0;
|
||||||
|
|
||||||
@ -83,13 +83,13 @@ public class CachingBTreeReader {
|
|||||||
* for repeated queries against the same tree. The memory consumption is typically very low
|
* for repeated queries against the same tree. The memory consumption is typically very low
|
||||||
* and the disk access pattern for reading the entire index relatively cheap.
|
* and the disk access pattern for reading the entire index relatively cheap.
|
||||||
*/
|
*/
|
||||||
public class Cache {
|
public class BTreeCachedIndex {
|
||||||
long[] indexData;
|
long[] indexData;
|
||||||
final BTreeHeader header;
|
final BTreeHeader header;
|
||||||
|
|
||||||
final int indexedDataSize;
|
final int indexedDataSize;
|
||||||
|
|
||||||
public Cache(BTreeHeader header) {
|
public BTreeCachedIndex(BTreeHeader header) {
|
||||||
this.header = header;
|
this.header = header;
|
||||||
indexedDataSize = header.numEntries();
|
indexedDataSize = header.numEntries();
|
||||||
}
|
}
|
||||||
|
@ -95,4 +95,8 @@ public class WmsaHome {
|
|||||||
home.resolve("model/opennlp-tok.bin"));
|
home.resolve("model/opennlp-tok.bin"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
|
||||||
|
public static boolean isDebug() {
|
||||||
|
return debugMode;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -3,10 +3,10 @@ package nu.marginalia.wmsa.edge.index;
|
|||||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
||||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery;
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory;
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
@ -3,68 +3,29 @@ package nu.marginalia.wmsa.edge.index;
|
|||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
import com.google.protobuf.InvalidProtocolBufferException;
|
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
|
||||||
import io.prometheus.client.Counter;
|
|
||||||
import io.prometheus.client.Histogram;
|
|
||||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||||
import nu.marginalia.util.ListChunker;
|
|
||||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
|
||||||
import nu.marginalia.wmsa.client.GsonFactory;
|
import nu.marginalia.wmsa.client.GsonFactory;
|
||||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||||
import nu.marginalia.wmsa.configuration.server.Service;
|
import nu.marginalia.wmsa.configuration.server.Service;
|
||||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
|
||||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
|
||||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
|
||||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
|
||||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.ResultDomainDeduplicator;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery;
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
|
||||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
|
||||||
import nu.marginalia.wmsa.edge.model.id.EdgeIdArray;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.*;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
|
||||||
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
|
|
||||||
import org.apache.http.HttpStatus;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import spark.HaltException;
|
|
||||||
import spark.Request;
|
import spark.Request;
|
||||||
import spark.Response;
|
import spark.Response;
|
||||||
import spark.Spark;
|
import spark.Spark;
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.LongPredicate;
|
|
||||||
|
|
||||||
import static spark.Spark.get;
|
import static spark.Spark.get;
|
||||||
import static spark.Spark.halt;
|
|
||||||
|
|
||||||
public class EdgeIndexService extends Service {
|
public class EdgeIndexService extends Service {
|
||||||
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
|
|
||||||
private static final int QUERY_FETCH_SIZE = 8192;
|
|
||||||
private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
private final Initialization init;
|
private final Initialization init;
|
||||||
private final SearchIndexes indexes;
|
private final SearchIndexes indexes;
|
||||||
private final KeywordLexicon keywordLexicon;
|
|
||||||
|
|
||||||
private final Gson gson = GsonFactory.get();
|
|
||||||
|
|
||||||
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
|
|
||||||
private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
|
|
||||||
|
|
||||||
public static final int DYNAMIC_BUCKET_LENGTH = 7;
|
public static final int DYNAMIC_BUCKET_LENGTH = 7;
|
||||||
|
|
||||||
@ -75,71 +36,34 @@ public class EdgeIndexService extends Service {
|
|||||||
Initialization init,
|
Initialization init,
|
||||||
MetricsServer metricsServer,
|
MetricsServer metricsServer,
|
||||||
SearchIndexes indexes,
|
SearchIndexes indexes,
|
||||||
IndexServicesFactory servicesFactory) {
|
|
||||||
|
EdgeIndexOpsService opsService,
|
||||||
|
EdgeIndexLexiconService lexiconService,
|
||||||
|
EdgeIndexQueryService indexQueryService)
|
||||||
|
{
|
||||||
super(ip, port, init, metricsServer);
|
super(ip, port, init, metricsServer);
|
||||||
|
|
||||||
|
final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
this.init = init;
|
this.init = init;
|
||||||
this.indexes = indexes;
|
this.indexes = indexes;
|
||||||
this.keywordLexicon = servicesFactory.getKeywordLexicon();
|
|
||||||
|
|
||||||
Spark.post("/words/", this::putWords);
|
Spark.post("/words/", lexiconService::putWords);
|
||||||
Spark.post("/search/", this::search, gson::toJson);
|
|
||||||
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
|
|
||||||
|
|
||||||
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
|
Spark.post("/search/", indexQueryService::search, gson::toJson);
|
||||||
|
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
|
||||||
|
|
||||||
Spark.post("/ops/repartition", this::repartitionEndpoint);
|
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
|
||||||
Spark.post("/ops/preconvert", this::preconvertEndpoint);
|
|
||||||
Spark.post("/ops/reindex/:id", this::reindexEndpoint);
|
Spark.post("/ops/repartition", opsService::repartitionEndpoint);
|
||||||
|
Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
|
||||||
|
Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);
|
||||||
|
|
||||||
get("/is-blocked", this::isBlocked, gson::toJson);
|
get("/is-blocked", this::isBlocked, gson::toJson);
|
||||||
|
|
||||||
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
|
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object getWordId(Request request, Response response) {
|
|
||||||
final String word = request.splat()[0];
|
|
||||||
|
|
||||||
var dr = indexes.getDictionaryReader();
|
|
||||||
if (null == dr) {
|
|
||||||
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
final int wordId = dr.get(word);
|
|
||||||
|
|
||||||
if (DictionaryHashMap.NO_VALUE == wordId) {
|
|
||||||
response.status(404);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
return wordId;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object repartitionEndpoint(Request request, Response response) {
|
|
||||||
|
|
||||||
if (!indexes.repartition()) {
|
|
||||||
Spark.halt(503, "Operations busy");
|
|
||||||
}
|
|
||||||
return "OK";
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object preconvertEndpoint(Request request, Response response) {
|
|
||||||
if (!indexes.preconvert()) {
|
|
||||||
Spark.halt(503, "Operations busy");
|
|
||||||
}
|
|
||||||
return "OK";
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object reindexEndpoint(Request request, Response response) {
|
|
||||||
int id = Integer.parseInt(request.params("id"));
|
|
||||||
|
|
||||||
if (!indexes.reindex(id)) {
|
|
||||||
Spark.halt(503, "Operations busy");
|
|
||||||
}
|
|
||||||
return "OK";
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object isBlocked(Request request, Response response) {
|
private Object isBlocked(Request request, Response response) {
|
||||||
return indexes.isBusy() || !initialized;
|
return indexes.isBusy() || !initialized;
|
||||||
}
|
}
|
||||||
@ -156,296 +80,6 @@ public class EdgeIndexService extends Service {
|
|||||||
indexes.initialize(init);
|
indexes.initialize(init);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
|
|
||||||
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
|
|
||||||
|
|
||||||
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
|
|
||||||
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
|
|
||||||
int idx = req.getIndex();
|
|
||||||
|
|
||||||
for (int ws = 0; ws < req.getWordSetCount(); ws++) {
|
|
||||||
putWords(domainId, urlId, req.getWordSet(ws), idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
response.status(HttpStatus.SC_ACCEPTED);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
|
||||||
IndexPutKeywordsReq.WordSet words, int idx
|
|
||||||
) {
|
|
||||||
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
|
||||||
|
|
||||||
IndexBlock block = IndexBlock.values()[words.getIndex()];
|
|
||||||
|
|
||||||
for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
|
|
||||||
|
|
||||||
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
|
||||||
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
|
|
||||||
|
|
||||||
indexWriter.put(header, entry);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private long[] getOrInsertWordIds(List<String> words) {
|
|
||||||
long[] ids = new long[words.size()];
|
|
||||||
int putIdx = 0;
|
|
||||||
|
|
||||||
for (String word : words) {
|
|
||||||
long id = keywordLexicon.getOrInsert(word);
|
|
||||||
if (id != DictionaryHashMap.NO_VALUE) {
|
|
||||||
ids[putIdx++] = id;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (putIdx != words.size()) {
|
|
||||||
ids = Arrays.copyOf(ids, putIdx);
|
|
||||||
}
|
|
||||||
return ids;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object searchDomain(Request request, Response response) {
|
|
||||||
if (indexes.getDictionaryReader() == null) {
|
|
||||||
logger.warn("Dictionary reader not yet initialized");
|
|
||||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
|
||||||
}
|
|
||||||
|
|
||||||
String json = request.body();
|
|
||||||
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
|
|
||||||
|
|
||||||
final int wordId = keywordLexicon.getReadOnly(specsSet.keyword);
|
|
||||||
|
|
||||||
EdgeIdArray<EdgeUrl> urlIds = EdgeIdArray.gather(indexes
|
|
||||||
.getBucket(specsSet.bucket)
|
|
||||||
.findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
|
|
||||||
.mapToInt(lv -> (int)(lv & 0xFFFF_FFFFL)));
|
|
||||||
|
|
||||||
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object search(Request request, Response response) {
|
|
||||||
if (indexes.getDictionaryReader() == null) {
|
|
||||||
logger.warn("Dictionary reader not yet initialized");
|
|
||||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
|
||||||
}
|
|
||||||
|
|
||||||
String json = request.body();
|
|
||||||
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
try {
|
|
||||||
return new EdgeSearchResultSet(new SearchQuery(specsSet).execute());
|
|
||||||
}
|
|
||||||
catch (HaltException ex) {
|
|
||||||
logger.warn("Halt", ex);
|
|
||||||
throw ex;
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
|
||||||
logger.info("Error", ex);
|
|
||||||
Spark.halt(500, "Error");
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private class SearchQuery {
|
|
||||||
private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
|
|
||||||
private final EdgeSearchSpecification specsSet;
|
|
||||||
private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
|
||||||
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
|
|
||||||
|
|
||||||
public SearchQuery(EdgeSearchSpecification specsSet) {
|
|
||||||
this.specsSet = specsSet;
|
|
||||||
}
|
|
||||||
|
|
||||||
private List<EdgeSearchResultItem> execute() {
|
|
||||||
final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
|
|
||||||
|
|
||||||
for (var sq : specsSet.subqueries) {
|
|
||||||
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
|
|
||||||
|
|
||||||
if (searchTerms.isEmpty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
results.addAll(performSearch(searchTerms.get(), sq));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var result : results) {
|
|
||||||
addResultScores(result);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!budget.hasTimeLeft()) {
|
|
||||||
wmsa_edge_index_query_timeouts.inc();
|
|
||||||
}
|
|
||||||
|
|
||||||
var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
|
|
||||||
|
|
||||||
// cachePool.printSummary(logger);
|
|
||||||
cachePool.clear();
|
|
||||||
|
|
||||||
return results.stream()
|
|
||||||
.sorted(Comparator.comparing(EdgeSearchResultItem::getScore))
|
|
||||||
.filter(domainCountFilter::test)
|
|
||||||
.limit(specsSet.getLimitTotal()).toList();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private List<EdgeSearchResultItem> performSearch(EdgeIndexSearchTerms searchTerms,
|
|
||||||
EdgeSearchSubquery sq)
|
|
||||||
{
|
|
||||||
|
|
||||||
final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
|
|
||||||
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
|
|
||||||
|
|
||||||
final int remainingResults = QUERY_FETCH_SIZE;
|
|
||||||
|
|
||||||
for (int indexBucket : specsSet.buckets) {
|
|
||||||
|
|
||||||
if (!budget.hasTimeLeft()) {
|
|
||||||
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (remainingResults <= results.size())
|
|
||||||
break;
|
|
||||||
|
|
||||||
var query = getQuery(cachePool, indexBucket, sq.block, lv -> localFilter.filterRawValue(indexBucket, lv), searchTerms);
|
|
||||||
long[] buf = new long[8192];
|
|
||||||
|
|
||||||
while (query.hasMore() && results.size() < remainingResults && budget.hasTimeLeft()) {
|
|
||||||
int cnt = query.getMoreResults(buf, budget);
|
|
||||||
|
|
||||||
for (int i = 0; i < cnt && results.size() < remainingResults; i++) {
|
|
||||||
long id = buf[i];
|
|
||||||
|
|
||||||
final EdgeSearchResultItem ri = new EdgeSearchResultItem(indexBucket, id);
|
|
||||||
|
|
||||||
if (!seenResults.add(ri.getUrlId().id()) || !localFilter.test(ri)) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
results.add(ri);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
|
|
||||||
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
|
||||||
|
|
||||||
if (!indexes.isValidBucket(bucket)) {
|
|
||||||
logger.warn("Invalid bucket {}", bucket);
|
|
||||||
return new IndexQuery(Collections.emptyList());
|
|
||||||
}
|
|
||||||
|
|
||||||
return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void addResultScores(EdgeSearchResultItem searchResult) {
|
|
||||||
final var reader = Objects.requireNonNull(indexes.getDictionaryReader());
|
|
||||||
|
|
||||||
List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
|
||||||
|
|
||||||
// Memoize calls to getTermData, as they're redundant and cause disk reads
|
|
||||||
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
|
|
||||||
|
|
||||||
double bestScore = 0;
|
|
||||||
|
|
||||||
for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
|
|
||||||
double setScore = 0;
|
|
||||||
int setSize = 0;
|
|
||||||
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
|
|
||||||
|
|
||||||
final int termId = reader.get(searchTerm);
|
|
||||||
|
|
||||||
ResultTermData data = termMetadata.computeIfAbsent(
|
|
||||||
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
|
|
||||||
|
|
||||||
var score = data.asScore(searchTermListIdx, searchTerm);
|
|
||||||
searchResult.scores.add(score);
|
|
||||||
setScore += score.value();
|
|
||||||
setSize++;
|
|
||||||
}
|
|
||||||
bestScore = Math.min(bestScore, setScore/setSize);
|
|
||||||
}
|
|
||||||
|
|
||||||
searchResult.setScore(bestScore);
|
|
||||||
}
|
|
||||||
|
|
||||||
private ResultTermData getTermData(ResultTerm resultTerm) {
|
|
||||||
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
|
|
||||||
final int termId = resultTerm.termId;
|
|
||||||
final long combinedUrlId = resultTerm.combinedUrlId;
|
|
||||||
|
|
||||||
return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
|
|
||||||
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
|
|
||||||
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
record ResultTerm (int bucket, int termId, long combinedUrlId) {}
|
|
||||||
record ResultTermData (IndexBlock index,
|
|
||||||
boolean title,
|
|
||||||
boolean link,
|
|
||||||
boolean site,
|
|
||||||
boolean subject,
|
|
||||||
boolean name,
|
|
||||||
boolean high,
|
|
||||||
boolean mid,
|
|
||||||
boolean low
|
|
||||||
) {
|
|
||||||
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
|
|
||||||
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
private Optional<EdgeIndexSearchTerms> getSearchTerms(EdgeSearchSubquery request) {
|
|
||||||
final List<Integer> excludes = new ArrayList<>();
|
|
||||||
final List<Integer> includes = new ArrayList<>();
|
|
||||||
|
|
||||||
for (var include : request.searchTermsInclude) {
|
|
||||||
var word = lookUpWord(include);
|
|
||||||
if (word.isEmpty()) {
|
|
||||||
logger.debug("Unknown search term: " + include);
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
includes.add(word.getAsInt());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var exclude : request.searchTermsExclude) {
|
|
||||||
lookUpWord(exclude).ifPresent(excludes::add);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (includes.isEmpty()) {
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
return Optional.of(new EdgeIndexSearchTerms(includes, excludes));
|
|
||||||
}
|
|
||||||
|
|
||||||
private OptionalInt lookUpWord(String s) {
|
|
||||||
int ret = indexes.getDictionaryReader().get(s);
|
|
||||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
|
||||||
return OptionalInt.empty();
|
|
||||||
}
|
|
||||||
return OptionalInt.of(ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.client;
|
package nu.marginalia.wmsa.edge.index.client;
|
||||||
|
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import io.prometheus.client.Summary;
|
||||||
import io.reactivex.rxjava3.core.Observable;
|
import io.reactivex.rxjava3.core.Observable;
|
||||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||||
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
||||||
@ -10,6 +11,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
|
|||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||||
@ -23,6 +25,8 @@ import java.util.concurrent.TimeUnit;
|
|||||||
@Singleton
|
@Singleton
|
||||||
public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
|
public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
|
||||||
|
|
||||||
|
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
|
||||||
|
|
||||||
public EdgeIndexClient() {
|
public EdgeIndexClient() {
|
||||||
super(ServiceDescriptor.EDGE_INDEX);
|
super(ServiceDescriptor.EDGE_INDEX);
|
||||||
setTimeout(30);
|
setTimeout(30);
|
||||||
@ -52,20 +56,10 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW
|
|||||||
|
|
||||||
|
|
||||||
@CheckReturnValue
|
@CheckReturnValue
|
||||||
public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) {
|
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
|
||||||
return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst();
|
return wmsa_search_index_api_time.time(
|
||||||
}
|
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
|
||||||
|
);
|
||||||
@CheckReturnValue
|
|
||||||
public List<EdgeSearchResultSet> multiQuery(Context ctx, EdgeSearchSpecification... specs) {
|
|
||||||
|
|
||||||
return Observable.fromArray(specs)
|
|
||||||
.concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class)
|
|
||||||
.subscribeOn(Schedulers.io())
|
|
||||||
.timeout(1, TimeUnit.SECONDS)
|
|
||||||
.onErrorComplete())
|
|
||||||
.toList()
|
|
||||||
.blockingGet();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CheckReturnValue
|
@CheckReturnValue
|
||||||
|
@ -9,4 +9,8 @@ import java.util.List;
|
|||||||
public class EdgeIndexSearchTerms {
|
public class EdgeIndexSearchTerms {
|
||||||
public List<Integer> includes = new ArrayList<>();
|
public List<Integer> includes = new ArrayList<>();
|
||||||
public List<Integer> excludes = new ArrayList<>();
|
public List<Integer> excludes = new ArrayList<>();
|
||||||
|
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return includes.isEmpty();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,52 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader;
|
|
||||||
|
|
||||||
import gnu.trove.map.TLongIntMap;
|
|
||||||
import gnu.trove.map.hash.TLongIntHashMap;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
public class ResultDomainDeduplicator {
|
|
||||||
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
|
|
||||||
final int limitByDomain;
|
|
||||||
|
|
||||||
public ResultDomainDeduplicator(int limitByDomain) {
|
|
||||||
this.limitByDomain = limitByDomain;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean filterRawValue(int bucket, long value) {
|
|
||||||
int domain = (int) (value >>> 32);
|
|
||||||
|
|
||||||
if (domain == Integer.MAX_VALUE) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return resultsByRankingId.get(getKey(bucket, domain)) <= limitByDomain;
|
|
||||||
}
|
|
||||||
|
|
||||||
long getKey(int bucketId, int rankingId) {
|
|
||||||
return ((long) bucketId) << 32 | rankingId;
|
|
||||||
}
|
|
||||||
|
|
||||||
long getKey(EdgeSearchResultItem item) {
|
|
||||||
return ((long) item.bucketId) << 32 | item.getRanking();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean test(EdgeSearchResultItem item) {
|
|
||||||
if (item.getRanking() == Integer.MAX_VALUE) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1) <= limitByDomain;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addAll(List<EdgeSearchResultItem> items) {
|
|
||||||
for (var item : items) {
|
|
||||||
resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void add(EdgeSearchResultItem item) {
|
|
||||||
resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1);
|
|
||||||
}
|
|
||||||
}
|
|
@ -9,8 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader;
|
|||||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
|
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -77,25 +78,25 @@ public class SearchIndex implements AutoCloseable {
|
|||||||
return rangeForWord(pool, wordId).numEntries();
|
return rangeForWord(pool, wordId).numEntries();
|
||||||
}
|
}
|
||||||
|
|
||||||
public UrlIndexTree rangeForWord(IndexQueryCachePool pool, int wordId) {
|
public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
|
||||||
UrlIndexTree range = pool.getRange(words, wordId);
|
IndexBTreeRange range = pool.getRange(words, wordId);
|
||||||
|
|
||||||
if (range == null) {
|
if (range == null) {
|
||||||
range = new UrlIndexTree(words.positionForWord(wordId));
|
range = new IndexBTreeRange(words.positionForWord(wordId));
|
||||||
pool.cacheRange(words, wordId, range);
|
pool.cacheRange(words, wordId, range);
|
||||||
}
|
}
|
||||||
|
|
||||||
return range;
|
return range;
|
||||||
}
|
}
|
||||||
|
|
||||||
public UrlIndexTree rangeForWord(int wordId) {
|
public IndexBTreeRange rangeForWord(int wordId) {
|
||||||
return new UrlIndexTree(words.positionForWord(wordId));
|
return new IndexBTreeRange(words.positionForWord(wordId));
|
||||||
}
|
}
|
||||||
|
|
||||||
public class UrlIndexTree {
|
public class IndexBTreeRange {
|
||||||
final long dataOffset;
|
public final long dataOffset;
|
||||||
private BTreeHeader header;
|
private BTreeHeader header;
|
||||||
public UrlIndexTree(long dataOffset) {
|
public IndexBTreeRange(long dataOffset) {
|
||||||
this.dataOffset = dataOffset;
|
this.dataOffset = dataOffset;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -126,7 +127,7 @@ public class SearchIndex implements AutoCloseable {
|
|||||||
return new AsEntrySource();
|
return new AsEntrySource();
|
||||||
}
|
}
|
||||||
|
|
||||||
public QueryFilterStep asExcludeFilterStep(IndexQueryCachePool pool) {
|
public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
|
||||||
return new AsExcludeQueryFilterStep(pool);
|
return new AsExcludeQueryFilterStep(pool);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -150,7 +151,7 @@ public class SearchIndex implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
|
public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
|
||||||
if (dataOffset < 0) return false;
|
if (dataOffset < 0) return false;
|
||||||
|
|
||||||
return cachingBTreeReader.findEntry(cache, url) >= 0;
|
return cachingBTreeReader.findEntry(cache, url) >= 0;
|
||||||
@ -160,12 +161,12 @@ public class SearchIndex implements AutoCloseable {
|
|||||||
if (dataOffset < 0)
|
if (dataOffset < 0)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
CachingBTreeReader.Cache cache = pool.getIndexCache(SearchIndex.this, this);
|
CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
|
||||||
|
|
||||||
return cachingBTreeReader.findEntry(cache, url) >= 0;
|
return cachingBTreeReader.findEntry(cache, url) >= 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CachingBTreeReader.Cache createIndexCache() {
|
public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
|
||||||
if (dataOffset < 0)
|
if (dataOffset < 0)
|
||||||
return null;
|
return null;
|
||||||
|
|
||||||
@ -213,11 +214,11 @@ public class SearchIndex implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
class AsExcludeQueryFilterStep implements QueryFilterStep {
|
class AsExcludeQueryFilterStep implements QueryFilterStepIf {
|
||||||
private final CachingBTreeReader.Cache cache;
|
private final CachingBTreeReader.BTreeCachedIndex cache;
|
||||||
|
|
||||||
public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
|
public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
|
||||||
cache = pool.getIndexCache(SearchIndex.this, UrlIndexTree.this);
|
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchIndex getIndex() {
|
public SearchIndex getIndex() {
|
||||||
|
@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory;
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -105,7 +105,7 @@ public class SearchIndexes {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public KeywordLexiconReadOnlyView getDictionaryReader() {
|
public KeywordLexiconReadOnlyView getLexiconReader() {
|
||||||
return keywordLexiconReadOnlyView;
|
return keywordLexiconReadOnlyView;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,26 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query;
|
|
||||||
|
|
||||||
import java.util.stream.LongStream;
|
|
||||||
|
|
||||||
/**
 * A query that can be refined by word id and consumed as a stream of longs.
 */
public interface Query {

    /** Refines this query with the given word id and returns the resulting query. */
    Query also(int wordId);

    /** Variant of {@link #also(int)}; presumably backed by a cache -- TODO confirm against implementations. */
    Query alsoCached(int wordId);

    /** Refines this query by negating the given word id and returns the resulting query. */
    Query not(int wordId);

    /** Returns the values produced by this query. */
    LongStream stream();

    /**
     * A query with no results: every refinement returns this same instance,
     * and {@link #stream()} is always empty.
     */
    Query EMPTY = new Query() {
        @Override
        public Query also(int wordId) {
            return this;
        }

        @Override
        public Query alsoCached(int wordId) {
            return this;
        }

        @Override
        public Query not(int wordId) {
            return this;
        }

        @Override
        public LongStream stream() {
            return LongStream.empty();
        }
    };
}
|
|
@ -1,125 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query.types;
|
|
||||||
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.StringJoiner;
|
|
||||||
|
|
||||||
public interface QueryFilterStep extends Comparable<QueryFilterStep> {
|
|
||||||
@Nullable
|
|
||||||
SearchIndex getIndex();
|
|
||||||
|
|
||||||
boolean test(long value);
|
|
||||||
|
|
||||||
double cost();
|
|
||||||
|
|
||||||
default int compareTo(QueryFilterStep other) {
|
|
||||||
return (int)(cost() - other.cost());
|
|
||||||
}
|
|
||||||
|
|
||||||
String describe();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Move each value in items to the beginning of the array,
|
|
||||||
* and return the number of matching items.
|
|
||||||
*
|
|
||||||
* The remaining values are undefined.
|
|
||||||
*/
|
|
||||||
default int retainDestructive(long[] items, int max) {
|
|
||||||
int keep = 0;
|
|
||||||
for (int i = 0; i < max; i++) {
|
|
||||||
if (test(items[i])) {
|
|
||||||
if (i != keep) {
|
|
||||||
items[keep] = items[i];
|
|
||||||
}
|
|
||||||
keep++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return keep;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Move each value in items to the beginning of the array,
|
|
||||||
* and return the number of matching items. The values that do
|
|
||||||
* not pass the test are moved to the end of the array.
|
|
||||||
*/
|
|
||||||
default int retainReorder(long[] items, int start, int max) {
|
|
||||||
int keep = 0;
|
|
||||||
for (int i = start; i < max; i++) {
|
|
||||||
if (test(items[i])) {
|
|
||||||
if (i != keep) {
|
|
||||||
long tmp = items[keep];
|
|
||||||
items[keep] = items[i];
|
|
||||||
items[i] = tmp;
|
|
||||||
}
|
|
||||||
keep++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return keep;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static QueryFilterStep noPass() {
|
|
||||||
return NoPassFilter.instance;
|
|
||||||
}
|
|
||||||
static QueryFilterStep anyOf(List<? extends QueryFilterStep> steps) {
|
|
||||||
return new AnyOfFilter(steps);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
class AnyOfFilter implements QueryFilterStep {
|
|
||||||
private final List<? extends QueryFilterStep> steps;
|
|
||||||
|
|
||||||
AnyOfFilter(List<? extends QueryFilterStep> steps) {
|
|
||||||
this.steps = steps;
|
|
||||||
}
|
|
||||||
|
|
||||||
public SearchIndex getIndex() { return null; }
|
|
||||||
|
|
||||||
public double cost() {
|
|
||||||
return steps.stream().mapToDouble(QueryFilterStep::cost).average().orElse(0.);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean test(long value) {
|
|
||||||
for (var step : steps) {
|
|
||||||
if (step.test(value))
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String describe() {
|
|
||||||
StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]");
|
|
||||||
for (var step : steps) {
|
|
||||||
sj.add(step.describe());
|
|
||||||
}
|
|
||||||
return sj.toString();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
class NoPassFilter implements QueryFilterStep {
|
|
||||||
static final QueryFilterStep instance = new NoPassFilter();
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean test(long value) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
public SearchIndex getIndex() { return null; }
|
|
||||||
public double cost() { return 0.; }
|
|
||||||
|
|
||||||
public int retainDestructive(long[] items, int max) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
public int retainReorder(long[] items, int start, int max) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String describe() {
|
|
||||||
return "[NoPass]";
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -0,0 +1,107 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.protobuf.InvalidProtocolBufferException;
|
||||||
|
import nu.marginalia.util.ListChunker;
|
||||||
|
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||||
|
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||||
|
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
||||||
|
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||||
|
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||||
|
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
|
||||||
|
import org.apache.http.HttpStatus;
|
||||||
|
import spark.Request;
|
||||||
|
import spark.Response;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class EdgeIndexLexiconService {
|
||||||
|
|
||||||
|
private final SearchIndexes indexes;
|
||||||
|
private final KeywordLexicon keywordLexicon;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) {
|
||||||
|
this.indexes = indexes;
|
||||||
|
this.keywordLexicon = servicesFactory.getKeywordLexicon();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object getWordId(Request request, Response response) {
|
||||||
|
final String word = request.splat()[0];
|
||||||
|
|
||||||
|
var lr = indexes.getLexiconReader();
|
||||||
|
if (null == lr) {
|
||||||
|
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
final int wordId = lr.get(word);
|
||||||
|
|
||||||
|
if (DictionaryHashMap.NO_VALUE == wordId) {
|
||||||
|
response.status(404);
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
return wordId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
|
||||||
|
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
|
||||||
|
|
||||||
|
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
|
||||||
|
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
|
||||||
|
int idx = req.getIndex();
|
||||||
|
|
||||||
|
for (int ws = 0; ws < req.getWordSetCount(); ws++) {
|
||||||
|
putWords(domainId, urlId, req.getWordSet(ws), idx);
|
||||||
|
}
|
||||||
|
|
||||||
|
response.status(HttpStatus.SC_ACCEPTED);
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
|
||||||
|
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
||||||
|
IndexPutKeywordsReq.WordSet words, int idx
|
||||||
|
) {
|
||||||
|
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
||||||
|
|
||||||
|
IndexBlock block = IndexBlock.values()[words.getIndex()];
|
||||||
|
|
||||||
|
for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
|
||||||
|
|
||||||
|
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
||||||
|
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
|
||||||
|
|
||||||
|
indexWriter.put(header, entry);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] getOrInsertWordIds(List<String> words) {
|
||||||
|
long[] ids = new long[words.size()];
|
||||||
|
int putIdx = 0;
|
||||||
|
|
||||||
|
for (String word : words) {
|
||||||
|
long id = keywordLexicon.getOrInsert(word);
|
||||||
|
if (id != DictionaryHashMap.NO_VALUE) {
|
||||||
|
ids[putIdx++] = id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (putIdx != words.size()) {
|
||||||
|
ids = Arrays.copyOf(ids, putIdx);
|
||||||
|
}
|
||||||
|
return ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,44 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||||
|
import spark.Request;
|
||||||
|
import spark.Response;
|
||||||
|
import spark.Spark;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class EdgeIndexOpsService {
|
||||||
|
|
||||||
|
private final SearchIndexes indexes;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public EdgeIndexOpsService(SearchIndexes indexes) {
|
||||||
|
this.indexes = indexes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object repartitionEndpoint(Request request, Response response) {
|
||||||
|
|
||||||
|
if (!indexes.repartition()) {
|
||||||
|
Spark.halt(503, "Operations busy");
|
||||||
|
}
|
||||||
|
return "OK";
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object preconvertEndpoint(Request request, Response response) {
|
||||||
|
if (!indexes.preconvert()) {
|
||||||
|
Spark.halt(503, "Operations busy");
|
||||||
|
}
|
||||||
|
return "OK";
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object reindexEndpoint(Request request, Response response) {
|
||||||
|
int id = Integer.parseInt(request.params("id"));
|
||||||
|
|
||||||
|
if (!indexes.reindex(id)) {
|
||||||
|
Spark.halt(503, "Operations busy");
|
||||||
|
}
|
||||||
|
return "OK";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,320 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc;
|
||||||
|
|
||||||
|
import com.google.gson.Gson;
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
|
import io.prometheus.client.Counter;
|
||||||
|
import io.prometheus.client.Histogram;
|
||||||
|
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||||
|
import nu.marginalia.wmsa.client.GsonFactory;
|
||||||
|
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||||
|
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.*;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||||
|
import org.apache.http.HttpStatus;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import spark.HaltException;
|
||||||
|
import spark.Request;
|
||||||
|
import spark.Response;
|
||||||
|
import spark.Spark;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.function.LongPredicate;
|
||||||
|
|
||||||
|
import static spark.Spark.halt;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class EdgeIndexQueryService {
|
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
|
||||||
|
private static final int QUERY_FETCH_SIZE = 8192;
|
||||||
|
private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
|
||||||
|
|
||||||
|
private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
|
||||||
|
|
||||||
|
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
|
||||||
|
private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register();
|
||||||
|
|
||||||
|
private final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
|
private final SearchIndexes indexes;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public EdgeIndexQueryService(SearchIndexes indexes) {
|
||||||
|
this.indexes = indexes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object searchDomain(Request request, Response response) {
|
||||||
|
if (indexes.getLexiconReader() == null) {
|
||||||
|
logger.warn("Dictionary reader not yet initialized");
|
||||||
|
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||||
|
}
|
||||||
|
|
||||||
|
String json = request.body();
|
||||||
|
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
|
||||||
|
|
||||||
|
try {
|
||||||
|
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
|
||||||
|
}
|
||||||
|
catch (HaltException ex) {
|
||||||
|
logger.warn("Halt", ex);
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
||||||
|
logger.info("Error", ex);
|
||||||
|
Spark.halt(500, "Error");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object search(Request request, Response response) {
|
||||||
|
if (indexes.getLexiconReader() == null) {
|
||||||
|
logger.warn("Dictionary reader not yet initialized");
|
||||||
|
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||||
|
}
|
||||||
|
|
||||||
|
String json = request.body();
|
||||||
|
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
||||||
|
|
||||||
|
try {
|
||||||
|
return wmsa_edge_index_query_time.time(() -> query(specsSet));
|
||||||
|
}
|
||||||
|
catch (HaltException ex) {
|
||||||
|
logger.warn("Halt", ex);
|
||||||
|
throw ex;
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
||||||
|
logger.info("Error", ex);
|
||||||
|
Spark.halt(500, "Error");
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) {
|
||||||
|
List<EdgeSearchResultItem> results = new SearchQuery(specsSet).execute();
|
||||||
|
return new EdgeSearchResultSet(results);
|
||||||
|
}
|
||||||
|
|
||||||
|
public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
|
||||||
|
|
||||||
|
final OptionalInt wordId = lookUpWord(specsSet.keyword);
|
||||||
|
EdgeIdList<EdgeUrl> urlIds;
|
||||||
|
|
||||||
|
if (wordId.isEmpty()) {
|
||||||
|
urlIds = new EdgeIdList<>();
|
||||||
|
} else {
|
||||||
|
urlIds = indexes
|
||||||
|
.getBucket(specsSet.bucket)
|
||||||
|
.findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
|
||||||
|
.mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL))
|
||||||
|
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
|
||||||
|
}
|
||||||
|
|
||||||
|
private class SearchQuery {
|
||||||
|
private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
|
||||||
|
private final EdgeSearchSpecification specsSet;
|
||||||
|
private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
||||||
|
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
|
||||||
|
|
||||||
|
public SearchQuery(EdgeSearchSpecification specsSet) {
|
||||||
|
this.specsSet = specsSet;
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<EdgeSearchResultItem> execute() {
|
||||||
|
final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
|
||||||
|
|
||||||
|
for (var sq : specsSet.subqueries) {
|
||||||
|
results.addAll(performSearch(sq));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var result : results) {
|
||||||
|
addResultScores(result);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!budget.hasTimeLeft()) {
|
||||||
|
wmsa_edge_index_query_timeouts.inc();
|
||||||
|
}
|
||||||
|
|
||||||
|
var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
|
||||||
|
|
||||||
|
if (WmsaHome.isDebug()) {
|
||||||
|
cachePool.printSummary(logger);
|
||||||
|
}
|
||||||
|
cachePool.clear();
|
||||||
|
|
||||||
|
return results.stream()
|
||||||
|
.sorted(Comparator.comparing(EdgeSearchResultItem::getScore))
|
||||||
|
.filter(domainCountFilter::test)
|
||||||
|
.limit(specsSet.getLimitTotal()).toList();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private List<EdgeSearchResultItem> performSearch(EdgeSearchSubquery sq)
|
||||||
|
{
|
||||||
|
|
||||||
|
final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
|
||||||
|
final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
|
||||||
|
|
||||||
|
if (searchTerms.isEmpty())
|
||||||
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
for (int indexBucket : specsSet.buckets) {
|
||||||
|
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
|
||||||
|
|
||||||
|
if (!budget.hasTimeLeft()) {
|
||||||
|
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (QUERY_FETCH_SIZE <= results.size())
|
||||||
|
break;
|
||||||
|
|
||||||
|
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
|
||||||
|
long[] buf = new long[8192];
|
||||||
|
|
||||||
|
while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) {
|
||||||
|
int cnt = query.getMoreResults(buf, budget);
|
||||||
|
|
||||||
|
for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) {
|
||||||
|
final long id = buf[i];
|
||||||
|
|
||||||
|
if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
results.add(new EdgeSearchResultItem(indexBucket, id));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return results;
|
||||||
|
}
|
||||||
|
|
||||||
|
private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
|
||||||
|
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
||||||
|
|
||||||
|
if (!indexes.isValidBucket(bucket)) {
|
||||||
|
logger.warn("Invalid bucket {}", bucket);
|
||||||
|
return new IndexQuery(Collections.emptyList());
|
||||||
|
}
|
||||||
|
|
||||||
|
return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addResultScores(EdgeSearchResultItem searchResult) {
|
||||||
|
final var reader = Objects.requireNonNull(indexes.getLexiconReader());
|
||||||
|
|
||||||
|
List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
||||||
|
|
||||||
|
// Memoize calls to getTermData, as they're somewhat expensive and highly redundant
|
||||||
|
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
|
||||||
|
|
||||||
|
double bestScore = 0;
|
||||||
|
|
||||||
|
for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
|
||||||
|
double setScore = 0;
|
||||||
|
int setSize = 0;
|
||||||
|
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
|
||||||
|
|
||||||
|
final int termId = reader.get(searchTerm);
|
||||||
|
|
||||||
|
ResultTermData data = termMetadata.computeIfAbsent(
|
||||||
|
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
|
||||||
|
|
||||||
|
var score = data.asScore(searchTermListIdx, searchTerm);
|
||||||
|
searchResult.scores.add(score);
|
||||||
|
setScore += score.value();
|
||||||
|
setSize++;
|
||||||
|
}
|
||||||
|
bestScore = Math.min(bestScore, setScore/setSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
searchResult.setScore(bestScore);
|
||||||
|
}
|
||||||
|
|
||||||
|
private ResultTermData getTermData(ResultTerm resultTerm) {
|
||||||
|
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
|
||||||
|
final int termId = resultTerm.termId;
|
||||||
|
final long combinedUrlId = resultTerm.combinedUrlId;
|
||||||
|
|
||||||
|
return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
|
||||||
|
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
record ResultTerm (int bucket, int termId, long combinedUrlId) {}
|
||||||
|
record ResultTermData (IndexBlock index,
|
||||||
|
boolean title,
|
||||||
|
boolean link,
|
||||||
|
boolean site,
|
||||||
|
boolean subject,
|
||||||
|
boolean name,
|
||||||
|
boolean high,
|
||||||
|
boolean mid,
|
||||||
|
boolean low
|
||||||
|
) {
|
||||||
|
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
|
||||||
|
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
|
||||||
|
final List<Integer> excludes = new ArrayList<>();
|
||||||
|
final List<Integer> includes = new ArrayList<>();
|
||||||
|
|
||||||
|
for (var include : request.searchTermsInclude) {
|
||||||
|
var word = lookUpWord(include);
|
||||||
|
if (word.isEmpty()) {
|
||||||
|
logger.debug("Unknown search term: " + include);
|
||||||
|
return new EdgeIndexSearchTerms(includes, excludes);
|
||||||
|
}
|
||||||
|
includes.add(word.getAsInt());
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var exclude : request.searchTermsExclude) {
|
||||||
|
lookUpWord(exclude).ifPresent(excludes::add);
|
||||||
|
}
|
||||||
|
|
||||||
|
return new EdgeIndexSearchTerms(includes, excludes);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private OptionalInt lookUpWord(String s) {
|
||||||
|
int ret = indexes.getLexiconReader().get(s);
|
||||||
|
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||||
|
return OptionalInt.empty();
|
||||||
|
}
|
||||||
|
return OptionalInt.of(ret);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,7 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query;
|
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||||
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@ -10,18 +10,18 @@ import static java.lang.Math.min;
|
|||||||
|
|
||||||
public class IndexQuery {
|
public class IndexQuery {
|
||||||
private final List<EntrySource> sources;
|
private final List<EntrySource> sources;
|
||||||
private final List<QueryFilterStep> inclusionFilter = new ArrayList<>(10);
|
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
|
||||||
private final List<QueryFilterStep> priorityFilter = new ArrayList<>(10);
|
private final List<QueryFilterStepIf> priorityFilter = new ArrayList<>(10);
|
||||||
|
|
||||||
public IndexQuery(List<EntrySource> sources) {
|
public IndexQuery(List<EntrySource> sources) {
|
||||||
this.sources = sources;
|
this.sources = sources;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addInclusionFilter(QueryFilterStep filter) {
|
public void addInclusionFilter(QueryFilterStepIf filter) {
|
||||||
inclusionFilter.add(filter);
|
inclusionFilter.add(filter);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void addPriorityFilter(QueryFilterStep filter) {
|
public void addPriorityFilter(QueryFilterStepIf filter) {
|
||||||
priorityFilter.add(filter);
|
priorityFilter.add(filter);
|
||||||
}
|
}
|
||||||
|
|
@ -1,17 +1,19 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader;
|
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||||
|
|
||||||
import nu.marginalia.util.btree.CachingBTreeReader;
|
import nu.marginalia.util.btree.CachingBTreeReader;
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public class IndexQueryCachePool {
|
public class IndexQueryCachePool {
|
||||||
private final Map<PoolKey, CachingBTreeReader.Cache> indexCaches = new HashMap<>();
|
private final Map<PoolKey, CachingBTreeReader.BTreeCachedIndex> indexCaches = new HashMap<>();
|
||||||
private final Map<RangeKey, SearchIndex.UrlIndexTree> rangeCache = new HashMap<>();
|
private final Map<RangeKey, SearchIndex.IndexBTreeRange> rangeCache = new HashMap<>();
|
||||||
private final Map<PoolKey, Integer> savedCounts = new HashMap<>();
|
private final Map<PoolKey, Integer> savedCounts = new HashMap<>();
|
||||||
|
|
||||||
public CachingBTreeReader.Cache getIndexCache(SearchIndex index, SearchIndex.UrlIndexTree range) {
|
public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) {
|
||||||
var key = new PoolKey(index, range.dataOffset);
|
var key = new PoolKey(index, range.dataOffset);
|
||||||
var entry = indexCaches.get(key);
|
var entry = indexCaches.get(key);
|
||||||
|
|
||||||
@ -33,10 +35,10 @@ public class IndexQueryCachePool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void printSummary(Logger logger) {
|
public void printSummary(Logger logger) {
|
||||||
long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.Cache::sizeBytes).sum();
|
long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum();
|
||||||
long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();
|
long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();
|
||||||
|
|
||||||
long loaded = indexCaches.values().stream().filter(CachingBTreeReader.Cache::isLoaded).count();
|
long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count();
|
||||||
|
|
||||||
logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
|
logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
|
||||||
}
|
}
|
||||||
@ -45,11 +47,11 @@ public class IndexQueryCachePool {
|
|||||||
indexCaches.clear();
|
indexCaches.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
public SearchIndex.UrlIndexTree getRange(IndexWordsTable words, int wordId) {
|
public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) {
|
||||||
return rangeCache.get(new RangeKey(words, wordId));
|
return rangeCache.get(new RangeKey(words, wordId));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.UrlIndexTree range) {
|
public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) {
|
||||||
rangeCache.put(new RangeKey(words, wordId), range);
|
rangeCache.put(new RangeKey(words, wordId), range);
|
||||||
}
|
}
|
||||||
|
|
@ -1,11 +1,10 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query;
|
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||||
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStepFromPredicate;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.types.UrlRangeSubFilter;
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.LongPredicate;
|
import java.util.function.LongPredicate;
|
||||||
@ -57,21 +56,21 @@ public class IndexQueryFactory {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public IndexQueryBuilder also(int termId) {
|
public IndexQueryBuilder also(int termId) {
|
||||||
List<QueryFilterStep> filters = new ArrayList<>(requiredIndices.size());
|
List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());
|
||||||
|
|
||||||
for (var ri : requiredIndices) {
|
for (var ri : requiredIndices) {
|
||||||
var range = ri.rangeForWord(cachePool, termId);
|
var range = ri.rangeForWord(cachePool, termId);
|
||||||
|
|
||||||
if (range.isPresent()) {
|
if (range.isPresent()) {
|
||||||
filters.add(new UrlRangeSubFilter(ri, range, cachePool));
|
filters.add(new QueryFilterBTreeRange(ri, range, cachePool));
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
filters.add(QueryFilterStep.noPass());
|
filters.add(QueryFilterStepIf.noPass());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
filters.sort(Comparator.naturalOrder());
|
filters.sort(Comparator.naturalOrder());
|
||||||
query.addInclusionFilter(QueryFilterStep.anyOf(filters));
|
query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));
|
||||||
|
|
||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
@ -92,7 +91,7 @@ public class IndexQueryFactory {
|
|||||||
for (var idx : priortyIndices) {
|
for (var idx : priortyIndices) {
|
||||||
var range = idx.rangeForWord(cachePool, termId);
|
var range = idx.rangeForWord(cachePool, termId);
|
||||||
if (range.isPresent()) {
|
if (range.isPresent()) {
|
||||||
query.addPriorityFilter(new UrlRangeSubFilter(idx, range, cachePool));
|
query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -0,0 +1,26 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||||
|
|
||||||
|
import java.util.stream.LongStream;
|
||||||
|
|
||||||
|
/**
 * Fluent, chainable view of an index query that ultimately yields a stream of
 * {@code long} values.
 *
 * <p>Each refinement method returns an {@code IndexQueryIf}, so calls can be chained
 * before {@link #stream()} is invoked.
 */
public interface IndexQueryIf {

    /**
     * A query that matches nothing: every refinement returns this same instance
     * and {@link #stream()} is always empty.
     */
    IndexQueryIf EMPTY = new IndexQueryIf() {
        @Override
        public IndexQueryIf also(int wordId) {
            return this;
        }

        @Override
        public IndexQueryIf alsoCached(int wordId) {
            return this;
        }

        @Override
        public IndexQueryIf not(int wordId) {
            return this;
        }

        @Override
        public LongStream stream() {
            return LongStream.empty();
        }
    };

    /**
     * Refines the query with an additional word.
     * NOTE(review): presumably restricts results to entries that also contain
     * {@code wordId} -- confirm against the implementations.
     *
     * @param wordId id of the word to refine by
     * @return the refined query
     */
    IndexQueryIf also(int wordId);

    /**
     * Variant of {@link #also(int)}.
     * NOTE(review): the name suggests a cache-backed lookup -- confirm against
     * the implementations.
     *
     * @param wordId id of the word to refine by
     * @return the refined query
     */
    IndexQueryIf alsoCached(int wordId);

    /**
     * Refines the query with a word to exclude.
     * NOTE(review): presumably removes entries containing {@code wordId} --
     * confirm against the implementations.
     *
     * @param wordId id of the word to exclude
     * @return the refined query
     */
    IndexQueryIf not(int wordId);

    /**
     * @return the values produced by this query
     */
    LongStream stream();
}
|
@ -1,4 +1,4 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query;
|
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||||
|
|
||||||
|
|
||||||
public class IndexSearchBudget {
|
public class IndexSearchBudget {
|
@ -0,0 +1,45 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||||
|
|
||||||
|
import gnu.trove.map.TLongIntMap;
|
||||||
|
import gnu.trove.map.hash.TLongIntHashMap;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
||||||
|
|
||||||
|
public class ResultDomainDeduplicator {
|
||||||
|
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
|
||||||
|
final int limitByDomain;
|
||||||
|
|
||||||
|
public ResultDomainDeduplicator(int limitByDomain) {
|
||||||
|
this.limitByDomain = limitByDomain;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean filterRawValue(long value) {
|
||||||
|
int rankingId = (int) (value >>> 32);
|
||||||
|
|
||||||
|
if (rankingId == Integer.MAX_VALUE) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain;
|
||||||
|
}
|
||||||
|
|
||||||
|
long getKey(int rankingId) {
|
||||||
|
return rankingId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean test(long value) {
|
||||||
|
int ranking = (int) (value >>> 32);
|
||||||
|
if (ranking == Integer.MAX_VALUE) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
|
||||||
|
}
|
||||||
|
public boolean test(EdgeSearchResultItem item) {
|
||||||
|
int ranking = item.getRanking();
|
||||||
|
if (ranking == Integer.MAX_VALUE) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
|
||||||
|
}
|
||||||
|
}
|
@ -1,8 +1,9 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query.types;
|
package nu.marginalia.wmsa.edge.index.svc.query.types;
|
||||||
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
|
|
||||||
public interface EntrySource {
|
public interface EntrySource {
|
||||||
SearchIndex getIndex();
|
SearchIndex getIndex();
|
||||||
int read(long[] buffer, int n);
|
int read(long[] buffer, int n);
|
||||||
|
|
||||||
}
|
}
|
@ -0,0 +1,39 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.StringJoiner;
|
||||||
|
|
||||||
|
class QueryFilterAnyOf implements QueryFilterStepIf {
|
||||||
|
private final List<? extends QueryFilterStepIf> steps;
|
||||||
|
|
||||||
|
QueryFilterAnyOf(List<? extends QueryFilterStepIf> steps) {
|
||||||
|
this.steps = steps;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SearchIndex getIndex() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double cost() {
|
||||||
|
return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean test(long value) {
|
||||||
|
for (var step : steps) {
|
||||||
|
if (step.test(value))
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String describe() {
|
||||||
|
StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]");
|
||||||
|
for (var step : steps) {
|
||||||
|
sj.add(step.describe());
|
||||||
|
}
|
||||||
|
return sj.toString();
|
||||||
|
}
|
||||||
|
}
|
@ -1,13 +1,13 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query.types;
|
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
|
||||||
|
|
||||||
import nu.marginalia.util.btree.CachingBTreeReader;
|
import nu.marginalia.util.btree.CachingBTreeReader;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
public record UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, CachingBTreeReader.Cache cache) implements QueryFilterStep {
|
public record QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, CachingBTreeReader.BTreeCachedIndex cache) implements QueryFilterStepIf {
|
||||||
|
|
||||||
public UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, IndexQueryCachePool pool) {
|
public QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, IndexQueryCachePool pool) {
|
||||||
this(source, range, pool.getIndexCache(source, range));
|
this(source, range, pool.getIndexCache(source, range));
|
||||||
}
|
}
|
||||||
|
|
@ -0,0 +1,33 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
|
|
||||||
|
class QueryFilterNoPass implements QueryFilterStepIf {
|
||||||
|
static final QueryFilterStepIf instance = new QueryFilterNoPass();
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean test(long value) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public SearchIndex getIndex() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public double cost() {
|
||||||
|
return 0.;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int retainDestructive(long[] items, int max) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int retainReorder(long[] items, int start, int max) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String describe() {
|
||||||
|
return "[NoPass]";
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,11 +1,11 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query.types;
|
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
|
||||||
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
import java.util.function.LongPredicate;
|
import java.util.function.LongPredicate;
|
||||||
|
|
||||||
public class QueryFilterStepFromPredicate implements QueryFilterStep {
|
public class QueryFilterStepFromPredicate implements QueryFilterStepIf {
|
||||||
private final LongPredicate pred;
|
private final LongPredicate pred;
|
||||||
|
|
||||||
public QueryFilterStepFromPredicate(LongPredicate pred) {
|
public QueryFilterStepFromPredicate(LongPredicate pred) {
|
@ -0,0 +1,71 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public interface QueryFilterStepIf extends Comparable<QueryFilterStepIf> {
|
||||||
|
@Nullable
|
||||||
|
SearchIndex getIndex();
|
||||||
|
|
||||||
|
boolean test(long value);
|
||||||
|
|
||||||
|
double cost();
|
||||||
|
|
||||||
|
default int compareTo(QueryFilterStepIf other) {
|
||||||
|
return (int)(cost() - other.cost());
|
||||||
|
}
|
||||||
|
|
||||||
|
String describe();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move each value in items to the beginning of the array,
|
||||||
|
* and return the number of matching items.
|
||||||
|
*
|
||||||
|
* The remaining values are undefined.
|
||||||
|
*/
|
||||||
|
default int retainDestructive(long[] items, int max) {
|
||||||
|
int keep = 0;
|
||||||
|
for (int i = 0; i < max; i++) {
|
||||||
|
if (test(items[i])) {
|
||||||
|
if (i != keep) {
|
||||||
|
items[keep] = items[i];
|
||||||
|
}
|
||||||
|
keep++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return keep;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Move each value in items to the beginning of the array,
|
||||||
|
* and return the number of matching items. The values that do
|
||||||
|
* not pass the test are moved to the end of the array.
|
||||||
|
*/
|
||||||
|
default int retainReorder(long[] items, int start, int max) {
|
||||||
|
int keep = 0;
|
||||||
|
for (int i = start; i < max; i++) {
|
||||||
|
if (test(items[i])) {
|
||||||
|
if (i != keep) {
|
||||||
|
long tmp = items[keep];
|
||||||
|
items[keep] = items[i];
|
||||||
|
items[i] = tmp;
|
||||||
|
}
|
||||||
|
keep++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return keep;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static QueryFilterStepIf noPass() {
|
||||||
|
return QueryFilterNoPass.instance;
|
||||||
|
}
|
||||||
|
static QueryFilterStepIf anyOf(List<? extends QueryFilterStepIf> steps) {
|
||||||
|
return new QueryFilterAnyOf(steps);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@ -12,14 +12,14 @@ import java.util.List;
|
|||||||
@AllArgsConstructor @ToString @Getter
|
@AllArgsConstructor @ToString @Getter
|
||||||
public class EdgeSearchResultItem {
|
public class EdgeSearchResultItem {
|
||||||
public final int bucketId;
|
public final int bucketId;
|
||||||
public final long combinedId; // this isn't the external domain ID, but a ranking
|
public final long combinedId;
|
||||||
|
|
||||||
public final List<EdgeSearchResultKeywordScore> scores;
|
public final List<EdgeSearchResultKeywordScore> scores;
|
||||||
|
|
||||||
public EdgeSearchResultItem(int bucketId, long val) {
|
public EdgeSearchResultItem(int bucketId, long val) {
|
||||||
this.bucketId = bucketId;
|
this.bucketId = bucketId;
|
||||||
|
this.combinedId = val;
|
||||||
combinedId = val;
|
this.scores = new ArrayList<>(16);
|
||||||
scores = new ArrayList<>(16);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgeId<EdgeUrl> getUrlId() {
|
public EdgeId<EdgeUrl> getUrlId() {
|
||||||
@ -33,6 +33,7 @@ public class EdgeSearchResultItem {
|
|||||||
return (int)(combinedId >>> 32);
|
return (int)(combinedId >>> 32);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Used for evaluation */
|
||||||
private transient double scoreValue = 1;
|
private transient double scoreValue = 1;
|
||||||
public void setScore(double score) {
|
public void setScore(double score) {
|
||||||
scoreValue = score;
|
scoreValue = score;
|
||||||
|
@ -4,10 +4,10 @@ import lombok.AllArgsConstructor;
|
|||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import lombok.ToString;
|
import lombok.ToString;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.id.EdgeIdArray;
|
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
|
||||||
|
|
||||||
@AllArgsConstructor @Getter @ToString
|
@AllArgsConstructor @Getter @ToString
|
||||||
public class EdgeDomainSearchResults {
|
public class EdgeDomainSearchResults {
|
||||||
public final String keyword;
|
public final String keyword;
|
||||||
public final EdgeIdArray<EdgeUrl> results;
|
public final EdgeIdList<EdgeUrl> results;
|
||||||
}
|
}
|
||||||
|
@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.search;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import io.prometheus.client.Summary;
|
|
||||||
import io.reactivex.rxjava3.core.Observable;
|
import io.reactivex.rxjava3.core.Observable;
|
||||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||||
import nu.marginalia.wmsa.configuration.server.Context;
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
@ -18,7 +17,6 @@ import nu.marginalia.wmsa.edge.model.id.EdgeIdSet;
|
|||||||
import nu.marginalia.wmsa.edge.model.search.*;
|
import nu.marginalia.wmsa.edge.model.search.*;
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
|
|
||||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
|
||||||
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
|
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
|
||||||
import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery;
|
import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery;
|
||||||
@ -50,8 +48,6 @@ public class EdgeSearchOperator {
|
|||||||
private final SearchResultDecorator resultDecorator;
|
private final SearchResultDecorator resultDecorator;
|
||||||
private final Comparator<EdgeUrlDetails> resultListComparator;
|
private final Comparator<EdgeUrlDetails> resultListComparator;
|
||||||
|
|
||||||
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
|
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public EdgeSearchOperator(AssistantClient assistantClient,
|
public EdgeSearchOperator(AssistantClient assistantClient,
|
||||||
EncyclopediaClient encyclopediaClient,
|
EncyclopediaClient encyclopediaClient,
|
||||||
@ -81,9 +77,7 @@ public class EdgeSearchOperator {
|
|||||||
|
|
||||||
logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
||||||
|
|
||||||
DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery);
|
return performQuery(ctx, processedQuery);
|
||||||
|
|
||||||
return queryResults.resultSet;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future<String> eval) {
|
public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future<String> eval) {
|
||||||
@ -91,23 +85,25 @@ public class EdgeSearchOperator {
|
|||||||
Observable<WikiArticles> definitions = getWikiArticle(ctx, params.humanQuery());
|
Observable<WikiArticles> definitions = getWikiArticle(ctx, params.humanQuery());
|
||||||
|
|
||||||
EdgeSearchQuery processedQuery = queryFactory.createQuery(params);
|
EdgeSearchQuery processedQuery = queryFactory.createQuery(params);
|
||||||
|
|
||||||
logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
|
||||||
|
|
||||||
DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery);
|
List<EdgeUrlDetails> queryResults = performQuery(ctx, processedQuery);
|
||||||
|
|
||||||
String evalResult = getEvalResult(eval);
|
String evalResult = getEvalResult(eval);
|
||||||
|
|
||||||
|
|
||||||
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
|
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
|
||||||
|
WikiArticles wikiArticles = definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst();
|
||||||
|
|
||||||
return new DecoratedSearchResults(params,
|
return DecoratedSearchResults.builder()
|
||||||
getProblems(ctx, evalResult, queryResults, processedQuery),
|
.params(params)
|
||||||
evalResult,
|
.problems(getProblems(ctx, evalResult, queryResults, processedQuery))
|
||||||
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
|
.evalResult(evalResult)
|
||||||
queryResults.resultSet,
|
.wiki(wikiArticles)
|
||||||
domainResults,
|
.results(queryResults)
|
||||||
processedQuery.domain,
|
.domainResults(domainResults)
|
||||||
getDomainId(processedQuery.domain));
|
.focusDomain(processedQuery.domain)
|
||||||
|
.focusDomainId(getDomainId(processedQuery.domain))
|
||||||
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
|
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
|
||||||
@ -169,7 +165,7 @@ public class EdgeSearchOperator {
|
|||||||
return domainId;
|
return domainId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public DecoratedSearchResultSet performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
|
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
|
||||||
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
List<EdgeSearchSubquery> sqs = new ArrayList<>();
|
||||||
|
|
||||||
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
|
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
|
||||||
@ -179,11 +175,13 @@ public class EdgeSearchOperator {
|
|||||||
return performQuery(ctx, new EdgeSearchQuery(specs));
|
return performQuery(ctx, new EdgeSearchQuery(specs));
|
||||||
}
|
}
|
||||||
|
|
||||||
private DecoratedSearchResultSet performQuery(Context ctx, EdgeSearchQuery processedQuery) {
|
private List<EdgeUrlDetails> performQuery(Context ctx, EdgeSearchQuery processedQuery) {
|
||||||
|
|
||||||
List<EdgeUrlDetails> resultList = new ArrayList<>(processedQuery.specs.limitTotal);
|
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
|
||||||
|
|
||||||
for (var details : wmsa_search_index_api_time.time(()->fetchResultsSimple(ctx, processedQuery))) {
|
final List<EdgeUrlDetails> resultList = new ArrayList<>(results.size());
|
||||||
|
|
||||||
|
for (var details : resultDecorator.getAllUrlDetails(results)) {
|
||||||
if (details.getUrlQuality() <= -100) {
|
if (details.getUrlQuality() <= -100) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -208,10 +206,10 @@ public class EdgeSearchOperator {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return new DecoratedSearchResultSet(retList);
|
return retList;
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> getProblems(Context ctx, String evalResult, DecoratedSearchResultSet queryResults, EdgeSearchQuery processedQuery) {
|
private List<String> getProblems(Context ctx, String evalResult, List<EdgeUrlDetails> queryResults, EdgeSearchQuery processedQuery) {
|
||||||
final List<String> problems = new ArrayList<>(processedQuery.problems);
|
final List<String> problems = new ArrayList<>(processedQuery.problems);
|
||||||
boolean siteSearch = processedQuery.domain != null;
|
boolean siteSearch = processedQuery.domain != null;
|
||||||
|
|
||||||
@ -305,15 +303,6 @@ public class EdgeSearchOperator {
|
|||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Set<EdgeUrlDetails> fetchResultsSimple(Context ctx, EdgeSearchQuery processedQuery) {
|
|
||||||
EdgeSearchResultSet resultSet = indexClient.query(ctx, processedQuery.specs);
|
|
||||||
|
|
||||||
var results = resultSet.getResults();
|
|
||||||
Set<EdgeUrlDetails> ret = new HashSet<>(resultDecorator.getAllUrlDetails(results));
|
|
||||||
|
|
||||||
return ret;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Iterable<String> spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) {
|
private Iterable<String> spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) {
|
||||||
return Observable.fromIterable(disjointedQuery.searchTermsHuman)
|
return Observable.fromIterable(disjointedQuery.searchTermsHuman)
|
||||||
.subscribeOn(Schedulers.io())
|
.subscribeOn(Schedulers.io())
|
||||||
|
@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchJsParameter;
|
|||||||
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
||||||
import nu.marginalia.wmsa.edge.search.exceptions.RedirectException;
|
import nu.marginalia.wmsa.edge.search.exceptions.RedirectException;
|
||||||
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
|
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
|
||||||
|
import nu.marginalia.wmsa.edge.search.svc.EdgeSearchErrorPageService;
|
||||||
import nu.marginalia.wmsa.resource_store.StaticResources;
|
import nu.marginalia.wmsa.resource_store.StaticResources;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@ -40,6 +41,7 @@ public class EdgeSearchService extends Service {
|
|||||||
private final WebsiteUrl websiteUrl;
|
private final WebsiteUrl websiteUrl;
|
||||||
private StaticResources staticResources;
|
private StaticResources staticResources;
|
||||||
|
|
||||||
|
private final EdgeSearchErrorPageService errorPageService;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
|
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@ -53,7 +55,8 @@ public class EdgeSearchService extends Service {
|
|||||||
CommandEvaluator searchCommandEvaulator,
|
CommandEvaluator searchCommandEvaulator,
|
||||||
WebsiteUrl websiteUrl,
|
WebsiteUrl websiteUrl,
|
||||||
StaticResources staticResources,
|
StaticResources staticResources,
|
||||||
IndexCommand indexCommand) {
|
IndexCommand indexCommand,
|
||||||
|
EdgeSearchErrorPageService errorPageService) {
|
||||||
super(ip, port, initialization, metricsServer);
|
super(ip, port, initialization, metricsServer);
|
||||||
this.indexClient = indexClient;
|
this.indexClient = indexClient;
|
||||||
|
|
||||||
@ -61,6 +64,7 @@ public class EdgeSearchService extends Service {
|
|||||||
this.searchCommandEvaulator = searchCommandEvaulator;
|
this.searchCommandEvaulator = searchCommandEvaulator;
|
||||||
this.websiteUrl = websiteUrl;
|
this.websiteUrl = websiteUrl;
|
||||||
this.staticResources = staticResources;
|
this.staticResources = staticResources;
|
||||||
|
this.errorPageService = errorPageService;
|
||||||
|
|
||||||
Spark.staticFiles.expireTime(600);
|
Spark.staticFiles.expireTime(600);
|
||||||
|
|
||||||
@ -79,7 +83,7 @@ public class EdgeSearchService extends Service {
|
|||||||
|
|
||||||
Spark.exception(Exception.class, (e,p,q) -> {
|
Spark.exception(Exception.class, (e,p,q) -> {
|
||||||
logger.error("Error during processing", e);
|
logger.error("Error during processing", e);
|
||||||
serveError(Context.fromRequest(p), q);
|
errorPageService.serveError(Context.fromRequest(p), q);
|
||||||
});
|
});
|
||||||
|
|
||||||
Spark.awaitInitialization();
|
Spark.awaitInitialization();
|
||||||
@ -104,26 +108,6 @@ public class EdgeSearchService extends Service {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void serveError(Context ctx, Response rsp) {
|
|
||||||
boolean isIndexUp = indexClient.isAlive();
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (!isIndexUp) {
|
|
||||||
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">offline</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
|
|
||||||
} else if (indexClient.isBlocked(ctx).blockingFirst()) {
|
|
||||||
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">starting up</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"></head><body><article><h1>Error</h1><p>Oops! An unknown error occurred. The index server seems to be up, so I don't know why this is. Please send an email to kontakt@marginalia.nu telling me what you did :-) </p></body></html>");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.error("Error", ex);
|
|
||||||
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">unresponsive</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private Object apiSearch(Request request, Response response) {
|
private Object apiSearch(Request request, Response response) {
|
||||||
|
|
||||||
@ -180,7 +164,7 @@ public class EdgeSearchService extends Service {
|
|||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
logger.error("Error", ex);
|
logger.error("Error", ex);
|
||||||
serveError(ctx, response);
|
errorPageService.serveError(ctx, response);
|
||||||
}
|
}
|
||||||
|
|
||||||
return "";
|
return "";
|
||||||
|
@ -5,11 +5,11 @@ import nu.marginalia.wmsa.configuration.server.Context;
|
|||||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||||
import nu.marginalia.wmsa.edge.search.EdgeSearchOperator;
|
import nu.marginalia.wmsa.edge.search.EdgeSearchOperator;
|
||||||
import nu.marginalia.wmsa.edge.search.EdgeSearchProfile;
|
import nu.marginalia.wmsa.edge.search.EdgeSearchProfile;
|
||||||
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
|
||||||
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
|
||||||
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
|
|
||||||
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
|
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
|
||||||
import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService;
|
import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||||
@ -19,10 +19,7 @@ import org.slf4j.LoggerFactory;
|
|||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Collections;
|
import java.util.*;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
@ -59,7 +56,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
|
|||||||
var results = siteInfo(ctx, query);
|
var results = siteInfo(ctx, query);
|
||||||
var domain = results.getDomain();
|
var domain = results.getDomain();
|
||||||
|
|
||||||
DecoratedSearchResultSet resultSet;
|
List<EdgeUrlDetails> resultSet;
|
||||||
Path screenshotPath = null;
|
Path screenshotPath = null;
|
||||||
if (null != domain) {
|
if (null != domain) {
|
||||||
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);
|
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);
|
||||||
@ -67,10 +64,10 @@ public class SiteSearchCommand implements SearchCommandInterface {
|
|||||||
screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
|
screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
resultSet = new DecoratedSearchResultSet(Collections.emptyList());
|
resultSet = Collections.emptyList();
|
||||||
}
|
}
|
||||||
|
|
||||||
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet.resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
|
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,22 +0,0 @@
|
|||||||
package nu.marginalia.wmsa.edge.search.model;
|
|
||||||
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.ToString;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Objects;
|
|
||||||
|
|
||||||
@ToString @Getter
|
|
||||||
public class DecoratedSearchResultSet {
|
|
||||||
public final List<EdgeUrlDetails> resultSet;
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return resultSet.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public DecoratedSearchResultSet(List<EdgeUrlDetails> resultSet) {
|
|
||||||
this.resultSet = Objects.requireNonNull(resultSet);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.search.model;
|
package nu.marginalia.wmsa.edge.search.model;
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
|
import lombok.Builder;
|
||||||
import lombok.Getter;
|
import lombok.Getter;
|
||||||
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
|
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||||
@ -8,7 +9,7 @@ import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
|
|||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
@AllArgsConstructor @Getter
|
@AllArgsConstructor @Getter @Builder
|
||||||
public class DecoratedSearchResults {
|
public class DecoratedSearchResults {
|
||||||
private final EdgeUserSearchParameters params;
|
private final EdgeUserSearchParameters params;
|
||||||
private final List<String> problems;
|
private final List<String> problems;
|
||||||
|
@ -13,7 +13,6 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class SearchResultDecorator {
|
public class SearchResultDecorator {
|
||||||
@ -67,8 +66,7 @@ public class SearchResultDecorator {
|
|||||||
if (!missedIds.isEmpty()) {
|
if (!missedIds.isEmpty()) {
|
||||||
logger.debug("Could not look up documents: {}", missedIds.toArray());
|
logger.debug("Could not look up documents: {}", missedIds.toArray());
|
||||||
}
|
}
|
||||||
retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)
|
|
||||||
.thenComparing(url -> url.url.path.length()));
|
|
||||||
return retList;
|
return retList;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,125 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.search.svc;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
|
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import spark.Response;
|
||||||
|
|
||||||
|
public class EdgeSearchErrorPageService {
|
||||||
|
private final EdgeIndexClient indexClient;
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public EdgeSearchErrorPageService(EdgeIndexClient indexClient) {
|
||||||
|
this.indexClient = indexClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void serveError(Context ctx, Response rsp) {
|
||||||
|
boolean isIndexUp = indexClient.isAlive();
|
||||||
|
|
||||||
|
try {
|
||||||
|
if (!isIndexUp) {
|
||||||
|
rsp.body(renderError("The index is down",
|
||||||
|
"""
|
||||||
|
The search index server appears to be down.
|
||||||
|
<p>
|
||||||
|
The server was possibly restarted to bring online some changes.
|
||||||
|
Restarting the index typically takes a few minutes, during which
|
||||||
|
searches can't be served.
|
||||||
|
"""));
|
||||||
|
} else if (indexClient.isBlocked(ctx).blockingFirst()) {
|
||||||
|
rsp.body(renderError("The index is starting up",
|
||||||
|
"""
|
||||||
|
The search index server appears to be in the process of starting up.
|
||||||
|
This typically takes a few minutes. Be patient.
|
||||||
|
"""));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
rsp.body(renderError("Error processing request",
|
||||||
|
"""
|
||||||
|
The search index appears to be up and running, so the problem may be related
|
||||||
|
to some wider general error, or pertain to an error handling your query.
|
||||||
|
"""));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
rsp.body(renderError("Error processing error",
|
||||||
|
"""
|
||||||
|
An error has occurred, additionally, an error occurred while handling that error
|
||||||
|
<p>
|
||||||
|
<a href="https://www.youtube.com/watch?v=dsx2vdn7gpY">https://www.youtube.com/watch?v=dsx2vdn7gpY</a>.
|
||||||
|
|
||||||
|
"""));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String renderError(String title, String message) {
|
||||||
|
return """
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<title>Error</title>
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<link rel="stylesheet" href="https://search.marginalia.nu/style-new.css">
|
||||||
|
<header>
|
||||||
|
<nav>
|
||||||
|
<a href="https://www.marginalia.nu/">Marginalia</a>
|
||||||
|
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
|
||||||
|
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Support</a>
|
||||||
|
</nav>
|
||||||
|
</header>
|
||||||
|
<article>
|
||||||
|
<form method="get" action="/search">
|
||||||
|
<section class="search-box">
|
||||||
|
<h1>Search the Internet</h1>
|
||||||
|
<div class="input">
|
||||||
|
<input id="query" name="query" placeholder="Search terms" value="" autocomplete="off">
|
||||||
|
<input value="Go" type="submit">
|
||||||
|
</div>
|
||||||
|
<div class="settings">
|
||||||
|
<select name="profile" id="profile">
|
||||||
|
<option value="default">Popular Sites</option>
|
||||||
|
<option value="modern">Blogs and Personal Websites</option>
|
||||||
|
<option value="academia">Academia, Forums, Big Websites</option>
|
||||||
|
<option value="yolo">Default Ranking Algorithm</option>
|
||||||
|
<option value="food">Recipes 🍳</option>
|
||||||
|
<option value="corpo">Experimental</option>
|
||||||
|
</select>
|
||||||
|
<select name="js" id="js">
|
||||||
|
<option value="default">Allow JS</option>
|
||||||
|
<option value="no-js">Deny JS</option>
|
||||||
|
<option value="yes-js">Require JS</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
<div class="extra">
|
||||||
|
<a href="https://search.marginalia.nu/explore/random">Random Websites</a>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
</form>
|
||||||
|
<div class="cards big">
|
||||||
|
<div class="card problems">
|
||||||
|
<h2>
|
||||||
|
"""
|
||||||
|
+ title +
|
||||||
|
"""
|
||||||
|
</h2>
|
||||||
|
<div class="info">
|
||||||
|
"""
|
||||||
|
+message+
|
||||||
|
"""
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="card">
|
||||||
|
<h2>More Info</h2>
|
||||||
|
<div class="info">
|
||||||
|
You may be able to find more information here:
|
||||||
|
<ul>
|
||||||
|
<li><a href="https://status.marginalia.nu/">Maintenance Messages</a></li>
|
||||||
|
<li><a href="https://twitter.com/MarginaliaNu">Twitter Account</a></li>
|
||||||
|
<li>Email Me: <tt>kontakt@marginalia.nu</tt></li>
|
||||||
|
</ul>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
""";
|
||||||
|
}
|
||||||
|
}
|
@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test;
|
|||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
class MicroCacheTest {
|
class MicroBTreeCachedIndexTest {
|
||||||
MicroCache mc;
|
MicroCache mc;
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
@ -1,13 +1,15 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.reader.query.types;
|
package nu.marginalia.wmsa.edge.index.reader.query.types;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
class QueryFilterStepTest {
|
class QueryFilterStepIfTest {
|
||||||
QueryFilterStep even = new QueryFilterStepFromPredicate(l -> (l%2) == 0);
|
QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l%2) == 0);
|
||||||
QueryFilterStep divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0);
|
QueryFilterStepIf divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0);
|
||||||
QueryFilterStep either = QueryFilterStep.anyOf(List.of(even, divisibleByThree));
|
QueryFilterStepIf either = QueryFilterStepIf.anyOf(List.of(even, divisibleByThree));
|
||||||
@Test
|
@Test
|
||||||
public void test() {
|
public void test() {
|
||||||
long[] values = new long[100];
|
long[] values = new long[100];
|
Loading…
Reference in New Issue
Block a user