From eaef93f4ae9728a0da150d7a42dd468c5e2833b9 Mon Sep 17 00:00:00 2001 From: vlofgren Date: Sun, 11 Sep 2022 11:31:22 +0200 Subject: [PATCH] Cleaning up and adding better error messages. --- .../util/btree/CachingBTreeReader.java | 12 +- .../wmsa/configuration/WmsaHome.java | 4 + .../wmsa/edge/index/EdgeIndexBucket.java | 6 +- .../wmsa/edge/index/EdgeIndexService.java | 402 +----------------- .../edge/index/client/EdgeIndexClient.java | 22 +- .../index/model/EdgeIndexSearchTerms.java | 4 + .../reader/ResultDomainDeduplicator.java | 52 --- .../wmsa/edge/index/reader/SearchIndex.java | 35 +- .../edge/index/reader/SearchIndexReader.java | 3 +- .../wmsa/edge/index/reader/SearchIndexes.java | 2 +- .../wmsa/edge/index/reader/query/Query.java | 26 -- .../reader/query/types/QueryFilterStep.java | 125 ------ .../index/svc/EdgeIndexLexiconService.java | 107 +++++ .../edge/index/svc/EdgeIndexOpsService.java | 44 ++ .../edge/index/svc/EdgeIndexQueryService.java | 320 ++++++++++++++ .../{reader => svc}/query/IndexQuery.java | 14 +- .../query}/IndexQueryCachePool.java | 18 +- .../query/IndexQueryFactory.java | 21 +- .../edge/index/svc/query/IndexQueryIf.java | 26 ++ .../query/IndexSearchBudget.java | 2 +- .../svc/query/ResultDomainDeduplicator.java | 45 ++ .../query/types/EntrySource.java | 3 +- .../query/types/filter/QueryFilterAnyOf.java | 39 ++ .../types/filter/QueryFilterBTreeRange.java} | 8 +- .../query/types/filter/QueryFilterNoPass.java | 33 ++ .../filter}/QueryFilterStepFromPredicate.java | 4 +- .../query/types/filter/QueryFilterStepIf.java | 71 ++++ .../model/search/EdgeSearchResultItem.java | 9 +- .../domain/EdgeDomainSearchResults.java | 4 +- .../wmsa/edge/search/EdgeSearchOperator.java | 55 +-- .../wmsa/edge/search/EdgeSearchService.java | 30 +- .../command/commands/SiteSearchCommand.java | 13 +- .../model/DecoratedSearchResultSet.java | 22 - .../search/model/DecoratedSearchResults.java | 3 +- .../search/results/SearchResultDecorator.java | 4 +- .../svc/EdgeSearchErrorPageService.java | 125 ++++++ ...st.java => MicroBTreeCachedIndexTest.java} | 2 +- ...epTest.java => QueryFilterStepIfTest.java} | 10 +- 38 files changed, 961 insertions(+), 764 deletions(-) delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/ResultDomainDeduplicator.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStep.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => svc}/query/IndexQuery.java (82%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => svc/query}/IndexQueryCachePool.java (70%) rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => svc}/query/IndexQueryFactory.java (77%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => svc}/query/IndexSearchBudget.java (88%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader => svc}/query/types/EntrySource.java (72%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader/query/types/UrlRangeSubFilter.java => svc/query/types/filter/QueryFilterBTreeRange.java} (59%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java rename marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/{reader/query/types => svc/query/types/filter}/QueryFilterStepFromPredicate.java (90%) create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java delete mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResultSet.java create mode 100644 marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java rename marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/{MicroCacheTest.java => MicroBTreeCachedIndexTest.java} (97%) rename marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/{QueryFilterStepTest.java => QueryFilterStepIfTest.java} (54%) diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/btree/CachingBTreeReader.java b/marginalia_nu/src/main/java/nu/marginalia/util/btree/CachingBTreeReader.java index cc859c54..feea6bde 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/btree/CachingBTreeReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/btree/CachingBTreeReader.java @@ -25,14 +25,14 @@ public class CachingBTreeReader { return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2)); } - public Cache prepareCache(BTreeHeader header) { - return new Cache(header); + public BTreeCachedIndex prepareCache(BTreeHeader header) { + return new BTreeCachedIndex(header); } /** * * @return file offset of entry matching keyRaw, negative if absent */ - public long findEntry(Cache cache, final long keyRaw) { + public long findEntry(BTreeCachedIndex cache, final long keyRaw) { BTreeHeader header = cache.header; final int blockSize = ctx.BLOCK_SIZE_WORDS(); @@ -62,7 +62,7 @@ public class CachingBTreeReader { return dataSearcher.binarySearch(key, searchStart, numEntries); } - private long searchIndex(BTreeHeader header, Cache cache, long key) { + private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) { final int blockSize = ctx.BLOCK_SIZE_WORDS(); long layerOffset = 0; @@ -83,13 +83,13 @@ public class CachingBTreeReader { * for repeated queries against the same tree. The memory consumption is typically very low * and the disk access pattern for reading the entire index relatively cheap. */ - public class Cache { + public class BTreeCachedIndex { long[] indexData; final BTreeHeader header; final int indexedDataSize; - public Cache(BTreeHeader header) { + public BTreeCachedIndex(BTreeHeader header) { this.header = header; indexedDataSize = header.numEntries(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java index 877ca129..bbf7ccbc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/WmsaHome.java @@ -95,4 +95,8 @@ public class WmsaHome { home.resolve("model/opennlp-tok.bin")); } + private static final boolean debugMode = Boolean.getBoolean("wmsa-debug"); + public static boolean isDebug() { + return debugMode; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java index b736b614..cde869d7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexBucket.java @@ -3,10 +3,10 @@ package nu.marginalia.wmsa.edge.index; import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool; import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader; -import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 70812ab0..9774fd97 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -3,68 +3,29 @@ package nu.marginalia.wmsa.edge.index; import com.google.gson.Gson; import com.google.inject.Inject; import com.google.inject.name.Named; -import com.google.protobuf.InvalidProtocolBufferException; -import gnu.trove.set.hash.TIntHashSet; -import io.prometheus.client.Counter; -import io.prometheus.client.Histogram; import io.reactivex.rxjava3.schedulers.Schedulers; -import nu.marginalia.util.ListChunker; -import nu.marginalia.util.dict.DictionaryHashMap; import nu.marginalia.wmsa.client.GsonFactory; import nu.marginalia.wmsa.configuration.server.Initialization; import nu.marginalia.wmsa.configuration.server.MetricsServer; import nu.marginalia.wmsa.configuration.server.Service; -import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; -import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; -import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; -import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; -import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool; -import nu.marginalia.wmsa.edge.index.reader.ResultDomainDeduplicator; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery; -import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; -import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeId; -import nu.marginalia.wmsa.edge.model.id.EdgeIdArray; -import nu.marginalia.wmsa.edge.model.search.*; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; -import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; -import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; -import org.apache.http.HttpStatus; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService; +import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService; import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import spark.HaltException; import spark.Request; import spark.Response; import spark.Spark; -import java.util.*; import java.util.concurrent.TimeUnit; -import java.util.function.LongPredicate; import static spark.Spark.get; -import static spark.Spark.halt; public class EdgeIndexService extends Service { - private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000; - private static final int QUERY_FETCH_SIZE = 8192; - private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64; - - private final Logger logger = LoggerFactory.getLogger(getClass()); @NotNull private final Initialization init; private final SearchIndexes indexes; - private final KeywordLexicon keywordLexicon; - - private final Gson gson = GsonFactory.get(); - - private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register(); - private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); public static final int DYNAMIC_BUCKET_LENGTH = 7; @@ -75,71 +36,34 @@ public class EdgeIndexService extends Service { Initialization init, MetricsServer metricsServer, SearchIndexes indexes, - IndexServicesFactory servicesFactory) { + + EdgeIndexOpsService opsService, + EdgeIndexLexiconService lexiconService, + EdgeIndexQueryService indexQueryService) + { super(ip, port, init, metricsServer); + final Gson gson = GsonFactory.get(); + this.init = init; this.indexes = indexes; - this.keywordLexicon = servicesFactory.getKeywordLexicon(); - Spark.post("/words/", this::putWords); - Spark.post("/search/", this::search, gson::toJson); - Spark.post("/search-domain/", this::searchDomain, gson::toJson); + Spark.post("/words/", lexiconService::putWords); - Spark.post("/dictionary/*", this::getWordId, gson::toJson); + Spark.post("/search/", indexQueryService::search, gson::toJson); + Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson); - Spark.post("/ops/repartition", this::repartitionEndpoint); - Spark.post("/ops/preconvert", this::preconvertEndpoint); - Spark.post("/ops/reindex/:id", this::reindexEndpoint); + Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson); + + Spark.post("/ops/repartition", opsService::repartitionEndpoint); + Spark.post("/ops/preconvert", opsService::preconvertEndpoint); + Spark.post("/ops/reindex/:id", opsService::reindexEndpoint); get("/is-blocked", this::isBlocked, gson::toJson); Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS); } - private Object getWordId(Request request, Response response) { - final String word = request.splat()[0]; - - var dr = indexes.getDictionaryReader(); - if (null == dr) { - response.status(HttpStatus.SC_FAILED_DEPENDENCY); - return ""; - } - - final int wordId = dr.get(word); - - if (DictionaryHashMap.NO_VALUE == wordId) { - response.status(404); - return ""; - } - - return wordId; - } - - private Object repartitionEndpoint(Request request, Response response) { - - if (!indexes.repartition()) { - Spark.halt(503, "Operations busy"); - } - return "OK"; - } - - private Object preconvertEndpoint(Request request, Response response) { - if (!indexes.preconvert()) { - Spark.halt(503, "Operations busy"); - } - return "OK"; - } - - private Object reindexEndpoint(Request request, Response response) { - int id = Integer.parseInt(request.params("id")); - - if (!indexes.reindex(id)) { - Spark.halt(503, "Operations busy"); - } - return "OK"; - } - private Object isBlocked(Request request, Response response) { return indexes.isBusy() || !initialized; } @@ -156,296 +80,6 @@ public class EdgeIndexService extends Service { indexes.initialize(init); } - private Object putWords(Request request, Response response) throws InvalidProtocolBufferException { - var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes()); - - EdgeId domainId = new EdgeId<>(req.getDomain()); - EdgeId urlId = new EdgeId<>(req.getUrl()); - int idx = req.getIndex(); - - for (int ws = 0; ws < req.getWordSetCount(); ws++) { - putWords(domainId, urlId, req.getWordSet(ws), idx); - } - - response.status(HttpStatus.SC_ACCEPTED); - return ""; - } - - public void putWords(EdgeId domainId, EdgeId urlId, - IndexPutKeywordsReq.WordSet words, int idx - ) { - SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - - IndexBlock block = IndexBlock.values()[words.getIndex()]; - - for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) { - - var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); - var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); - - indexWriter.put(header, entry); - }; - } - - private long[] getOrInsertWordIds(List words) { - long[] ids = new long[words.size()]; - int putIdx = 0; - - for (String word : words) { - long id = keywordLexicon.getOrInsert(word); - if (id != DictionaryHashMap.NO_VALUE) { - ids[putIdx++] = id; - } - } - - if (putIdx != words.size()) { - ids = Arrays.copyOf(ids, putIdx); - } - return ids; - } - - private Object searchDomain(Request request, Response response) { - if (indexes.getDictionaryReader() == null) { - logger.warn("Dictionary reader not yet initialized"); - halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); - } - - String json = request.body(); - EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); - - final int wordId = keywordLexicon.getReadOnly(specsSet.keyword); - - EdgeIdArray urlIds = EdgeIdArray.gather(indexes - .getBucket(specsSet.bucket) - .findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults) - .mapToInt(lv -> (int)(lv & 0xFFFF_FFFFL))); - - return new EdgeDomainSearchResults(specsSet.keyword, urlIds); - } - - private Object search(Request request, Response response) { - if (indexes.getDictionaryReader() == null) { - logger.warn("Dictionary reader not yet initialized"); - halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); - } - - String json = request.body(); - EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); - - long start = System.currentTimeMillis(); - try { - return new EdgeSearchResultSet(new SearchQuery(specsSet).execute()); - } - catch (HaltException ex) { - logger.warn("Halt", ex); - throw ex; - } - catch (Exception ex) { - logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); - logger.info("Error", ex); - Spark.halt(500, "Error"); - return null; - } - finally { - wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start); - } - } - - - private class SearchQuery { - private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f); - private final EdgeSearchSpecification specsSet; - private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS); - private final IndexQueryCachePool cachePool = new IndexQueryCachePool(); - - public SearchQuery(EdgeSearchSpecification specsSet) { - this.specsSet = specsSet; - } - - private List execute() { - final Set results = new HashSet<>(QUERY_FETCH_SIZE); - - for (var sq : specsSet.subqueries) { - Optional searchTerms = getSearchTerms(sq); - - if (searchTerms.isEmpty()) - continue; - - results.addAll(performSearch(searchTerms.get(), sq)); - } - - for (var result : results) { - addResultScores(result); - } - - if (!budget.hasTimeLeft()) { - wmsa_edge_index_query_timeouts.inc(); - } - - var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain); - -// cachePool.printSummary(logger); - cachePool.clear(); - - return results.stream() - .sorted(Comparator.comparing(EdgeSearchResultItem::getScore)) - .filter(domainCountFilter::test) - .limit(specsSet.getLimitTotal()).toList(); - } - - - private List performSearch(EdgeIndexSearchTerms searchTerms, - EdgeSearchSubquery sq) - { - - final List results = new ArrayList<>(QUERY_FETCH_SIZE); - final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT); - - final int remainingResults = QUERY_FETCH_SIZE; - - for (int indexBucket : specsSet.buckets) { - - if (!budget.hasTimeLeft()) { - logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude); - continue; - } - - if (remainingResults <= results.size()) - break; - - var query = getQuery(cachePool, indexBucket, sq.block, lv -> localFilter.filterRawValue(indexBucket, lv), searchTerms); - long[] buf = new long[8192]; - - while (query.hasMore() && results.size() < remainingResults && budget.hasTimeLeft()) { - int cnt = query.getMoreResults(buf, budget); - - for (int i = 0; i < cnt && results.size() < remainingResults; i++) { - long id = buf[i]; - - final EdgeSearchResultItem ri = new EdgeSearchResultItem(indexBucket, id); - - if (!seenResults.add(ri.getUrlId().id()) || !localFilter.test(ri)) { - continue; - } - - results.add(ri); - } - } - - } - - return results; - } - - private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block, - LongPredicate filter, EdgeIndexSearchTerms searchTerms) { - - if (!indexes.isValidBucket(bucket)) { - logger.warn("Invalid bucket {}", bucket); - return new IndexQuery(Collections.emptyList()); - } - - return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms); - } - - private void addResultScores(EdgeSearchResultItem searchResult) { - final var reader = Objects.requireNonNull(indexes.getDictionaryReader()); - - List> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); - - // Memoize calls to getTermData, as they're redundant and cause disk reads - Map termMetadata = new HashMap<>(32); - - double bestScore = 0; - - for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) { - double setScore = 0; - int setSize = 0; - for (var searchTerm : searchTermVariants.get(searchTermListIdx)) { - - final int termId = reader.get(searchTerm); - - ResultTermData data = termMetadata.computeIfAbsent( - new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData); - - var score = data.asScore(searchTermListIdx, searchTerm); - searchResult.scores.add(score); - setScore += score.value(); - setSize++; - } - bestScore = Math.min(bestScore, setScore/setSize); - } - - searchResult.setScore(bestScore); - } - - private ResultTermData getTermData(ResultTerm resultTerm) { - final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket); - final int termId = resultTerm.termId; - final long combinedUrlId = resultTerm.combinedUrlId; - - return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId), - - bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId), - bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId) - ); - } - - record ResultTerm (int bucket, int termId, long combinedUrlId) {} - record ResultTermData (IndexBlock index, - boolean title, - boolean link, - boolean site, - boolean subject, - boolean name, - boolean high, - boolean mid, - boolean low - ) { - public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) { - return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low); - } - } - } - - - private Optional getSearchTerms(EdgeSearchSubquery request) { - final List excludes = new ArrayList<>(); - final List includes = new ArrayList<>(); - - for (var include : request.searchTermsInclude) { - var word = lookUpWord(include); - if (word.isEmpty()) { - logger.debug("Unknown search term: " + include); - return Optional.empty(); - } - includes.add(word.getAsInt()); - } - - for (var exclude : request.searchTermsExclude) { - lookUpWord(exclude).ifPresent(excludes::add); - } - - if (includes.isEmpty()) { - return Optional.empty(); - } - - return Optional.of(new EdgeIndexSearchTerms(includes, excludes)); - } - - private OptionalInt lookUpWord(String s) { - int ret = indexes.getDictionaryReader().get(s); - if (ret == DictionaryHashMap.NO_VALUE) { - return OptionalInt.empty(); - } - return OptionalInt.of(ret); - } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java index ed67bc95..7b2ed5c6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.index.client; import com.google.inject.Singleton; +import io.prometheus.client.Summary; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.wmsa.client.AbstractDynamicClient; @@ -10,6 +11,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; @@ -23,6 +25,8 @@ import java.util.concurrent.TimeUnit; @Singleton public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient { + private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register(); + public EdgeIndexClient() { super(ServiceDescriptor.EDGE_INDEX); setTimeout(30); @@ -52,20 +56,10 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW @CheckReturnValue - public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) { - return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst(); - } - - @CheckReturnValue - public List multiQuery(Context ctx, EdgeSearchSpecification... specs) { - - return Observable.fromArray(specs) - .concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class) - .subscribeOn(Schedulers.io()) - .timeout(1, TimeUnit.SECONDS) - .onErrorComplete()) - .toList() - .blockingGet(); + public List query(Context ctx, EdgeSearchSpecification specs) { + return wmsa_search_index_api_time.time( + () -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults() + ); } @CheckReturnValue diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java index 6d4119e1..9c78a2d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgeIndexSearchTerms.java @@ -9,4 +9,8 @@ import java.util.List; public class EdgeIndexSearchTerms { public List includes = new ArrayList<>(); public List excludes = new ArrayList<>(); + + public boolean isEmpty() { + return includes.isEmpty(); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/ResultDomainDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/ResultDomainDeduplicator.java deleted file mode 100644 index 8b9002bf..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/ResultDomainDeduplicator.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader; - -import gnu.trove.map.TLongIntMap; -import gnu.trove.map.hash.TLongIntHashMap; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; - -import java.util.List; - -public class ResultDomainDeduplicator { - final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0); - final int limitByDomain; - - public ResultDomainDeduplicator(int limitByDomain) { - this.limitByDomain = limitByDomain; - } - - public boolean filterRawValue(int bucket, long value) { - int domain = (int) (value >>> 32); - - if (domain == Integer.MAX_VALUE) { - return true; - } - - return resultsByRankingId.get(getKey(bucket, domain)) <= limitByDomain; - } - - long getKey(int bucketId, int rankingId) { - return ((long) bucketId) << 32 | rankingId; - } - - long getKey(EdgeSearchResultItem item) { - return ((long) item.bucketId) << 32 | item.getRanking(); - } - - public boolean test(EdgeSearchResultItem item) { - if (item.getRanking() == Integer.MAX_VALUE) { - return true; - } - - return resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1) <= limitByDomain; - } - - public void addAll(List items) { - for (var item : items) { - resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1); - } - } - - public void add(EdgeSearchResultItem item) { - resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1); - } -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java index 643d76d8..0c5a8ab0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndex.java @@ -9,8 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader; import nu.marginalia.util.btree.model.BTreeHeader; import nu.marginalia.util.multimap.MultimapFileLong; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter; -import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; +import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -77,25 +78,25 @@ public class SearchIndex implements AutoCloseable { return rangeForWord(pool, wordId).numEntries(); } - public UrlIndexTree rangeForWord(IndexQueryCachePool pool, int wordId) { - UrlIndexTree range = pool.getRange(words, wordId); + public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) { + IndexBTreeRange range = pool.getRange(words, wordId); if (range == null) { - range = new UrlIndexTree(words.positionForWord(wordId)); + range = new IndexBTreeRange(words.positionForWord(wordId)); pool.cacheRange(words, wordId, range); } return range; } - public UrlIndexTree rangeForWord(int wordId) { - return new UrlIndexTree(words.positionForWord(wordId)); + public IndexBTreeRange rangeForWord(int wordId) { + return new IndexBTreeRange(words.positionForWord(wordId)); } - public class UrlIndexTree { - final long dataOffset; + public class IndexBTreeRange { + public final long dataOffset; private BTreeHeader header; - public UrlIndexTree(long dataOffset) { + public IndexBTreeRange(long dataOffset) { this.dataOffset = dataOffset; } @@ -126,7 +127,7 @@ public class SearchIndex implements AutoCloseable { return new AsEntrySource(); } - public QueryFilterStep asExcludeFilterStep(IndexQueryCachePool pool) { + public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) { return new AsExcludeQueryFilterStep(pool); } @@ -150,7 +151,7 @@ public class SearchIndex implements AutoCloseable { } } - public boolean hasUrl(CachingBTreeReader.Cache cache, long url) { + public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) { if (dataOffset < 0) return false; return cachingBTreeReader.findEntry(cache, url) >= 0; @@ -160,12 +161,12 @@ public class SearchIndex implements AutoCloseable { if (dataOffset < 0) return false; - CachingBTreeReader.Cache cache = pool.getIndexCache(SearchIndex.this, this); + CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this); return cachingBTreeReader.findEntry(cache, url) >= 0; } - public CachingBTreeReader.Cache createIndexCache() { + public CachingBTreeReader.BTreeCachedIndex createIndexCache() { if (dataOffset < 0) return null; @@ -213,11 +214,11 @@ public class SearchIndex implements AutoCloseable { } } - class AsExcludeQueryFilterStep implements QueryFilterStep { - private final CachingBTreeReader.Cache cache; + class AsExcludeQueryFilterStep implements QueryFilterStepIf { + private final CachingBTreeReader.BTreeCachedIndex cache; public AsExcludeQueryFilterStep(IndexQueryCachePool pool) { - cache = pool.getIndexCache(SearchIndex.this, UrlIndexTree.this); + cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this); } public SearchIndex getIndex() { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 68732223..5a796cce 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.wmsa.edge.index.model.IndexBlock; -import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java index 4d26c99d..828714c7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexes.java @@ -105,7 +105,7 @@ public class SearchIndexes { } @Nullable - public KeywordLexiconReadOnlyView getDictionaryReader() { + public KeywordLexiconReadOnlyView getLexiconReader() { return keywordLexiconReadOnlyView; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java deleted file mode 100644 index 149be164..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/Query.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader.query; - -import java.util.stream.LongStream; - -public interface Query { - Query EMPTY = new Query() { - @Override - public Query also(int wordId) { return this; } - - @Override - public Query alsoCached(int wordId) { return this; } - - @Override - public Query not(int wordId) { return this; } - - @Override - public LongStream stream() { return LongStream.empty(); } - }; - - Query also(int wordId); - Query alsoCached(int wordId); - - Query not(int wordId); - - LongStream stream(); -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStep.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStep.java deleted file mode 100644 index 95cace9e..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStep.java +++ /dev/null @@ -1,125 +0,0 @@ -package nu.marginalia.wmsa.edge.index.reader.query.types; - -import nu.marginalia.wmsa.edge.index.reader.SearchIndex; - -import javax.annotation.Nullable; -import java.util.List; -import java.util.StringJoiner; - -public interface QueryFilterStep extends Comparable { - @Nullable - SearchIndex getIndex(); - - boolean test(long value); - - double cost(); - - default int compareTo(QueryFilterStep other) { - return (int)(cost() - other.cost()); - } - - String describe(); - - /** - * Move each value in items to the beginning of the array, - * and return the number of matching items. - * - * The remaining values are undefined. - */ - default int retainDestructive(long[] items, int max) { - int keep = 0; - for (int i = 0; i < max; i++) { - if (test(items[i])) { - if (i != keep) { - items[keep] = items[i]; - } - keep++; - } - } - return keep; - } - - /** - * Move each value in items to the beginning of the array, - * and return the number of matching items. The values that do - * not pass the test are moved to the end of the array. - */ - default int retainReorder(long[] items, int start, int max) { - int keep = 0; - for (int i = start; i < max; i++) { - if (test(items[i])) { - if (i != keep) { - long tmp = items[keep]; - items[keep] = items[i]; - items[i] = tmp; - } - keep++; - } - } - return keep; - } - - - static QueryFilterStep noPass() { - return NoPassFilter.instance; - } - static QueryFilterStep anyOf(List steps) { - return new AnyOfFilter(steps); - } - - -} - -class AnyOfFilter implements QueryFilterStep { - private final List steps; - - AnyOfFilter(List steps) { - this.steps = steps; - } - - public SearchIndex getIndex() { return null; } - - public double cost() { - return steps.stream().mapToDouble(QueryFilterStep::cost).average().orElse(0.); - } - - @Override - public boolean test(long value) { - for (var step : steps) { - if (step.test(value)) - return true; - } - return false; - } - - public String describe() { - StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]"); - for (var step : steps) { - sj.add(step.describe()); - } - return sj.toString(); - } -} - -class NoPassFilter implements QueryFilterStep { - static final QueryFilterStep instance = new NoPassFilter(); - - @Override - public boolean test(long value) { - return false; - } - public SearchIndex getIndex() { return null; } - public double cost() { return 0.; } - - public int retainDestructive(long[] items, int max) { - return 0; - } - public int retainReorder(long[] items, int start, int max) { - return 0; - } - - public String describe() { - return "[NoPass]"; - } - -} \ No newline at end of file diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java new file mode 100644 index 00000000..6cabbd9d --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexLexiconService.java @@ -0,0 +1,107 @@ +package nu.marginalia.wmsa.edge.index.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.google.protobuf.InvalidProtocolBufferException; +import nu.marginalia.util.ListChunker; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.wmsa.edge.index.IndexServicesFactory; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.id.EdgeId; +import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; +import org.apache.http.HttpStatus; +import spark.Request; +import spark.Response; + +import java.util.Arrays; +import java.util.List; + +@Singleton +public class EdgeIndexLexiconService { + + private final SearchIndexes indexes; + private final KeywordLexicon keywordLexicon; + + @Inject + public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) { + this.indexes = indexes; + this.keywordLexicon = servicesFactory.getKeywordLexicon(); + } + + public Object getWordId(Request request, Response response) { + final String word = request.splat()[0]; + + var lr = indexes.getLexiconReader(); + if (null == lr) { + response.status(HttpStatus.SC_FAILED_DEPENDENCY); + return ""; + } + + final int wordId = lr.get(word); + + if (DictionaryHashMap.NO_VALUE == wordId) { + response.status(404); + return ""; + } + + return wordId; + } + + + public Object putWords(Request request, Response response) throws InvalidProtocolBufferException { + var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes()); + + EdgeId domainId = new EdgeId<>(req.getDomain()); + EdgeId urlId = new EdgeId<>(req.getUrl()); + int idx = req.getIndex(); + + for (int ws = 0; ws < req.getWordSetCount(); ws++) { + putWords(domainId, urlId, req.getWordSet(ws), idx); + } + + response.status(HttpStatus.SC_ACCEPTED); + return ""; + } + + public void putWords(EdgeId domainId, EdgeId urlId, + IndexPutKeywordsReq.WordSet words, int idx + ) { + SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); + + IndexBlock block = IndexBlock.values()[words.getIndex()]; + + for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) { + + var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); + var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); + + indexWriter.put(header, entry); + }; + } + + private long[] getOrInsertWordIds(List words) { + long[] ids = new long[words.size()]; + int putIdx = 0; + + for (String word : words) { + long id = keywordLexicon.getOrInsert(word); + if (id != DictionaryHashMap.NO_VALUE) { + ids[putIdx++] = id; + } + } + + if (putIdx != words.size()) { + ids = Arrays.copyOf(ids, putIdx); + } + return ids; + } + + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java new file mode 100644 index 00000000..668890cb --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexOpsService.java @@ -0,0 +1,44 @@ +package nu.marginalia.wmsa.edge.index.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import spark.Request; +import spark.Response; +import spark.Spark; + +@Singleton +public class EdgeIndexOpsService { + + private final SearchIndexes indexes; + + @Inject + public EdgeIndexOpsService(SearchIndexes indexes) { + this.indexes = indexes; + } + + public Object repartitionEndpoint(Request request, Response response) { + + if (!indexes.repartition()) { + Spark.halt(503, "Operations busy"); + } + return "OK"; + } + + public Object preconvertEndpoint(Request request, Response response) { + if (!indexes.preconvert()) { + Spark.halt(503, "Operations busy"); + } + return "OK"; + } + + public Object reindexEndpoint(Request request, Response response) { + int id = Integer.parseInt(request.params("id")); + + if (!indexes.reindex(id)) { + Spark.halt(503, "Operations busy"); + } + return "OK"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java new file mode 100644 index 00000000..a410159f --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/EdgeIndexQueryService.java @@ -0,0 +1,320 @@ +package nu.marginalia.wmsa.edge.index.svc; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import gnu.trove.set.hash.TIntHashSet; +import io.prometheus.client.Counter; +import io.prometheus.client.Histogram; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.wmsa.client.GsonFactory; +import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.index.EdgeIndexBucket; +import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; +import nu.marginalia.wmsa.edge.index.model.IndexBlock; +import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; +import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget; +import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator; +import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.id.EdgeIdList; +import nu.marginalia.wmsa.edge.model.search.*; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; +import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import org.apache.http.HttpStatus; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.HaltException; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.util.*; +import java.util.function.LongPredicate; + +import static spark.Spark.halt; + +@Singleton +public class EdgeIndexQueryService { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000; + private static final int QUERY_FETCH_SIZE = 8192; + private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64; + + private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register(); + + private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register(); + private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register(); + + private final Gson gson = GsonFactory.get(); + + private final SearchIndexes indexes; + + @Inject + public EdgeIndexQueryService(SearchIndexes indexes) { + this.indexes = indexes; + } + + public Object searchDomain(Request request, Response response) { + if (indexes.getLexiconReader() == null) { + logger.warn("Dictionary reader not yet initialized"); + halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); + } + + String json = request.body(); + EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class); + + try { + return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet)); + } + catch (HaltException ex) { + logger.warn("Halt", ex); + throw ex; + } + catch (Exception ex) { + logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); + logger.info("Error", ex); + Spark.halt(500, "Error"); + return null; + } + } + + public Object search(Request request, Response response) { + if (indexes.getLexiconReader() == null) { + logger.warn("Dictionary reader not yet initialized"); + halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes"); + } + + String json = request.body(); + EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class); + + try { + return wmsa_edge_index_query_time.time(() -> query(specsSet)); + } + catch (HaltException ex) { + logger.warn("Halt", ex); + throw ex; + } + catch (Exception ex) { + logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json); + logger.info("Error", ex); + Spark.halt(500, "Error"); + return null; + } + } + + + public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) { + List results = new SearchQuery(specsSet).execute(); + return new EdgeSearchResultSet(results); + } + + public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) { + + final OptionalInt wordId = lookUpWord(specsSet.keyword); + EdgeIdList urlIds; + + if (wordId.isEmpty()) { + urlIds = new EdgeIdList<>(); + } else { + urlIds = indexes + .getBucket(specsSet.bucket) + .findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults) + .mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL)) + .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll); + } + + return new EdgeDomainSearchResults(specsSet.keyword, urlIds); + } + + private class SearchQuery { + private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f); + private final EdgeSearchSpecification specsSet; + private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS); + private final IndexQueryCachePool cachePool = new IndexQueryCachePool(); + + public SearchQuery(EdgeSearchSpecification specsSet) { + this.specsSet = specsSet; + } + + private List execute() { + final Set results = new HashSet<>(QUERY_FETCH_SIZE); + + for (var sq : specsSet.subqueries) { + results.addAll(performSearch(sq)); + } + + for (var result : results) { + addResultScores(result); + } + + if (!budget.hasTimeLeft()) { + wmsa_edge_index_query_timeouts.inc(); + } + + var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain); + + if (WmsaHome.isDebug()) { + cachePool.printSummary(logger); + } + cachePool.clear(); + + return results.stream() + .sorted(Comparator.comparing(EdgeSearchResultItem::getScore)) + .filter(domainCountFilter::test) + .limit(specsSet.getLimitTotal()).toList(); + } + + + private List performSearch(EdgeSearchSubquery sq) + { + + final List results = new ArrayList<>(QUERY_FETCH_SIZE); + final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq); + + if (searchTerms.isEmpty()) + return Collections.emptyList(); + + for (int indexBucket : specsSet.buckets) { + final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT); + + if (!budget.hasTimeLeft()) { + logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude); + continue; + } + + if (QUERY_FETCH_SIZE <= results.size()) + break; + + IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms); + long[] buf = new long[8192]; + + while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) { + int cnt = query.getMoreResults(buf, budget); + + for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) { + final long id = buf[i]; + + if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) { + continue; + } + + results.add(new EdgeSearchResultItem(indexBucket, id)); + } + } + + } + + return results; + } + + private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block, + LongPredicate filter, EdgeIndexSearchTerms searchTerms) { + + if (!indexes.isValidBucket(bucket)) { + logger.warn("Invalid bucket {}", bucket); + return new IndexQuery(Collections.emptyList()); + } + + return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms); + } + + private void addResultScores(EdgeSearchResultItem searchResult) { + final var reader = Objects.requireNonNull(indexes.getLexiconReader()); + + List> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); + + // Memoize calls to getTermData, as they're somewhat expensive and highly redundant + Map termMetadata = new HashMap<>(32); + + double bestScore = 0; + + for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) { + double setScore = 0; + int setSize = 0; + for (var searchTerm : searchTermVariants.get(searchTermListIdx)) { + + final int termId = reader.get(searchTerm); + + ResultTermData data = termMetadata.computeIfAbsent( + new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData); + + var score = data.asScore(searchTermListIdx, searchTerm); + searchResult.scores.add(score); + setScore += score.value(); + setSize++; + } + bestScore = Math.min(bestScore, setScore/setSize); + } + + searchResult.setScore(bestScore); + } + + private ResultTermData getTermData(ResultTerm resultTerm) { + final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket); + final int termId = resultTerm.termId; + final long combinedUrlId = resultTerm.combinedUrlId; + + return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId), + bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId) + ); + } + + record ResultTerm (int bucket, int termId, long combinedUrlId) {} + record ResultTermData (IndexBlock index, + boolean title, + boolean link, + boolean site, + boolean subject, + boolean name, + boolean high, + boolean mid, + boolean low + ) { + public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) { + return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low); + } + } + } + + + private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) { + final List excludes = new ArrayList<>(); + final List includes = new ArrayList<>(); + + for (var include : request.searchTermsInclude) { + var word = lookUpWord(include); + if (word.isEmpty()) { + logger.debug("Unknown search term: " + include); + return new EdgeIndexSearchTerms(includes, excludes); + } + includes.add(word.getAsInt()); + } + + for (var exclude : request.searchTermsExclude) { + lookUpWord(exclude).ifPresent(excludes::add); + } + + return new EdgeIndexSearchTerms(includes, excludes); + } + + + private OptionalInt lookUpWord(String s) { + int ret = indexes.getLexiconReader().get(s); + if (ret == DictionaryHashMap.NO_VALUE) { + return OptionalInt.empty(); + } + return OptionalInt.of(ret); + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQuery.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java similarity index 82% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQuery.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java index d51c71f2..fb34ace4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQuery.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQuery.java @@ -1,7 +1,7 @@ -package nu.marginalia.wmsa.edge.index.reader.query; +package nu.marginalia.wmsa.edge.index.svc.query; -import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep; +import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import java.util.ArrayList; import java.util.List; @@ -10,18 +10,18 @@ import static java.lang.Math.min; public class IndexQuery { private final List sources; - private final List inclusionFilter = new ArrayList<>(10); - private final List priorityFilter = new ArrayList<>(10); + private final List inclusionFilter = new ArrayList<>(10); + private final List priorityFilter = new ArrayList<>(10); public IndexQuery(List sources) { this.sources = sources; } - public void addInclusionFilter(QueryFilterStep filter) { + public void addInclusionFilter(QueryFilterStepIf filter) { inclusionFilter.add(filter); } - public void addPriorityFilter(QueryFilterStep filter) { + public void addPriorityFilter(QueryFilterStepIf filter) { priorityFilter.add(filter); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexQueryCachePool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java similarity index 70% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexQueryCachePool.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java index f22d8728..52925c3c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/IndexQueryCachePool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryCachePool.java @@ -1,17 +1,19 @@ -package nu.marginalia.wmsa.edge.index.reader; +package nu.marginalia.wmsa.edge.index.svc.query; import nu.marginalia.util.btree.CachingBTreeReader; +import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable; +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import org.slf4j.Logger; import java.util.HashMap; import java.util.Map; public class IndexQueryCachePool { - private final Map indexCaches = new HashMap<>(); - private final Map rangeCache = new HashMap<>(); + private final Map indexCaches = new HashMap<>(); + private final Map rangeCache = new HashMap<>(); private final Map savedCounts = new HashMap<>(); - public CachingBTreeReader.Cache getIndexCache(SearchIndex index, SearchIndex.UrlIndexTree range) { + public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) { var key = new PoolKey(index, range.dataOffset); var entry = indexCaches.get(key); @@ -33,10 +35,10 @@ public class IndexQueryCachePool { } public void printSummary(Logger logger) { - long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.Cache::sizeBytes).sum(); + long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum(); long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum(); - long loaded = indexCaches.values().stream().filter(CachingBTreeReader.Cache::isLoaded).count(); + long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count(); logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.); } @@ -45,11 +47,11 @@ public class IndexQueryCachePool { indexCaches.clear(); } - public SearchIndex.UrlIndexTree getRange(IndexWordsTable words, int wordId) { + public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) { return rangeCache.get(new RangeKey(words, wordId)); } - public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.UrlIndexTree range) { + public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) { rangeCache.put(new RangeKey(words, wordId), range); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java similarity index 77% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryFactory.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java index e203208e..5bd31122 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryFactory.java @@ -1,11 +1,10 @@ -package nu.marginalia.wmsa.edge.index.reader.query; +package nu.marginalia.wmsa.edge.index.svc.query; -import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; -import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource; -import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep; -import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStepFromPredicate; -import nu.marginalia.wmsa.edge.index.reader.query.types.UrlRangeSubFilter; +import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import java.util.*; import java.util.function.LongPredicate; @@ -57,21 +56,21 @@ public class IndexQueryFactory { } public IndexQueryBuilder also(int termId) { - List filters = new ArrayList<>(requiredIndices.size()); + List filters = new ArrayList<>(requiredIndices.size()); for (var ri : requiredIndices) { var range = ri.rangeForWord(cachePool, termId); if (range.isPresent()) { - filters.add(new UrlRangeSubFilter(ri, range, cachePool)); + filters.add(new QueryFilterBTreeRange(ri, range, cachePool)); } else { - filters.add(QueryFilterStep.noPass()); + filters.add(QueryFilterStepIf.noPass()); } } filters.sort(Comparator.naturalOrder()); - query.addInclusionFilter(QueryFilterStep.anyOf(filters)); + query.addInclusionFilter(QueryFilterStepIf.anyOf(filters)); return this; } @@ -92,7 +91,7 @@ public class IndexQueryFactory { for (var idx : priortyIndices) { var range = idx.rangeForWord(cachePool, termId); if (range.isPresent()) { - query.addPriorityFilter(new UrlRangeSubFilter(idx, range, cachePool)); + query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool)); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java new file mode 100644 index 00000000..82e951c4 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexQueryIf.java @@ -0,0 +1,26 @@ +package nu.marginalia.wmsa.edge.index.svc.query; + +import java.util.stream.LongStream; + +public interface IndexQueryIf { + IndexQueryIf EMPTY = new IndexQueryIf() { + @Override + public IndexQueryIf also(int wordId) { return this; } + + @Override + public IndexQueryIf alsoCached(int wordId) { return this; } + + @Override + public IndexQueryIf not(int wordId) { return this; } + + @Override + public LongStream stream() { return LongStream.empty(); } + }; + + IndexQueryIf also(int wordId); + IndexQueryIf alsoCached(int wordId); + + IndexQueryIf not(int wordId); + + LongStream stream(); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java similarity index 88% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java index 071dfec1..24d28594 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexSearchBudget.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/IndexSearchBudget.java @@ -1,4 +1,4 @@ -package nu.marginalia.wmsa.edge.index.reader.query; +package nu.marginalia.wmsa.edge.index.svc.query; public class IndexSearchBudget { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java new file mode 100644 index 00000000..24922eb7 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/ResultDomainDeduplicator.java @@ -0,0 +1,45 @@ +package nu.marginalia.wmsa.edge.index.svc.query; + +import gnu.trove.map.TLongIntMap; +import gnu.trove.map.hash.TLongIntHashMap; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem; + +public class ResultDomainDeduplicator { + final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0); + final int limitByDomain; + + public ResultDomainDeduplicator(int limitByDomain) { + this.limitByDomain = limitByDomain; + } + + public boolean filterRawValue(long value) { + int rankingId = (int) (value >>> 32); + + if (rankingId == Integer.MAX_VALUE) { + return true; + } + + return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain; + } + + long getKey(int rankingId) { + return rankingId; + } + + public boolean test(long value) { + int ranking = (int) (value >>> 32); + if (ranking == Integer.MAX_VALUE) { + return true; + } + + return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain; + } + public boolean test(EdgeSearchResultItem item) { + int ranking = item.getRanking(); + if (ranking == Integer.MAX_VALUE) { + return true; + } + + return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain; + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/EntrySource.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java similarity index 72% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/EntrySource.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java index 8595937f..b550a589 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/EntrySource.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/EntrySource.java @@ -1,8 +1,9 @@ -package nu.marginalia.wmsa.edge.index.reader.query.types; +package nu.marginalia.wmsa.edge.index.svc.query.types; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; public interface EntrySource { SearchIndex getIndex(); int read(long[] buffer, int n); + } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java new file mode 100644 index 00000000..52d8d1f2 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterAnyOf.java @@ -0,0 +1,39 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; + +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; + +import java.util.List; +import java.util.StringJoiner; + +class QueryFilterAnyOf implements QueryFilterStepIf { + private final List steps; + + QueryFilterAnyOf(List steps) { + this.steps = steps; + } + + public SearchIndex getIndex() { + return null; + } + + public double cost() { + return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.); + } + + @Override + public boolean test(long value) { + for (var step : steps) { + if (step.test(value)) + return true; + } + return false; + } + + public String describe() { + StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]"); + for (var step : steps) { + sj.add(step.describe()); + } + return sj.toString(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/UrlRangeSubFilter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java similarity index 59% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/UrlRangeSubFilter.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java index eaf4ebd6..6ce32620 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/UrlRangeSubFilter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterBTreeRange.java @@ -1,13 +1,13 @@ -package nu.marginalia.wmsa.edge.index.reader.query.types; +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; import nu.marginalia.util.btree.CachingBTreeReader; -import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; +import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool; import org.jetbrains.annotations.Nullable; -public record UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, CachingBTreeReader.Cache cache) implements QueryFilterStep { +public record QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, CachingBTreeReader.BTreeCachedIndex cache) implements QueryFilterStepIf { - public UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, IndexQueryCachePool pool) { + public QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, IndexQueryCachePool pool) { this(source, range, pool.getIndexCache(source, range)); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java new file mode 100644 index 00000000..3c2f6b07 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterNoPass.java @@ -0,0 +1,33 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; + +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; + +class QueryFilterNoPass implements QueryFilterStepIf { + static final QueryFilterStepIf instance = new QueryFilterNoPass(); + + @Override + public boolean test(long value) { + return false; + } + + public SearchIndex getIndex() { + return null; + } + + public double cost() { + return 0.; + } + + public int retainDestructive(long[] items, int max) { + return 0; + } + + public int retainReorder(long[] items, int start, int max) { + return 0; + } + + public String describe() { + return "[NoPass]"; + } + +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepFromPredicate.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java similarity index 90% rename from marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepFromPredicate.java rename to marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java index cf80371e..4fdb204e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepFromPredicate.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepFromPredicate.java @@ -1,11 +1,11 @@ -package nu.marginalia.wmsa.edge.index.reader.query.types; +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; import nu.marginalia.wmsa.edge.index.reader.SearchIndex; import org.jetbrains.annotations.Nullable; import java.util.function.LongPredicate; -public class QueryFilterStepFromPredicate implements QueryFilterStep { +public class QueryFilterStepFromPredicate implements QueryFilterStepIf { private final LongPredicate pred; public QueryFilterStepFromPredicate(LongPredicate pred) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java new file mode 100644 index 00000000..211c9e79 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/svc/query/types/filter/QueryFilterStepIf.java @@ -0,0 +1,71 @@ +package nu.marginalia.wmsa.edge.index.svc.query.types.filter; + +import nu.marginalia.wmsa.edge.index.reader.SearchIndex; + +import javax.annotation.Nullable; +import java.util.List; + +public interface QueryFilterStepIf extends Comparable { + @Nullable + SearchIndex getIndex(); + + boolean test(long value); + + double cost(); + + default int compareTo(QueryFilterStepIf other) { + return (int)(cost() - other.cost()); + } + + String describe(); + + /** + * Move each value in items to the beginning of the array, + * and return the number of matching items. + * + * The remaining values are undefined. + */ + default int retainDestructive(long[] items, int max) { + int keep = 0; + for (int i = 0; i < max; i++) { + if (test(items[i])) { + if (i != keep) { + items[keep] = items[i]; + } + keep++; + } + } + return keep; + } + + /** + * Move each value in items to the beginning of the array, + * and return the number of matching items. The values that do + * not pass the test are moved to the end of the array. + */ + default int retainReorder(long[] items, int start, int max) { + int keep = 0; + for (int i = start; i < max; i++) { + if (test(items[i])) { + if (i != keep) { + long tmp = items[keep]; + items[keep] = items[i]; + items[i] = tmp; + } + keep++; + } + } + return keep; + } + + + static QueryFilterStepIf noPass() { + return QueryFilterNoPass.instance; + } + static QueryFilterStepIf anyOf(List steps) { + return new QueryFilterAnyOf(steps); + } + + +} + diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java index 3112fe47..c81bdafc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/EdgeSearchResultItem.java @@ -12,14 +12,14 @@ import java.util.List; @AllArgsConstructor @ToString @Getter public class EdgeSearchResultItem { public final int bucketId; - public final long combinedId; // this isn't the external domain ID, but a ranking + public final long combinedId; + public final List scores; public EdgeSearchResultItem(int bucketId, long val) { this.bucketId = bucketId; - - combinedId = val; - scores = new ArrayList<>(16); + this.combinedId = val; + this.scores = new ArrayList<>(16); } public EdgeId getUrlId() { @@ -33,6 +33,7 @@ public class EdgeSearchResultItem { return (int)(combinedId >>> 32); } + /* Used for evaluation */ private transient double scoreValue = 1; public void setScore(double score) { scoreValue = score; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java index dfaf4e43..e9079bb7 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/search/domain/EdgeDomainSearchResults.java @@ -4,10 +4,10 @@ import lombok.AllArgsConstructor; import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.id.EdgeIdArray; +import nu.marginalia.wmsa.edge.model.id.EdgeIdList; @AllArgsConstructor @Getter @ToString public class EdgeDomainSearchResults { public final String keyword; - public final EdgeIdArray results; + public final EdgeIdList results; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java index 161f40bd..b1f86b73 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchOperator.java @@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.search; import com.google.inject.Inject; import com.google.inject.Singleton; -import io.prometheus.client.Summary; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.wmsa.configuration.server.Context; @@ -18,7 +17,6 @@ import nu.marginalia.wmsa.edge.model.id.EdgeIdSet; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; import nu.marginalia.wmsa.edge.search.model.BrowseResult; -import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults; import nu.marginalia.wmsa.edge.search.query.QueryFactory; import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery; @@ -50,8 +48,6 @@ public class EdgeSearchOperator { private final SearchResultDecorator resultDecorator; private final Comparator resultListComparator; - private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register(); - @Inject public EdgeSearchOperator(AssistantClient assistantClient, EncyclopediaClient encyclopediaClient, @@ -81,9 +77,7 @@ public class EdgeSearchOperator { logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ',')); - DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery); - - return queryResults.resultSet; + return performQuery(ctx, processedQuery); } public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future eval) { @@ -91,23 +85,25 @@ public class EdgeSearchOperator { Observable definitions = getWikiArticle(ctx, params.humanQuery()); EdgeSearchQuery processedQuery = queryFactory.createQuery(params); + logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ',')); - DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery); + List queryResults = performQuery(ctx, processedQuery); String evalResult = getEvalResult(eval); - - List domainResults = getDomainResults(ctx, processedQuery.specs); + WikiArticles wikiArticles = definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(); - return new DecoratedSearchResults(params, - getProblems(ctx, evalResult, queryResults, processedQuery), - evalResult, - definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(), - queryResults.resultSet, - domainResults, - processedQuery.domain, - getDomainId(processedQuery.domain)); + return DecoratedSearchResults.builder() + .params(params) + .problems(getProblems(ctx, evalResult, queryResults, processedQuery)) + .evalResult(evalResult) + .wiki(wikiArticles) + .results(queryResults) + .domainResults(domainResults) + .focusDomain(processedQuery.domain) + .focusDomainId(getDomainId(processedQuery.domain)) + .build(); } private List getDomainResults(Context ctx, EdgeSearchSpecification specs) { @@ -169,7 +165,7 @@ public class EdgeSearchOperator { return domainId; } - public DecoratedSearchResultSet performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) { + public List performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) { List sqs = new ArrayList<>(); sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block)); @@ -179,11 +175,13 @@ public class EdgeSearchOperator { return performQuery(ctx, new EdgeSearchQuery(specs)); } - private DecoratedSearchResultSet performQuery(Context ctx, EdgeSearchQuery processedQuery) { + private List performQuery(Context ctx, EdgeSearchQuery processedQuery) { - List resultList = new ArrayList<>(processedQuery.specs.limitTotal); + final List results = indexClient.query(ctx, processedQuery.specs); - for (var details : wmsa_search_index_api_time.time(()->fetchResultsSimple(ctx, processedQuery))) { + final List resultList = new ArrayList<>(results.size()); + + for (var details : resultDecorator.getAllUrlDetails(results)) { if (details.getUrlQuality() <= -100) { continue; } @@ -208,10 +206,10 @@ public class EdgeSearchOperator { } } - return new DecoratedSearchResultSet(retList); + return retList; } - private List getProblems(Context ctx, String evalResult, DecoratedSearchResultSet queryResults, EdgeSearchQuery processedQuery) { + private List getProblems(Context ctx, String evalResult, List queryResults, EdgeSearchQuery processedQuery) { final List problems = new ArrayList<>(processedQuery.problems); boolean siteSearch = processedQuery.domain != null; @@ -305,15 +303,6 @@ public class EdgeSearchOperator { ; } - private Set fetchResultsSimple(Context ctx, EdgeSearchQuery processedQuery) { - EdgeSearchResultSet resultSet = indexClient.query(ctx, processedQuery.specs); - - var results = resultSet.getResults(); - Set ret = new HashSet<>(resultDecorator.getAllUrlDetails(results)); - - return ret; - } - private Iterable spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) { return Observable.fromIterable(disjointedQuery.searchTermsHuman) .subscribeOn(Schedulers.io()) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java index 31a83866..6c28c0ef 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchService.java @@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchJsParameter; import nu.marginalia.wmsa.edge.search.command.SearchParameters; import nu.marginalia.wmsa.edge.search.exceptions.RedirectException; import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; +import nu.marginalia.wmsa.edge.search.svc.EdgeSearchErrorPageService; import nu.marginalia.wmsa.resource_store.StaticResources; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -40,6 +41,7 @@ public class EdgeSearchService extends Service { private final WebsiteUrl websiteUrl; private StaticResources staticResources; + private final EdgeSearchErrorPageService errorPageService; private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class); @SneakyThrows @@ -53,7 +55,8 @@ public class EdgeSearchService extends Service { CommandEvaluator searchCommandEvaulator, WebsiteUrl websiteUrl, StaticResources staticResources, - IndexCommand indexCommand) { + IndexCommand indexCommand, + EdgeSearchErrorPageService errorPageService) { super(ip, port, initialization, metricsServer); this.indexClient = indexClient; @@ -61,6 +64,7 @@ public class EdgeSearchService extends Service { this.searchCommandEvaulator = searchCommandEvaulator; this.websiteUrl = websiteUrl; this.staticResources = staticResources; + this.errorPageService = errorPageService; Spark.staticFiles.expireTime(600); @@ -79,7 +83,7 @@ public class EdgeSearchService extends Service { Spark.exception(Exception.class, (e,p,q) -> { logger.error("Error during processing", e); - serveError(Context.fromRequest(p), q); + errorPageService.serveError(Context.fromRequest(p), q); }); Spark.awaitInitialization(); @@ -104,26 +108,6 @@ public class EdgeSearchService extends Service { } - private void serveError(Context ctx, Response rsp) { - boolean isIndexUp = indexClient.isAlive(); - - try { - if (!isIndexUp) { - rsp.body("Error

Error

Oops! It appears the index server is offline.

The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served.

This page will attempt to refresh automatically every few seconds.

"); - } else if (indexClient.isBlocked(ctx).blockingFirst()) { - rsp.body("Error

Error

Oops! It appears the index server is starting up.

The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served.

This page will attempt to refresh automatically every few seconds.

"); - } - else { - rsp.body("Error

Error

Oops! An unknown error occurred. The index server seems to be up, so I don't know why this is. Please send an email to kontakt@marginalia.nu telling me what you did :-)

"); - } - } - catch (Exception ex) { - logger.error("Error", ex); - rsp.body("Error

Error

Oops! It appears the index server is unresponsive.

The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served.

This page will attempt to refresh automatically every few seconds.

"); - } - - } - @SneakyThrows private Object apiSearch(Request request, Response response) { @@ -180,7 +164,7 @@ public class EdgeSearchService extends Service { } catch (Exception ex) { logger.error("Error", ex); - serveError(ctx, response); + errorPageService.serveError(ctx, response); } return ""; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 22969872..56ddcc7f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -5,11 +5,11 @@ import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState; +import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; import nu.marginalia.wmsa.edge.search.EdgeSearchOperator; import nu.marginalia.wmsa.edge.search.EdgeSearchProfile; import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface; import nu.marginalia.wmsa.edge.search.command.SearchParameters; -import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet; import nu.marginalia.wmsa.edge.search.model.DomainInformation; import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService; import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer; @@ -19,10 +19,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; -import java.util.Collections; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; +import java.util.*; import java.util.function.Predicate; import java.util.regex.Pattern; @@ -59,7 +56,7 @@ public class SiteSearchCommand implements SearchCommandInterface { var results = siteInfo(ctx, query); var domain = results.getDomain(); - DecoratedSearchResultSet resultSet; + List resultSet; Path screenshotPath = null; if (null != domain) { resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain); @@ -67,10 +64,10 @@ public class SiteSearchCommand implements SearchCommandInterface { screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id()); } else { - resultSet = new DecoratedSearchResultSet(Collections.emptyList()); + resultSet = Collections.emptyList(); } - return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet.resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString()))); + return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString()))); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResultSet.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResultSet.java deleted file mode 100644 index 9df13783..00000000 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResultSet.java +++ /dev/null @@ -1,22 +0,0 @@ -package nu.marginalia.wmsa.edge.search.model; - -import lombok.Getter; -import lombok.ToString; -import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; - -import java.util.List; -import java.util.Objects; - -@ToString @Getter -public class DecoratedSearchResultSet { - public final List resultSet; - - public int size() { - return resultSet.size(); - } - - public DecoratedSearchResultSet(List resultSet) { - this.resultSet = Objects.requireNonNull(resultSet); - } - -} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java index fb9717af..3d4acda8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/model/DecoratedSearchResults.java @@ -1,6 +1,7 @@ package nu.marginalia.wmsa.edge.search.model; import lombok.AllArgsConstructor; +import lombok.Builder; import lombok.Getter; import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles; import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails; @@ -8,7 +9,7 @@ import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters; import java.util.List; -@AllArgsConstructor @Getter +@AllArgsConstructor @Getter @Builder public class DecoratedSearchResults { private final EdgeUserSearchParameters params; private final List problems; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java index 230f88cc..26520f8c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultDecorator.java @@ -13,7 +13,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; -import java.util.Comparator; import java.util.List; public class SearchResultDecorator { @@ -67,8 +66,7 @@ public class SearchResultDecorator { if (!missedIds.isEmpty()) { logger.debug("Could not look up documents: {}", missedIds.toArray()); } - retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore) - .thenComparing(url -> url.url.path.length())); + return retList; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java new file mode 100644 index 00000000..54e75178 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/svc/EdgeSearchErrorPageService.java @@ -0,0 +1,125 @@ +package nu.marginalia.wmsa.edge.search.svc; + +import com.google.inject.Inject; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Response; + +public class EdgeSearchErrorPageService { + private final EdgeIndexClient indexClient; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public EdgeSearchErrorPageService(EdgeIndexClient indexClient) { + this.indexClient = indexClient; + } + + public void serveError(Context ctx, Response rsp) { + boolean isIndexUp = indexClient.isAlive(); + + try { + if (!isIndexUp) { + rsp.body(renderError("The index is down", + """ + The search index server appears to be down. +

+ The server was possibly restarted to bring online some changes. + Restarting the index typically takes a few minutes, during which + searches can't be served. + """)); + } else if (indexClient.isBlocked(ctx).blockingFirst()) { + rsp.body(renderError("The index is starting up", + """ + The search index server appears to be in the process of starting up. + This typically takes a few minutes. Be patient. + """)); + } + else { + rsp.body(renderError("Error processing request", + """ + The search index appears to be up and running, so the problem may be related + to some wider general error, or pertain to an error handling your query. + """)); + } + } + catch (Exception ex) { + rsp.body(renderError("Error processing error", + """ + An error has occurred, additionally, an error occurred while handling that error +

+ https://www.youtube.com/watch?v=dsx2vdn7gpY. + + """)); + } + } + + private String renderError(String title, String message) { + return """ + + Error + + +

+ +
+
+
+ +
+
+
+

+ """ + + title + + """ +

+
+ """ + +message+ + """ +
+
+
+

More Info

+
+ You may be able to find more information here: + +
+
+ """; + } +} diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroCacheTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java similarity index 97% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroCacheTest.java rename to marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java index ff05f73b..0daeecbe 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroCacheTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/MicroBTreeCachedIndexTest.java @@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.assertEquals; -class MicroCacheTest { +class MicroBTreeCachedIndexTest { MicroCache mc; @BeforeEach diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java similarity index 54% rename from marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepTest.java rename to marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java index b890ea34..3dc2d57c 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/reader/query/types/QueryFilterStepIfTest.java @@ -1,13 +1,15 @@ package nu.marginalia.wmsa.edge.index.reader.query.types; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate; +import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf; import org.junit.jupiter.api.Test; import java.util.List; -class QueryFilterStepTest { - QueryFilterStep even = new QueryFilterStepFromPredicate(l -> (l%2) == 0); - QueryFilterStep divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0); - QueryFilterStep either = QueryFilterStep.anyOf(List.of(even, divisibleByThree)); +class QueryFilterStepIfTest { + QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l%2) == 0); + QueryFilterStepIf divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0); + QueryFilterStepIf either = QueryFilterStepIf.anyOf(List.of(even, divisibleByThree)); @Test public void test() { long[] values = new long[100];