Cleaning up and adding better error messages.

parent fbe17b62ed
commit eaef93f4ae
@@ -25,14 +25,14 @@ public class CachingBTreeReader {
         return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
     }

-    public Cache prepareCache(BTreeHeader header) {
-        return new Cache(header);
+    public BTreeCachedIndex prepareCache(BTreeHeader header) {
+        return new BTreeCachedIndex(header);
     }

     /**
      *
      * @return file offset of entry matching keyRaw, negative if absent
      */
-    public long findEntry(Cache cache, final long keyRaw) {
+    public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
         BTreeHeader header = cache.header;

         final int blockSize = ctx.BLOCK_SIZE_WORDS();
@@ -62,7 +62,7 @@ public class CachingBTreeReader {
         return dataSearcher.binarySearch(key, searchStart, numEntries);
     }

-    private long searchIndex(BTreeHeader header, Cache cache, long key) {
+    private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
         final int blockSize = ctx.BLOCK_SIZE_WORDS();
         long layerOffset = 0;

@@ -83,13 +83,13 @@ public class CachingBTreeReader {
      * for repeated queries against the same tree. The memory consumption is typically very low
      * and the disk access pattern for reading the entire index relatively cheap.
      */
-    public class Cache {
+    public class BTreeCachedIndex {
        long[] indexData;
        final BTreeHeader header;

        final int indexedDataSize;

-       public Cache(BTreeHeader header) {
+       public BTreeCachedIndex(BTreeHeader header) {
            this.header = header;
            indexedDataSize = header.numEntries();
        }
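The rename makes the call-site contract easier to see: prepare the in-memory index once, probe it repeatedly. A minimal usage sketch — `reader`, `header`, and `keyRaw` are assumed to come from the surrounding reader setup, which this hunk does not show:

```java
// Hypothetical call site for the renamed API.
CachingBTreeReader.BTreeCachedIndex cache = reader.prepareCache(header);

long offset = reader.findEntry(cache, keyRaw);
if (offset >= 0) {
    // hit: offset is the file offset of the matching entry
} else {
    // miss: per the javadoc above, negative means keyRaw is absent
}
```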
@@ -95,4 +95,8 @@ public class WmsaHome
                 home.resolve("model/opennlp-tok.bin"));
     }

+    private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
+    public static boolean isDebug() {
+        return debugMode;
+    }
 }
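`Boolean.getBoolean("wmsa-debug")` reads a JVM system property, not an environment variable: the flag is on only when the process is launched with `-Dwmsa-debug=true`. This commit uses it to gate cache diagnostics in the new query service:

```java
// Mirrors the guard added in EdgeIndexQueryService later in this diff.
if (WmsaHome.isDebug()) {
    cachePool.printSummary(logger);   // only logged when -Dwmsa-debug=true
}
```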
@@ -3,10 +3,10 @@ package nu.marginalia.wmsa.edge.index;
 import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
 import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -3,68 +3,29 @@ package nu.marginalia.wmsa.edge.index;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import com.google.protobuf.InvalidProtocolBufferException;
-import gnu.trove.set.hash.TIntHashSet;
-import io.prometheus.client.Counter;
-import io.prometheus.client.Histogram;
 import io.reactivex.rxjava3.schedulers.Schedulers;
-import nu.marginalia.util.ListChunker;
-import nu.marginalia.util.dict.DictionaryHashMap;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.configuration.server.MetricsServer;
 import nu.marginalia.wmsa.configuration.server.Service;
-import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
-import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
-import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
-import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
-import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
-import nu.marginalia.wmsa.edge.index.reader.ResultDomainDeduplicator;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.id.EdgeId;
-import nu.marginalia.wmsa.edge.model.id.EdgeIdArray;
-import nu.marginalia.wmsa.edge.model.search.*;
-import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
-import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
-import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
-import org.apache.http.HttpStatus;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import spark.HaltException;
 import spark.Request;
 import spark.Response;
 import spark.Spark;

 import java.util.*;
 import java.util.concurrent.TimeUnit;
-import java.util.function.LongPredicate;

 import static spark.Spark.get;
-import static spark.Spark.halt;

 public class EdgeIndexService extends Service {
-    private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
-    private static final int QUERY_FETCH_SIZE = 8192;
-    private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
-
     private final Logger logger = LoggerFactory.getLogger(getClass());

     @NotNull
     private final Initialization init;
     private final SearchIndexes indexes;
-    private final KeywordLexicon keywordLexicon;
-
-    private final Gson gson = GsonFactory.get();
-
-    private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
-    private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();

     public static final int DYNAMIC_BUCKET_LENGTH = 7;

@@ -75,71 +36,34 @@ public class EdgeIndexService extends Service {
                             Initialization init,
                             MetricsServer metricsServer,
                             SearchIndexes indexes,
-                            IndexServicesFactory servicesFactory) {
+                            EdgeIndexOpsService opsService,
+                            EdgeIndexLexiconService lexiconService,
+                            EdgeIndexQueryService indexQueryService)
+    {
         super(ip, port, init, metricsServer);

+        final Gson gson = GsonFactory.get();
+
         this.init = init;
         this.indexes = indexes;
-        this.keywordLexicon = servicesFactory.getKeywordLexicon();

-        Spark.post("/words/", this::putWords);
-        Spark.post("/search/", this::search, gson::toJson);
-        Spark.post("/search-domain/", this::searchDomain, gson::toJson);
+        Spark.post("/words/", lexiconService::putWords);

-        Spark.post("/dictionary/*", this::getWordId, gson::toJson);
+        Spark.post("/search/", indexQueryService::search, gson::toJson);
+        Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);

-        Spark.post("/ops/repartition", this::repartitionEndpoint);
-        Spark.post("/ops/preconvert", this::preconvertEndpoint);
-        Spark.post("/ops/reindex/:id", this::reindexEndpoint);
+        Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);

+        Spark.post("/ops/repartition", opsService::repartitionEndpoint);
+        Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
+        Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);

         get("/is-blocked", this::isBlocked, gson::toJson);

         Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
     }

-    private Object getWordId(Request request, Response response) {
-        final String word = request.splat()[0];
-
-        var dr = indexes.getDictionaryReader();
-        if (null == dr) {
-            response.status(HttpStatus.SC_FAILED_DEPENDENCY);
-            return "";
-        }
-
-        final int wordId = dr.get(word);
-
-        if (DictionaryHashMap.NO_VALUE == wordId) {
-            response.status(404);
-            return "";
-        }
-
-        return wordId;
-    }
-
-    private Object repartitionEndpoint(Request request, Response response) {
-
-        if (!indexes.repartition()) {
-            Spark.halt(503, "Operations busy");
-        }
-        return "OK";
-    }
-
-    private Object preconvertEndpoint(Request request, Response response) {
-        if (!indexes.preconvert()) {
-            Spark.halt(503, "Operations busy");
-        }
-        return "OK";
-    }
-
-    private Object reindexEndpoint(Request request, Response response) {
-        int id = Integer.parseInt(request.params("id"));
-
-        if (!indexes.reindex(id)) {
-            Spark.halt(503, "Operations busy");
-        }
-        return "OK";
-    }
-
     private Object isBlocked(Request request, Response response) {
         return indexes.isBusy() || !initialized;
     }
@@ -156,296 +80,6 @@ public class EdgeIndexService extends Service {
         indexes.initialize(init);
     }

-    private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
-        var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
-
-        EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
-        EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
-        int idx = req.getIndex();
-
-        for (int ws = 0; ws < req.getWordSetCount(); ws++) {
-            putWords(domainId, urlId, req.getWordSet(ws), idx);
-        }
-
-        response.status(HttpStatus.SC_ACCEPTED);
-        return "";
-    }
-
-    public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
-                         IndexPutKeywordsReq.WordSet words, int idx
-    ) {
-        SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
-
-        IndexBlock block = IndexBlock.values()[words.getIndex()];
-
-        for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
-
-            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
-            var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
-
-            indexWriter.put(header, entry);
-        };
-    }
-
-    private long[] getOrInsertWordIds(List<String> words) {
-        long[] ids = new long[words.size()];
-        int putIdx = 0;
-
-        for (String word : words) {
-            long id = keywordLexicon.getOrInsert(word);
-            if (id != DictionaryHashMap.NO_VALUE) {
-                ids[putIdx++] = id;
-            }
-        }
-
-        if (putIdx != words.size()) {
-            ids = Arrays.copyOf(ids, putIdx);
-        }
-        return ids;
-    }
-
-    private Object searchDomain(Request request, Response response) {
-        if (indexes.getDictionaryReader() == null) {
-            logger.warn("Dictionary reader not yet initialized");
-            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
-        }
-
-        String json = request.body();
-        EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
-
-        final int wordId = keywordLexicon.getReadOnly(specsSet.keyword);
-
-        EdgeIdArray<EdgeUrl> urlIds = EdgeIdArray.gather(indexes
-                .getBucket(specsSet.bucket)
-                .findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
-                .mapToInt(lv -> (int)(lv & 0xFFFF_FFFFL)));
-
-        return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
-    }
-
-    private Object search(Request request, Response response) {
-        if (indexes.getDictionaryReader() == null) {
-            logger.warn("Dictionary reader not yet initialized");
-            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
-        }
-
-        String json = request.body();
-        EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
-
-        long start = System.currentTimeMillis();
-        try {
-            return new EdgeSearchResultSet(new SearchQuery(specsSet).execute());
-        }
-        catch (HaltException ex) {
-            logger.warn("Halt", ex);
-            throw ex;
-        }
-        catch (Exception ex) {
-            logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
-            logger.info("Error", ex);
-            Spark.halt(500, "Error");
-            return null;
-        }
-        finally {
-            wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
-        }
-    }
-
-
-    private class SearchQuery {
-        private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
-        private final EdgeSearchSpecification specsSet;
-        private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
-        private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
-
-        public SearchQuery(EdgeSearchSpecification specsSet) {
-            this.specsSet = specsSet;
-        }
-
-        private List<EdgeSearchResultItem> execute() {
-            final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
-
-            for (var sq : specsSet.subqueries) {
-                Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
-
-                if (searchTerms.isEmpty())
-                    continue;
-
-                results.addAll(performSearch(searchTerms.get(), sq));
-            }
-
-            for (var result : results) {
-                addResultScores(result);
-            }
-
-            if (!budget.hasTimeLeft()) {
-                wmsa_edge_index_query_timeouts.inc();
-            }
-
-            var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
-
-            // cachePool.printSummary(logger);
-            cachePool.clear();
-
-            return results.stream()
-                    .sorted(Comparator.comparing(EdgeSearchResultItem::getScore))
-                    .filter(domainCountFilter::test)
-                    .limit(specsSet.getLimitTotal()).toList();
-        }
-
-
-        private List<EdgeSearchResultItem> performSearch(EdgeIndexSearchTerms searchTerms,
-                                                         EdgeSearchSubquery sq)
-        {
-
-            final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
-            final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
-
-            final int remainingResults = QUERY_FETCH_SIZE;
-
-            for (int indexBucket : specsSet.buckets) {
-
-                if (!budget.hasTimeLeft()) {
-                    logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
-                    continue;
-                }
-
-                if (remainingResults <= results.size())
-                    break;
-
-                var query = getQuery(cachePool, indexBucket, sq.block, lv -> localFilter.filterRawValue(indexBucket, lv), searchTerms);
-                long[] buf = new long[8192];
-
-                while (query.hasMore() && results.size() < remainingResults && budget.hasTimeLeft()) {
-                    int cnt = query.getMoreResults(buf, budget);
-
-                    for (int i = 0; i < cnt && results.size() < remainingResults; i++) {
-                        long id = buf[i];
-
-                        final EdgeSearchResultItem ri = new EdgeSearchResultItem(indexBucket, id);
-
-                        if (!seenResults.add(ri.getUrlId().id()) || !localFilter.test(ri)) {
-                            continue;
-                        }
-
-                        results.add(ri);
-                    }
-                }
-
-            }
-
-            return results;
-        }
-
-        private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
-                                    LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
-
-            if (!indexes.isValidBucket(bucket)) {
-                logger.warn("Invalid bucket {}", bucket);
-                return new IndexQuery(Collections.emptyList());
-            }
-
-            return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
-        }
-
-        private void addResultScores(EdgeSearchResultItem searchResult) {
-            final var reader = Objects.requireNonNull(indexes.getDictionaryReader());
-
-            List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
-
-            // Memoize calls to getTermData, as they're redundant and cause disk reads
-            Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
-
-            double bestScore = 0;
-
-            for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
-                double setScore = 0;
-                int setSize = 0;
-                for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
-
-                    final int termId = reader.get(searchTerm);
-
-                    ResultTermData data = termMetadata.computeIfAbsent(
-                            new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
-
-                    var score = data.asScore(searchTermListIdx, searchTerm);
-                    searchResult.scores.add(score);
-                    setScore += score.value();
-                    setSize++;
-                }
-                bestScore = Math.min(bestScore, setScore/setSize);
-            }
-
-            searchResult.setScore(bestScore);
-        }
-
-        private ResultTermData getTermData(ResultTerm resultTerm) {
-            final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
-            final int termId = resultTerm.termId;
-            final long combinedUrlId = resultTerm.combinedUrlId;
-
-            return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
-
-                    bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
-            );
-        }
-
-        record ResultTerm (int bucket, int termId, long combinedUrlId) {}
-        record ResultTermData (IndexBlock index,
-                               boolean title,
-                               boolean link,
-                               boolean site,
-                               boolean subject,
-                               boolean name,
-                               boolean high,
-                               boolean mid,
-                               boolean low
-        ) {
-            public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
-                return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
-            }
-        }
-    }
-
-
-    private Optional<EdgeIndexSearchTerms> getSearchTerms(EdgeSearchSubquery request) {
-        final List<Integer> excludes = new ArrayList<>();
-        final List<Integer> includes = new ArrayList<>();
-
-        for (var include : request.searchTermsInclude) {
-            var word = lookUpWord(include);
-            if (word.isEmpty()) {
-                logger.debug("Unknown search term: " + include);
-                return Optional.empty();
-            }
-            includes.add(word.getAsInt());
-        }
-
-        for (var exclude : request.searchTermsExclude) {
-            lookUpWord(exclude).ifPresent(excludes::add);
-        }
-
-        if (includes.isEmpty()) {
-            return Optional.empty();
-        }
-
-        return Optional.of(new EdgeIndexSearchTerms(includes, excludes));
-    }
-
-    private OptionalInt lookUpWord(String s) {
-        int ret = indexes.getDictionaryReader().get(s);
-        if (ret == DictionaryHashMap.NO_VALUE) {
-            return OptionalInt.empty();
-        }
-        return OptionalInt.of(ret);
-    }
-
 }
@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.index.client;

 import com.google.inject.Singleton;
+import io.prometheus.client.Summary;
 import io.reactivex.rxjava3.core.Observable;
 import io.reactivex.rxjava3.schedulers.Schedulers;
 import nu.marginalia.wmsa.client.AbstractDynamicClient;
@@ -10,6 +11,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
@@ -23,6 +25,8 @@ import java.util.concurrent.TimeUnit;
 @Singleton
 public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {

+    private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
+
     public EdgeIndexClient() {
         super(ServiceDescriptor.EDGE_INDEX);
         setTimeout(30);
@@ -52,20 +56,10 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW


     @CheckReturnValue
-    public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) {
-        return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst();
-    }
-
-    @CheckReturnValue
-    public List<EdgeSearchResultSet> multiQuery(Context ctx, EdgeSearchSpecification... specs) {
-
-        return Observable.fromArray(specs)
-                .concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class)
-                        .subscribeOn(Schedulers.io())
-                        .timeout(1, TimeUnit.SECONDS)
-                        .onErrorComplete())
-                .toList()
-                .blockingGet();
+    public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
+        return wmsa_search_index_api_time.time(
+                () -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
+        );
     }

     @CheckReturnValue
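Callers must adjust, since `query` now returns the item list directly and records the round trip in the new `wmsa_search_index_api_time` summary. A hypothetical before/after at a call site (`indexClient`, `ctx`, `specs` assumed):

```java
// before this commit:
// List<EdgeSearchResultItem> items = indexClient.query(ctx, specs).getResults();

// after this commit:
List<EdgeSearchResultItem> items = indexClient.query(ctx, specs);
```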
@@ -9,4 +9,8 @@ import java.util.List;
 public class EdgeIndexSearchTerms {
     public List<Integer> includes = new ArrayList<>();
     public List<Integer> excludes = new ArrayList<>();
+
+    public boolean isEmpty() {
+        return includes.isEmpty();
+    }
 }
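Note the semantics: a term set with excludes but no includes still counts as empty, since `isEmpty()` only consults the includes list. A small sketch, assuming the two-argument constructor used by `EdgeIndexQueryService` later in this diff:

```java
// Hypothetical: exclusion-only subqueries produce no results.
var terms = new EdgeIndexSearchTerms(List.of(), List.of(excludedWordId));
assert terms.isEmpty();   // true -- only includes are checked
```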
@@ -1,52 +0,0 @@
-package nu.marginalia.wmsa.edge.index.reader;
-
-import gnu.trove.map.TLongIntMap;
-import gnu.trove.map.hash.TLongIntHashMap;
-import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
-
-import java.util.List;
-
-public class ResultDomainDeduplicator {
-    final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
-    final int limitByDomain;
-
-    public ResultDomainDeduplicator(int limitByDomain) {
-        this.limitByDomain = limitByDomain;
-    }
-
-    public boolean filterRawValue(int bucket, long value) {
-        int domain = (int) (value >>> 32);
-
-        if (domain == Integer.MAX_VALUE) {
-            return true;
-        }
-
-        return resultsByRankingId.get(getKey(bucket, domain)) <= limitByDomain;
-    }
-
-    long getKey(int bucketId, int rankingId) {
-        return ((long) bucketId) << 32 | rankingId;
-    }
-
-    long getKey(EdgeSearchResultItem item) {
-        return ((long) item.bucketId) << 32 | item.getRanking();
-    }
-
-    public boolean test(EdgeSearchResultItem item) {
-        if (item.getRanking() == Integer.MAX_VALUE) {
-            return true;
-        }
-
-        return resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1) <= limitByDomain;
-    }
-
-    public void addAll(List<EdgeSearchResultItem> items) {
-        for (var item : items) {
-            resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1);
-        }
-    }
-
-    public void add(EdgeSearchResultItem item) {
-        resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1);
-    }
-}
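The deleted class keyed its counts on a (bucketId, rankingId) pair packed into one long; the replacement added at the end of this diff keys on the ranking id alone. A worked sketch of the old packing, for reference:

```java
// Old key layout: bucket id in the high 32 bits, ranking id in the low 32.
long key = ((long) bucketId) << 32 | rankingId;

int bucketAgain  = (int) (key >>> 32);          // recovers bucketId
int rankingAgain = (int) (key & 0xFFFF_FFFFL);  // recovers rankingId (non-negative ids)
```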
@@ -9,8 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
 import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
-import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -77,25 +78,25 @@ public class SearchIndex implements AutoCloseable {
         return rangeForWord(pool, wordId).numEntries();
     }

-    public UrlIndexTree rangeForWord(IndexQueryCachePool pool, int wordId) {
-        UrlIndexTree range = pool.getRange(words, wordId);
+    public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
+        IndexBTreeRange range = pool.getRange(words, wordId);

         if (range == null) {
-            range = new UrlIndexTree(words.positionForWord(wordId));
+            range = new IndexBTreeRange(words.positionForWord(wordId));
             pool.cacheRange(words, wordId, range);
         }

         return range;
     }

-    public UrlIndexTree rangeForWord(int wordId) {
-        return new UrlIndexTree(words.positionForWord(wordId));
+    public IndexBTreeRange rangeForWord(int wordId) {
+        return new IndexBTreeRange(words.positionForWord(wordId));
     }

-    public class UrlIndexTree {
-        final long dataOffset;
+    public class IndexBTreeRange {
+        public final long dataOffset;
         private BTreeHeader header;
-        public UrlIndexTree(long dataOffset) {
+        public IndexBTreeRange(long dataOffset) {
             this.dataOffset = dataOffset;
         }

@@ -126,7 +127,7 @@ public class SearchIndex implements AutoCloseable {
             return new AsEntrySource();
         }

-        public QueryFilterStep asExcludeFilterStep(IndexQueryCachePool pool) {
+        public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
             return new AsExcludeQueryFilterStep(pool);
         }

@@ -150,7 +151,7 @@ public class SearchIndex implements AutoCloseable {
             }
         }

-        public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
+        public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
            if (dataOffset < 0) return false;

            return cachingBTreeReader.findEntry(cache, url) >= 0;
@@ -160,12 +161,12 @@ public class SearchIndex implements AutoCloseable {
            if (dataOffset < 0)
                return false;

-           CachingBTreeReader.Cache cache = pool.getIndexCache(SearchIndex.this, this);
+           CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);

            return cachingBTreeReader.findEntry(cache, url) >= 0;
         }

-        public CachingBTreeReader.Cache createIndexCache() {
+        public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
            if (dataOffset < 0)
                return null;

@@ -213,11 +214,11 @@ public class SearchIndex implements AutoCloseable {
             }
         }

-        class AsExcludeQueryFilterStep implements QueryFilterStep {
-            private final CachingBTreeReader.Cache cache;
+        class AsExcludeQueryFilterStep implements QueryFilterStepIf {
+            private final CachingBTreeReader.BTreeCachedIndex cache;

             public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
-                cache = pool.getIndexCache(SearchIndex.this, UrlIndexTree.this);
+                cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
             }

             public SearchIndex getIndex() {
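One contract worth noting from the hunks above: `createIndexCache()` returns null for ranges with no backing data (`dataOffset < 0`), so non-pooled callers need a guard. A hedged sketch of such a call site (names assumed):

```java
// Hypothetical caller; range is a SearchIndex.IndexBTreeRange.
var cache = range.createIndexCache();
if (cache != null && range.hasUrl(cache, url)) {
    // url is present in this word's range
}
```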
@@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader;
 import com.google.inject.Inject;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -105,7 +105,7 @@ public class SearchIndexes {
     }

     @Nullable
-    public KeywordLexiconReadOnlyView getDictionaryReader() {
+    public KeywordLexiconReadOnlyView getLexiconReader() {
         return keywordLexiconReadOnlyView;
     }

@@ -1,26 +0,0 @@
-package nu.marginalia.wmsa.edge.index.reader.query;
-
-import java.util.stream.LongStream;
-
-public interface Query {
-    Query EMPTY = new Query() {
-        @Override
-        public Query also(int wordId) { return this; }
-
-        @Override
-        public Query alsoCached(int wordId) { return this; }
-
-        @Override
-        public Query not(int wordId) { return this; }
-
-        @Override
-        public LongStream stream() { return LongStream.empty(); }
-    };
-
-    Query also(int wordId);
-    Query alsoCached(int wordId);
-
-    Query not(int wordId);
-
-    LongStream stream();
-}
@@ -1,125 +0,0 @@
-package nu.marginalia.wmsa.edge.index.reader.query.types;
-
-import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
-
-import javax.annotation.Nullable;
-import java.util.List;
-import java.util.StringJoiner;
-
-public interface QueryFilterStep extends Comparable<QueryFilterStep> {
-    @Nullable
-    SearchIndex getIndex();
-
-    boolean test(long value);
-
-    double cost();
-
-    default int compareTo(QueryFilterStep other) {
-        return (int)(cost() - other.cost());
-    }
-
-    String describe();
-
-    /**
-     * Move each value in items to the beginning of the array,
-     * and return the number of matching items.
-     *
-     * The remaining values are undefined.
-     */
-    default int retainDestructive(long[] items, int max) {
-        int keep = 0;
-        for (int i = 0; i < max; i++) {
-            if (test(items[i])) {
-                if (i != keep) {
-                    items[keep] = items[i];
-                }
-                keep++;
-            }
-        }
-        return keep;
-    }
-
-    /**
-     * Move each value in items to the beginning of the array,
-     * and return the number of matching items. The values that do
-     * not pass the test are moved to the end of the array.
-     */
-    default int retainReorder(long[] items, int start, int max) {
-        int keep = 0;
-        for (int i = start; i < max; i++) {
-            if (test(items[i])) {
-                if (i != keep) {
-                    long tmp = items[keep];
-                    items[keep] = items[i];
-                    items[i] = tmp;
-                }
-                keep++;
-            }
-        }
-        return keep;
-    }
-
-
-    static QueryFilterStep noPass() {
-        return NoPassFilter.instance;
-    }
-    static QueryFilterStep anyOf(List<? extends QueryFilterStep> steps) {
-        return new AnyOfFilter(steps);
-    }
-
-
-}
-
-class AnyOfFilter implements QueryFilterStep {
-    private final List<? extends QueryFilterStep> steps;
-
-    AnyOfFilter(List<? extends QueryFilterStep> steps) {
-        this.steps = steps;
-    }
-
-    public SearchIndex getIndex() { return null; }
-
-    public double cost() {
-        return steps.stream().mapToDouble(QueryFilterStep::cost).average().orElse(0.);
-    }
-
-    @Override
-    public boolean test(long value) {
-        for (var step : steps) {
-            if (step.test(value))
-                return true;
-        }
-        return false;
-    }
-
-    public String describe() {
-        StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]");
-        for (var step : steps) {
-            sj.add(step.describe());
-        }
-        return sj.toString();
-    }
-}
-
-class NoPassFilter implements QueryFilterStep {
-    static final QueryFilterStep instance = new NoPassFilter();
-
-    @Override
-    public boolean test(long value) {
-        return false;
-    }
-    public SearchIndex getIndex() { return null; }
-    public double cost() { return 0.; }
-
-    public int retainDestructive(long[] items, int max) {
-        return 0;
-    }
-    public int retainReorder(long[] items, int start, int max) {
-        return 0;
-    }
-
-    public String describe() {
-        return "[NoPass]";
-    }
-
-}
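This file is deleted; its helpers presumably carry over to the renamed `QueryFilterStepIf` (whose `noPass()` and `anyOf()` are used unchanged later in this diff). The `retainDestructive` contract is easy to misread, so a worked example of the default implementation shown above: matching values are compacted to the front and counted, while the tail is left unspecified:

```java
// Inlined from the default implementation above, with an "even values" test.
long[] items = {2, 3, 4, 5, 6};
int keep = 0;
for (int i = 0; i < items.length; i++) {
    if (items[i] % 2 == 0) {          // stand-in for test(items[i])
        if (i != keep) items[keep] = items[i];
        keep++;
    }
}
// keep == 3; items begins {2, 4, 6, ...}; elements at index >= keep are undefined
```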
@@ -0,0 +1,107 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.protobuf.InvalidProtocolBufferException;
+import nu.marginalia.util.ListChunker;
+import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
+import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
+import org.apache.http.HttpStatus;
+import spark.Request;
+import spark.Response;
+
+import java.util.Arrays;
+import java.util.List;
+
+@Singleton
+public class EdgeIndexLexiconService {
+
+    private final SearchIndexes indexes;
+    private final KeywordLexicon keywordLexicon;
+
+    @Inject
+    public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) {
+        this.indexes = indexes;
+        this.keywordLexicon = servicesFactory.getKeywordLexicon();
+    }
+
+    public Object getWordId(Request request, Response response) {
+        final String word = request.splat()[0];
+
+        var lr = indexes.getLexiconReader();
+        if (null == lr) {
+            response.status(HttpStatus.SC_FAILED_DEPENDENCY);
+            return "";
+        }
+
+        final int wordId = lr.get(word);
+
+        if (DictionaryHashMap.NO_VALUE == wordId) {
+            response.status(404);
+            return "";
+        }
+
+        return wordId;
+    }
+
+
+    public Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
+        var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
+
+        EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
+        EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
+        int idx = req.getIndex();
+
+        for (int ws = 0; ws < req.getWordSetCount(); ws++) {
+            putWords(domainId, urlId, req.getWordSet(ws), idx);
+        }
+
+        response.status(HttpStatus.SC_ACCEPTED);
+        return "";
+    }
+
+    public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
+                         IndexPutKeywordsReq.WordSet words, int idx
+    ) {
+        SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
+
+        IndexBlock block = IndexBlock.values()[words.getIndex()];
+
+        for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
+
+            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
+            var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
+
+            indexWriter.put(header, entry);
+        };
+    }
+
+    private long[] getOrInsertWordIds(List<String> words) {
+        long[] ids = new long[words.size()];
+        int putIdx = 0;
+
+        for (String word : words) {
+            long id = keywordLexicon.getOrInsert(word);
+            if (id != DictionaryHashMap.NO_VALUE) {
+                ids[putIdx++] = id;
+            }
+        }
+
+        if (putIdx != words.size()) {
+            ids = Arrays.copyOf(ids, putIdx);
+        }
+        return ids;
+    }
+
+
+}
@@ -0,0 +1,44 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+@Singleton
+public class EdgeIndexOpsService {
+
+    private final SearchIndexes indexes;
+
+    @Inject
+    public EdgeIndexOpsService(SearchIndexes indexes) {
+        this.indexes = indexes;
+    }
+
+    public Object repartitionEndpoint(Request request, Response response) {
+
+        if (!indexes.repartition()) {
+            Spark.halt(503, "Operations busy");
+        }
+        return "OK";
+    }
+
+    public Object preconvertEndpoint(Request request, Response response) {
+        if (!indexes.preconvert()) {
+            Spark.halt(503, "Operations busy");
+        }
+        return "OK";
+    }
+
+    public Object reindexEndpoint(Request request, Response response) {
+        int id = Integer.parseInt(request.params("id"));
+
+        if (!indexes.reindex(id)) {
+            Spark.halt(503, "Operations busy");
+        }
+        return "OK";
+    }
+
+}
@@ -0,0 +1,320 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import gnu.trove.set.hash.TIntHashSet;
+import io.prometheus.client.Counter;
+import io.prometheus.client.Histogram;
+import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.client.GsonFactory;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
+import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
+import nu.marginalia.wmsa.edge.model.search.*;
+import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
+import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
+import org.apache.http.HttpStatus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import spark.HaltException;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+import java.util.*;
+import java.util.function.LongPredicate;
+
+import static spark.Spark.halt;
+
+@Singleton
+public class EdgeIndexQueryService {
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
+    private static final int QUERY_FETCH_SIZE = 8192;
+    private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
+
+    private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
+
+    private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
+    private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register();
+
+    private final Gson gson = GsonFactory.get();
+
+    private final SearchIndexes indexes;
+
+    @Inject
+    public EdgeIndexQueryService(SearchIndexes indexes) {
+        this.indexes = indexes;
+    }
+
+    public Object searchDomain(Request request, Response response) {
+        if (indexes.getLexiconReader() == null) {
+            logger.warn("Dictionary reader not yet initialized");
+            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
+        }
+
+        String json = request.body();
+        EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
+
+        try {
+            return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
+        }
+        catch (HaltException ex) {
+            logger.warn("Halt", ex);
+            throw ex;
+        }
+        catch (Exception ex) {
+            logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
+            logger.info("Error", ex);
+            Spark.halt(500, "Error");
+            return null;
+        }
+    }
+
+    public Object search(Request request, Response response) {
+        if (indexes.getLexiconReader() == null) {
+            logger.warn("Dictionary reader not yet initialized");
+            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
+        }
+
+        String json = request.body();
+        EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
+
+        try {
+            return wmsa_edge_index_query_time.time(() -> query(specsSet));
+        }
+        catch (HaltException ex) {
+            logger.warn("Halt", ex);
+            throw ex;
+        }
+        catch (Exception ex) {
+            logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
+            logger.info("Error", ex);
+            Spark.halt(500, "Error");
+            return null;
+        }
+    }
+
+
+    public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) {
+        List<EdgeSearchResultItem> results = new SearchQuery(specsSet).execute();
+        return new EdgeSearchResultSet(results);
+    }
+
+    public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
+
+        final OptionalInt wordId = lookUpWord(specsSet.keyword);
+        EdgeIdList<EdgeUrl> urlIds;
+
+        if (wordId.isEmpty()) {
+            urlIds = new EdgeIdList<>();
+        } else {
+            urlIds = indexes
+                    .getBucket(specsSet.bucket)
+                    .findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
+                    .mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL))
+                    .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
+        }
+
+        return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
+    }
+
+    private class SearchQuery {
+        private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
+        private final EdgeSearchSpecification specsSet;
+        private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
+        private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
+
+        public SearchQuery(EdgeSearchSpecification specsSet) {
+            this.specsSet = specsSet;
+        }
+
+        private List<EdgeSearchResultItem> execute() {
+            final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
+
+            for (var sq : specsSet.subqueries) {
+                results.addAll(performSearch(sq));
+            }
+
+            for (var result : results) {
+                addResultScores(result);
+            }
+
+            if (!budget.hasTimeLeft()) {
+                wmsa_edge_index_query_timeouts.inc();
+            }
+
+            var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
+
+            if (WmsaHome.isDebug()) {
+                cachePool.printSummary(logger);
+            }
+            cachePool.clear();
+
+            return results.stream()
+                    .sorted(Comparator.comparing(EdgeSearchResultItem::getScore))
+                    .filter(domainCountFilter::test)
+                    .limit(specsSet.getLimitTotal()).toList();
+        }
+
+
+        private List<EdgeSearchResultItem> performSearch(EdgeSearchSubquery sq)
+        {
+
+            final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
+            final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
+
+            if (searchTerms.isEmpty())
+                return Collections.emptyList();
+
+            for (int indexBucket : specsSet.buckets) {
+                final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
+
+                if (!budget.hasTimeLeft()) {
+                    logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
+                    continue;
+                }
+
+                if (QUERY_FETCH_SIZE <= results.size())
+                    break;
+
+                IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
+                long[] buf = new long[8192];
+
+                while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) {
+                    int cnt = query.getMoreResults(buf, budget);
+
+                    for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) {
+                        final long id = buf[i];
+
+                        if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
+                            continue;
+                        }
+
+                        results.add(new EdgeSearchResultItem(indexBucket, id));
+                    }
+                }
+
+            }
+
+            return results;
+        }
+
+        private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
+                                    LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
+
+            if (!indexes.isValidBucket(bucket)) {
+                logger.warn("Invalid bucket {}", bucket);
+                return new IndexQuery(Collections.emptyList());
+            }
+
+            return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
+        }
+
+        private void addResultScores(EdgeSearchResultItem searchResult) {
+            final var reader = Objects.requireNonNull(indexes.getLexiconReader());
+
+            List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+
+            // Memoize calls to getTermData, as they're somewhat expensive and highly redundant
+            Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
+
+            double bestScore = 0;
+
+            for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
+                double setScore = 0;
+                int setSize = 0;
+                for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
+
+                    final int termId = reader.get(searchTerm);
+
+                    ResultTermData data = termMetadata.computeIfAbsent(
+                            new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
+
+                    var score = data.asScore(searchTermListIdx, searchTerm);
+                    searchResult.scores.add(score);
+                    setScore += score.value();
+                    setSize++;
+                }
+                bestScore = Math.min(bestScore, setScore/setSize);
+            }
+
+            searchResult.setScore(bestScore);
+        }
+
+        private ResultTermData getTermData(ResultTerm resultTerm) {
+            final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
+            final int termId = resultTerm.termId;
+            final long combinedUrlId = resultTerm.combinedUrlId;
+
+            return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
+            );
+        }
+
+        record ResultTerm (int bucket, int termId, long combinedUrlId) {}
+        record ResultTermData (IndexBlock index,
+                               boolean title,
+                               boolean link,
+                               boolean site,
+                               boolean subject,
+                               boolean name,
+                               boolean high,
+                               boolean mid,
+                               boolean low
+        ) {
+            public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
+                return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
+            }
+        }
+    }
+
+
+    private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
+        final List<Integer> excludes = new ArrayList<>();
+        final List<Integer> includes = new ArrayList<>();
+
+        for (var include : request.searchTermsInclude) {
+            var word = lookUpWord(include);
+            if (word.isEmpty()) {
+                logger.debug("Unknown search term: " + include);
+                return new EdgeIndexSearchTerms(includes, excludes);
+            }
+            includes.add(word.getAsInt());
+        }
+
+        for (var exclude : request.searchTermsExclude) {
+            lookUpWord(exclude).ifPresent(excludes::add);
+        }
+
+        return new EdgeIndexSearchTerms(includes, excludes);
+    }
+
+
+    private OptionalInt lookUpWord(String s) {
+        int ret = indexes.getLexiconReader().get(s);
+        if (ret == DictionaryHashMap.NO_VALUE) {
+            return OptionalInt.empty();
+        }
+        return OptionalInt.of(ret);
+    }
+
+}
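The dedup logic in `performSearch` assumes a packed result id: the low 32 bits identify the url (fed to `seenResults`), the high 32 bits carry the ranking/domain component (counted by `ResultDomainDeduplicator`). A sketch of that layout as the code reads it, inferred from this diff:

```java
// How the loop above slices a raw result id.
final long id = /* buf[i] from getMoreResults */ 0L;
int urlPart     = (int) (id & 0xFFFF_FFFFL);  // argument to seenResults.add(...)
int rankingPart = (int) (id >>> 32);          // what the deduplicator counts
```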
@@ -1,7 +1,7 @@
-package nu.marginalia.wmsa.edge.index.reader.query;
+package nu.marginalia.wmsa.edge.index.svc.query;

-import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
-import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;

 import java.util.ArrayList;
 import java.util.List;
@@ -10,18 +10,18 @@ import static java.lang.Math.min;

 public class IndexQuery {
     private final List<EntrySource> sources;
-    private final List<QueryFilterStep> inclusionFilter = new ArrayList<>(10);
-    private final List<QueryFilterStep> priorityFilter = new ArrayList<>(10);
+    private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
+    private final List<QueryFilterStepIf> priorityFilter = new ArrayList<>(10);

     public IndexQuery(List<EntrySource> sources) {
         this.sources = sources;
     }

-    public void addInclusionFilter(QueryFilterStep filter) {
+    public void addInclusionFilter(QueryFilterStepIf filter) {
         inclusionFilter.add(filter);
     }

-    public void addPriorityFilter(QueryFilterStep filter) {
+    public void addPriorityFilter(QueryFilterStepIf filter) {
         priorityFilter.add(filter);
     }

@@ -1,17 +1,19 @@
-package nu.marginalia.wmsa.edge.index.reader;
+package nu.marginalia.wmsa.edge.index.svc.query;

 import nu.marginalia.util.btree.CachingBTreeReader;
+import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
 import org.slf4j.Logger;

 import java.util.HashMap;
 import java.util.Map;

 public class IndexQueryCachePool {
-    private final Map<PoolKey, CachingBTreeReader.Cache> indexCaches = new HashMap<>();
-    private final Map<RangeKey, SearchIndex.UrlIndexTree> rangeCache = new HashMap<>();
+    private final Map<PoolKey, CachingBTreeReader.BTreeCachedIndex> indexCaches = new HashMap<>();
+    private final Map<RangeKey, SearchIndex.IndexBTreeRange> rangeCache = new HashMap<>();
     private final Map<PoolKey, Integer> savedCounts = new HashMap<>();

-    public CachingBTreeReader.Cache getIndexCache(SearchIndex index, SearchIndex.UrlIndexTree range) {
+    public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) {
         var key = new PoolKey(index, range.dataOffset);
         var entry = indexCaches.get(key);

@@ -33,10 +35,10 @@ public class IndexQueryCachePool {
     }

     public void printSummary(Logger logger) {
-        long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.Cache::sizeBytes).sum();
+        long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum();
         long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();

-        long loaded = indexCaches.values().stream().filter(CachingBTreeReader.Cache::isLoaded).count();
+        long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count();

         logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
     }
@@ -45,11 +47,11 @@ public class IndexQueryCachePool {
         indexCaches.clear();
     }

-    public SearchIndex.UrlIndexTree getRange(IndexWordsTable words, int wordId) {
+    public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) {
         return rangeCache.get(new RangeKey(words, wordId));
     }

-    public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.UrlIndexTree range) {
+    public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) {
         rangeCache.put(new RangeKey(words, wordId), range);
     }

@@ -1,11 +1,10 @@
-package nu.marginalia.wmsa.edge.index.reader.query;
+package nu.marginalia.wmsa.edge.index.svc.query;

-import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
-import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
-import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
-import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStepFromPredicate;
-import nu.marginalia.wmsa.edge.index.reader.query.types.UrlRangeSubFilter;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;

 import java.util.*;
 import java.util.function.LongPredicate;
@@ -57,21 +56,21 @@ public class IndexQueryFactory {
     }

     public IndexQueryBuilder also(int termId) {
-        List<QueryFilterStep> filters = new ArrayList<>(requiredIndices.size());
+        List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());

         for (var ri : requiredIndices) {
             var range = ri.rangeForWord(cachePool, termId);

             if (range.isPresent()) {
-                filters.add(new UrlRangeSubFilter(ri, range, cachePool));
+                filters.add(new QueryFilterBTreeRange(ri, range, cachePool));
             }
             else {
-                filters.add(QueryFilterStep.noPass());
+                filters.add(QueryFilterStepIf.noPass());
             }
         }

         filters.sort(Comparator.naturalOrder());
-        query.addInclusionFilter(QueryFilterStep.anyOf(filters));
+        query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));

         return this;
     }
@@ -92,7 +91,7 @@ public class IndexQueryFactory {
         for (var idx : priortyIndices) {
             var range = idx.rangeForWord(cachePool, termId);
             if (range.isPresent()) {
-                query.addPriorityFilter(new UrlRangeSubFilter(idx, range, cachePool));
+                query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool));
             }
         }
     }
@ -0,0 +1,26 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import java.util.stream.LongStream;
public interface IndexQueryIf {
IndexQueryIf EMPTY = new IndexQueryIf() {
@Override
public IndexQueryIf also(int wordId) { return this; }
@Override
public IndexQueryIf alsoCached(int wordId) { return this; }
@Override
public IndexQueryIf not(int wordId) { return this; }
@Override
public LongStream stream() { return LongStream.empty(); }
};
IndexQueryIf also(int wordId);
IndexQueryIf alsoCached(int wordId);
IndexQueryIf not(int wordId);
LongStream stream();
}
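The EMPTY instance above is a null object: query-building code can keep chaining also()/not() without branching on whether a term actually resolved to an index. A hedged illustration, not from this commit (the word ids are arbitrary):

// The null object absorbs refinements and yields an empty stream.
IndexQueryIf query = IndexQueryIf.EMPTY;
long hits = query.also(10).not(20).stream().count(); // hits == 0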
@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.reader.query;
package nu.marginalia.wmsa.edge.index.svc.query;
public class IndexSearchBudget {
@ -0,0 +1,45 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
public class ResultDomainDeduplicator {
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
final int limitByDomain;
public ResultDomainDeduplicator(int limitByDomain) {
this.limitByDomain = limitByDomain;
}
public boolean filterRawValue(long value) {
int rankingId = (int) (value >>> 32);
if (rankingId == Integer.MAX_VALUE) {
return true;
}
return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain;
}
long getKey(int rankingId) {
return rankingId;
}
public boolean test(long value) {
int ranking = (int) (value >>> 32);
if (ranking == Integer.MAX_VALUE) {
return true;
}
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
}
public boolean test(EdgeSearchResultItem item) {
int ranking = item.getRanking();
if (ranking == Integer.MAX_VALUE) {
return true;
}
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
}
}
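ResultDomainDeduplicator caps how many results a single ranking id may contribute; the id is read from the top 32 bits of the combined value, and Integer.MAX_VALUE acts as a "no ranking" sentinel that always passes. A sketch of the intended call pattern; the bit layout is inferred from the shifts above and the values are illustrative:

// Illustrative only; the rankingId/urlId packing is inferred from (value >>> 32).
var dedup = new ResultDomainDeduplicator(3); // at most 3 results per ranking id

int rankingId = 1234;
long urlId = 99L;
long combinedId = ((long) rankingId << 32) | urlId;

if (dedup.test(combinedId)) { // counts the hit and enforces the per-domain cap
    // keep the result
}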
@ -1,8 +1,9 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
package nu.marginalia.wmsa.edge.index.svc.query.types;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
public interface EntrySource {
SearchIndex getIndex();
int read(long[] buffer, int n);
}
@ -0,0 +1,39 @@
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import java.util.List;
import java.util.StringJoiner;
class QueryFilterAnyOf implements QueryFilterStepIf {
private final List<? extends QueryFilterStepIf> steps;
QueryFilterAnyOf(List<? extends QueryFilterStepIf> steps) {
this.steps = steps;
}
public SearchIndex getIndex() {
return null;
}
public double cost() {
return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.);
}
@Override
public boolean test(long value) {
for (var step : steps) {
if (step.test(value))
return true;
}
return false;
}
public String describe() {
StringJoiner sj = new StringJoiner(",", "[Any Of: ", "]");
for (var step : steps) {
sj.add(step.describe());
}
return sj.toString();
}
}
@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import org.jetbrains.annotations.Nullable;
public record UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, CachingBTreeReader.Cache cache) implements QueryFilterStep {
public record QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, CachingBTreeReader.BTreeCachedIndex cache) implements QueryFilterStepIf {
public UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, IndexQueryCachePool pool) {
public QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, IndexQueryCachePool pool) {
this(source, range, pool.getIndexCache(source, range));
}
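The renamed record keeps two construction paths: the canonical constructor takes an already-resolved cache, while the pool-taking constructor looks the shared cache up itself. A hedged sketch of the equivalence; the variable names are illustrative:

// Pool-based construction, as used in IndexQueryFactory above:
var viaPool = new QueryFilterBTreeRange(index, range, pool);
// Equivalent to resolving the shared BTreeCachedIndex yourself:
var explicit = new QueryFilterBTreeRange(index, range, pool.getIndexCache(index, range));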
@ -0,0 +1,33 @@
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
class QueryFilterNoPass implements QueryFilterStepIf {
static final QueryFilterStepIf instance = new QueryFilterNoPass();
@Override
public boolean test(long value) {
return false;
}
public SearchIndex getIndex() {
return null;
}
public double cost() {
return 0.;
}
public int retainDestructive(long[] items, int max) {
return 0;
}
public int retainReorder(long[] items, int start, int max) {
return 0;
}
public String describe() {
return "[NoPass]";
}
}
@ -1,11 +1,11 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import org.jetbrains.annotations.Nullable;
import java.util.function.LongPredicate;
public class QueryFilterStepFromPredicate implements QueryFilterStep {
public class QueryFilterStepFromPredicate implements QueryFilterStepIf {
private final LongPredicate pred;
public QueryFilterStepFromPredicate(LongPredicate pred) {
@ -0,0 +1,71 @@
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import javax.annotation.Nullable;
import java.util.List;
public interface QueryFilterStepIf extends Comparable<QueryFilterStepIf> {
@Nullable
SearchIndex getIndex();
boolean test(long value);
double cost();
default int compareTo(QueryFilterStepIf other) {
return Double.compare(cost(), other.cost());
}
String describe();
/**
* Move each matching value in items to the beginning of the array
* and return the number of matching items.
*
* The remaining values are undefined.
*/
default int retainDestructive(long[] items, int max) {
int keep = 0;
for (int i = 0; i < max; i++) {
if (test(items[i])) {
if (i != keep) {
items[keep] = items[i];
}
keep++;
}
}
return keep;
}
/**
* Move each matching value toward the beginning of the array
* and return the number of matching items. Values that do not
* pass the test are swapped toward the end of the array.
*/
default int retainReorder(long[] items, int start, int max) {
int keep = 0;
for (int i = start; i < max; i++) {
if (test(items[i])) {
if (i != keep) {
long tmp = items[keep];
items[keep] = items[i];
items[i] = tmp;
}
keep++;
}
}
return keep;
}
static QueryFilterStepIf noPass() {
return QueryFilterNoPass.instance;
}
static QueryFilterStepIf anyOf(List<? extends QueryFilterStepIf> steps) {
return new QueryFilterAnyOf(steps);
}
}
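A worked example of the two retain defaults, reusing QueryFilterStepFromPredicate from this commit; the outcomes follow directly from the loops above, and only the sample arrays are invented:

QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l % 2) == 0);

long[] a = {1, 2, 3, 4, 5, 6};
even.retainDestructive(a, a.length); // returns 3; a = {2, 4, 6, 4, 5, 6}, tail undefined

long[] b = {1, 2, 3, 4, 5, 6};
even.retainReorder(b, 0, b.length);  // returns 3; b = {2, 4, 6, 1, 5, 3}, rejects swapped back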
@ -12,14 +12,14 @@ import java.util.List;
@AllArgsConstructor @ToString @Getter
public class EdgeSearchResultItem {
public final int bucketId;
public final long combinedId; // this isn't the external domain ID, but a ranking
public final long combinedId;
public final List<EdgeSearchResultKeywordScore> scores;
public EdgeSearchResultItem(int bucketId, long val) {
this.bucketId = bucketId;
combinedId = val;
scores = new ArrayList<>(16);
this.combinedId = val;
this.scores = new ArrayList<>(16);
}
public EdgeId<EdgeUrl> getUrlId() {
@ -33,6 +33,7 @@ public class EdgeSearchResultItem {
return (int)(combinedId >>> 32);
}
/* Used for evaluation */
private transient double scoreValue = 1;
public void setScore(double score) {
scoreValue = score;
@ -4,10 +4,10 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdArray;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
@AllArgsConstructor @Getter @ToString
public class EdgeDomainSearchResults {
public final String keyword;
public final EdgeIdArray<EdgeUrl> results;
public final EdgeIdList<EdgeUrl> results;
}
@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.search;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Summary;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.wmsa.configuration.server.Context;
@ -18,7 +17,6 @@ import nu.marginalia.wmsa.edge.model.id.EdgeIdSet;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery;
@ -50,8 +48,6 @@ public class EdgeSearchOperator {
private final SearchResultDecorator resultDecorator;
private final Comparator<EdgeUrlDetails> resultListComparator;
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
@Inject
public EdgeSearchOperator(AssistantClient assistantClient,
EncyclopediaClient encyclopediaClient,
@ -81,9 +77,7 @@ public class EdgeSearchOperator {
logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery);
return queryResults.resultSet;
return performQuery(ctx, processedQuery);
}
public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future<String> eval) {
@ -91,23 +85,25 @@ public class EdgeSearchOperator {
Observable<WikiArticles> definitions = getWikiArticle(ctx, params.humanQuery());
EdgeSearchQuery processedQuery = queryFactory.createQuery(params);
logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery);
List<EdgeUrlDetails> queryResults = performQuery(ctx, processedQuery);
String evalResult = getEvalResult(eval);
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
WikiArticles wikiArticles = definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst();
return new DecoratedSearchResults(params,
getProblems(ctx, evalResult, queryResults, processedQuery),
evalResult,
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
queryResults.resultSet,
domainResults,
processedQuery.domain,
getDomainId(processedQuery.domain));
return DecoratedSearchResults.builder()
.params(params)
.problems(getProblems(ctx, evalResult, queryResults, processedQuery))
.evalResult(evalResult)
.wiki(wikiArticles)
.results(queryResults)
.domainResults(domainResults)
.focusDomain(processedQuery.domain)
.focusDomainId(getDomainId(processedQuery.domain))
.build();
}
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
@ -169,7 +165,7 @@ public class EdgeSearchOperator {
return domainId;
}
public DecoratedSearchResultSet performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
List<EdgeSearchSubquery> sqs = new ArrayList<>();
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
@ -179,11 +175,13 @@ public class EdgeSearchOperator {
return performQuery(ctx, new EdgeSearchQuery(specs));
}
private DecoratedSearchResultSet performQuery(Context ctx, EdgeSearchQuery processedQuery) {
private List<EdgeUrlDetails> performQuery(Context ctx, EdgeSearchQuery processedQuery) {
List<EdgeUrlDetails> resultList = new ArrayList<>(processedQuery.specs.limitTotal);
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
for (var details : wmsa_search_index_api_time.time(()->fetchResultsSimple(ctx, processedQuery))) {
final List<EdgeUrlDetails> resultList = new ArrayList<>(results.size());
for (var details : resultDecorator.getAllUrlDetails(results)) {
if (details.getUrlQuality() <= -100) {
continue;
}
@ -208,10 +206,10 @@ public class EdgeSearchOperator {
}
}
return new DecoratedSearchResultSet(retList);
return retList;
}
private List<String> getProblems(Context ctx, String evalResult, DecoratedSearchResultSet queryResults, EdgeSearchQuery processedQuery) {
private List<String> getProblems(Context ctx, String evalResult, List<EdgeUrlDetails> queryResults, EdgeSearchQuery processedQuery) {
final List<String> problems = new ArrayList<>(processedQuery.problems);
boolean siteSearch = processedQuery.domain != null;
@ -305,15 +303,6 @@ public class EdgeSearchOperator {
;
}
private Set<EdgeUrlDetails> fetchResultsSimple(Context ctx, EdgeSearchQuery processedQuery) {
EdgeSearchResultSet resultSet = indexClient.query(ctx, processedQuery.specs);
var results = resultSet.getResults();
Set<EdgeUrlDetails> ret = new HashSet<>(resultDecorator.getAllUrlDetails(results));
return ret;
}
private Iterable<String> spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) {
return Observable.fromIterable(disjointedQuery.searchTermsHuman)
.subscribeOn(Schedulers.io())
@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchJsParameter;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.exceptions.RedirectException;
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
import nu.marginalia.wmsa.edge.search.svc.EdgeSearchErrorPageService;
import nu.marginalia.wmsa.resource_store.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -40,6 +41,7 @@ public class EdgeSearchService extends Service {
private final WebsiteUrl websiteUrl;
private StaticResources staticResources;
private final EdgeSearchErrorPageService errorPageService;
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
@SneakyThrows
@ -53,7 +55,8 @@ public class EdgeSearchService extends Service {
CommandEvaluator searchCommandEvaulator,
WebsiteUrl websiteUrl,
StaticResources staticResources,
IndexCommand indexCommand) {
IndexCommand indexCommand,
EdgeSearchErrorPageService errorPageService) {
super(ip, port, initialization, metricsServer);
this.indexClient = indexClient;
@ -61,6 +64,7 @@ public class EdgeSearchService extends Service {
this.searchCommandEvaulator = searchCommandEvaulator;
this.websiteUrl = websiteUrl;
this.staticResources = staticResources;
this.errorPageService = errorPageService;
Spark.staticFiles.expireTime(600);
@ -79,7 +83,7 @@ public class EdgeSearchService extends Service {
Spark.exception(Exception.class, (e,p,q) -> {
logger.error("Error during processing", e);
serveError(Context.fromRequest(p), q);
errorPageService.serveError(Context.fromRequest(p), q);
});
Spark.awaitInitialization();
@ -104,26 +108,6 @@ public class EdgeSearchService extends Service {
}
private void serveError(Context ctx, Response rsp) {
boolean isIndexUp = indexClient.isAlive();
try {
if (!isIndexUp) {
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">offline</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
} else if (indexClient.isBlocked(ctx).blockingFirst()) {
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">starting up</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
}
else {
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"></head><body><article><h1>Error</h1><p>Oops! An unknown error occurred. The index server seems to be up, so I don't know why this is. Please send an email to kontakt@marginalia.nu telling me what you did :-) </p></body></html>");
}
}
catch (Exception ex) {
logger.error("Error", ex);
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">unresponsive</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
}
}
@SneakyThrows
private Object apiSearch(Request request, Response response) {
@ -180,7 +164,7 @@ public class EdgeSearchService extends Service {
}
catch (Exception ex) {
logger.error("Error", ex);
serveError(ctx, response);
errorPageService.serveError(ctx, response);
}
return "";
@ -5,11 +5,11 @@ import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.EdgeSearchOperator;
import nu.marginalia.wmsa.edge.search.EdgeSearchProfile;
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
@ -19,10 +19,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@ -59,7 +56,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
var results = siteInfo(ctx, query);
var domain = results.getDomain();
DecoratedSearchResultSet resultSet;
List<EdgeUrlDetails> resultSet;
Path screenshotPath = null;
if (null != domain) {
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);
@ -67,10 +64,10 @@ public class SiteSearchCommand implements SearchCommandInterface {
screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
}
else {
resultSet = new DecoratedSearchResultSet(Collections.emptyList());
resultSet = Collections.emptyList();
}
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet.resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
}
@ -1,22 +0,0 @@
package nu.marginalia.wmsa.edge.search.model;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import java.util.List;
import java.util.Objects;
@ToString @Getter
public class DecoratedSearchResultSet {
public final List<EdgeUrlDetails> resultSet;
public int size() {
return resultSet.size();
}
public DecoratedSearchResultSet(List<EdgeUrlDetails> resultSet) {
this.resultSet = Objects.requireNonNull(resultSet);
}
}
@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.search.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
@ -8,7 +9,7 @@ import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
import java.util.List;
@AllArgsConstructor @Getter
@AllArgsConstructor @Getter @Builder
public class DecoratedSearchResults {
private final EdgeUserSearchParameters params;
private final List<String> problems;
@ -13,7 +13,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
public class SearchResultDecorator {
@ -67,8 +66,7 @@ public class SearchResultDecorator {
if (!missedIds.isEmpty()) {
logger.debug("Could not look up documents: {}", missedIds.toArray());
}
retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)
.thenComparing(url -> url.url.path.length()));
return retList;
}
@ -0,0 +1,125 @@
package nu.marginalia.wmsa.edge.search.svc;
import com.google.inject.Inject;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
public class EdgeSearchErrorPageService {
private final EdgeIndexClient indexClient;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject
public EdgeSearchErrorPageService(EdgeIndexClient indexClient) {
this.indexClient = indexClient;
}
public void serveError(Context ctx, Response rsp) {
boolean isIndexUp = indexClient.isAlive();
try {
if (!isIndexUp) {
rsp.body(renderError("The index is down",
"""
The search index server appears to be down.
<p>
The server was possibly restarted to bring online some changes.
Restarting the index typically takes a few minutes, during which
searches can't be served.
"""));
} else if (indexClient.isBlocked(ctx).blockingFirst()) {
rsp.body(renderError("The index is starting up",
"""
The search index server appears to be in the process of starting up.
This typically takes a few minutes. Be patient.
"""));
}
else {
rsp.body(renderError("Error processing request",
"""
The search index appears to be up and running, so the problem may be
a more general error, or an error in handling your query.
"""));
}
}
catch (Exception ex) {
rsp.body(renderError("Error processing error",
"""
An error has occurred; additionally, an error occurred while handling that error.
<p>
<a href="https://www.youtube.com/watch?v=dsx2vdn7gpY">https://www.youtube.com/watch?v=dsx2vdn7gpY</a>.
"""));
}
}
private String renderError(String title, String message) {
return """
<!DOCTYPE html>
<title>Error</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<link rel="stylesheet" href="https://search.marginalia.nu/style-new.css">
<header>
<nav>
<a href="https://www.marginalia.nu/">Marginalia</a>
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Support</a>
</nav>
</header>
<article>
<form method="get" action="/search">
<section class="search-box">
<h1>Search the Internet</h1>
<div class="input">
<input id="query" name="query" placeholder="Search terms" value="" autocomplete="off">
<input value="Go" type="submit">
</div>
<div class="settings">
<select name="profile" id="profile">
<option value="default">Popular Sites</option>
<option value="modern">Blogs and Personal Websites</option>
<option value="academia">Academia, Forums, Big Websites</option>
<option value="yolo">Default Ranking Algorithm</option>
<option value="food">Recipes 🍳</option>
<option value="corpo">Experimental</option>
</select>
<select name="js" id="js">
<option value="default">Allow JS</option>
<option value="no-js">Deny JS</option>
<option value="yes-js">Require JS</option>
</select>
</div>
<div class="extra">
<a href="https://search.marginalia.nu/explore/random">Random Websites</a>
</div>
</section>
</form>
<div class="cards big">
<div class="card problems">
<h2>
"""
+ title +
"""
</h2>
<div class="info">
"""
+ message +
"""
</div>
</div>
<div class="card">
<h2>More Info</h2>
<div class="info">
You may be able to find more information here:
<ul>
<li><a href="https://status.marginalia.nu/">Maintenance Messages</a></li>
<li><a href="https://twitter.com/MarginaliaNu">Twitter Account</a></li>
<li>Email Me: <tt>kontakt@marginalia.nu</tt></li>
</ul>
</div>
</div>
""";
}
}
@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
class MicroCacheTest {
class MicroBTreeCachedIndexTest {
MicroCache mc;
@BeforeEach
@ -1,13 +1,15 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import org.junit.jupiter.api.Test;
import java.util.List;
class QueryFilterStepTest {
QueryFilterStep even = new QueryFilterStepFromPredicate(l -> (l%2) == 0);
QueryFilterStep divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0);
QueryFilterStep either = QueryFilterStep.anyOf(List.of(even, divisibleByThree));
class QueryFilterStepIfTest {
QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l%2) == 0);
QueryFilterStepIf divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0);
QueryFilterStepIf either = QueryFilterStepIf.anyOf(List.of(even, divisibleByThree));
@Test
public void test() {
long[] values = new long[100];