Cleaning up and adding better error messages.

This commit is contained in:
vlofgren 2022-09-11 11:31:22 +02:00
parent fbe17b62ed
commit eaef93f4ae
38 changed files with 961 additions and 764 deletions

View File

@ -25,14 +25,14 @@ public class CachingBTreeReader {
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
}
public Cache prepareCache(BTreeHeader header) {
return new Cache(header);
public BTreeCachedIndex prepareCache(BTreeHeader header) {
return new BTreeCachedIndex(header);
}
/**
*
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(Cache cache, final long keyRaw) {
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
BTreeHeader header = cache.header;
final int blockSize = ctx.BLOCK_SIZE_WORDS();
@ -62,7 +62,7 @@ public class CachingBTreeReader {
return dataSearcher.binarySearch(key, searchStart, numEntries);
}
private long searchIndex(BTreeHeader header, Cache cache, long key) {
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
long layerOffset = 0;
@ -83,13 +83,13 @@ public class CachingBTreeReader {
* for repeated queries against the same tree. The memory consumption is typically very low
* and the disk access pattern for reading the entire index relatively cheap.
*/
public class Cache {
public class BTreeCachedIndex {
long[] indexData;
final BTreeHeader header;
final int indexedDataSize;
public Cache(BTreeHeader header) {
public BTreeCachedIndex(BTreeHeader header) {
this.header = header;
indexedDataSize = header.numEntries();
}

View File

@ -95,4 +95,8 @@ public class WmsaHome {
home.resolve("model/opennlp-tok.bin"));
}
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
public static boolean isDebug() {
return debugMode;
}
}

View File

@ -3,10 +3,10 @@ package nu.marginalia.wmsa.edge.index;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -3,68 +3,29 @@ package nu.marginalia.wmsa.edge.index;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.google.protobuf.InvalidProtocolBufferException;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.reader.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeIdArray;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.apache.http.HttpStatus;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.function.LongPredicate;
import static spark.Spark.get;
import static spark.Spark.halt;
public class EdgeIndexService extends Service {
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
private static final int QUERY_FETCH_SIZE = 8192;
private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
private final Logger logger = LoggerFactory.getLogger(getClass());
@NotNull
private final Initialization init;
private final SearchIndexes indexes;
private final KeywordLexicon keywordLexicon;
private final Gson gson = GsonFactory.get();
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
public static final int DYNAMIC_BUCKET_LENGTH = 7;
@ -75,71 +36,34 @@ public class EdgeIndexService extends Service {
Initialization init,
MetricsServer metricsServer,
SearchIndexes indexes,
IndexServicesFactory servicesFactory) {
EdgeIndexOpsService opsService,
EdgeIndexLexiconService lexiconService,
EdgeIndexQueryService indexQueryService)
{
super(ip, port, init, metricsServer);
final Gson gson = GsonFactory.get();
this.init = init;
this.indexes = indexes;
this.keywordLexicon = servicesFactory.getKeywordLexicon();
Spark.post("/words/", this::putWords);
Spark.post("/search/", this::search, gson::toJson);
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
Spark.post("/words/", lexiconService::putWords);
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
Spark.post("/search/", indexQueryService::search, gson::toJson);
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
Spark.post("/ops/repartition", this::repartitionEndpoint);
Spark.post("/ops/preconvert", this::preconvertEndpoint);
Spark.post("/ops/reindex/:id", this::reindexEndpoint);
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
Spark.post("/ops/repartition", opsService::repartitionEndpoint);
Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);
get("/is-blocked", this::isBlocked, gson::toJson);
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
}
private Object getWordId(Request request, Response response) {
final String word = request.splat()[0];
var dr = indexes.getDictionaryReader();
if (null == dr) {
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
return "";
}
final int wordId = dr.get(word);
if (DictionaryHashMap.NO_VALUE == wordId) {
response.status(404);
return "";
}
return wordId;
}
private Object repartitionEndpoint(Request request, Response response) {
if (!indexes.repartition()) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
private Object preconvertEndpoint(Request request, Response response) {
if (!indexes.preconvert()) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
private Object reindexEndpoint(Request request, Response response) {
int id = Integer.parseInt(request.params("id"));
if (!indexes.reindex(id)) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
private Object isBlocked(Request request, Response response) {
return indexes.isBusy() || !initialized;
}
@ -156,296 +80,6 @@ public class EdgeIndexService extends Service {
indexes.initialize(init);
}
private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
int idx = req.getIndex();
for (int ws = 0; ws < req.getWordSetCount(); ws++) {
putWords(domainId, urlId, req.getWordSet(ws), idx);
}
response.status(HttpStatus.SC_ACCEPTED);
return "";
}
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
IndexBlock block = IndexBlock.values()[words.getIndex()];
for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
indexWriter.put(header, entry);
};
}
private long[] getOrInsertWordIds(List<String> words) {
long[] ids = new long[words.size()];
int putIdx = 0;
for (String word : words) {
long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
}
}
if (putIdx != words.size()) {
ids = Arrays.copyOf(ids, putIdx);
}
return ids;
}
private Object searchDomain(Request request, Response response) {
if (indexes.getDictionaryReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}
String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
final int wordId = keywordLexicon.getReadOnly(specsSet.keyword);
EdgeIdArray<EdgeUrl> urlIds = EdgeIdArray.gather(indexes
.getBucket(specsSet.bucket)
.findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
.mapToInt(lv -> (int)(lv & 0xFFFF_FFFFL)));
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}
private Object search(Request request, Response response) {
if (indexes.getDictionaryReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
long start = System.currentTimeMillis();
try {
return new EdgeSearchResultSet(new SearchQuery(specsSet).execute());
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
finally {
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
}
}
private class SearchQuery {
private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
private final EdgeSearchSpecification specsSet;
private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
public SearchQuery(EdgeSearchSpecification specsSet) {
this.specsSet = specsSet;
}
private List<EdgeSearchResultItem> execute() {
final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
for (var sq : specsSet.subqueries) {
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
if (searchTerms.isEmpty())
continue;
results.addAll(performSearch(searchTerms.get(), sq));
}
for (var result : results) {
addResultScores(result);
}
if (!budget.hasTimeLeft()) {
wmsa_edge_index_query_timeouts.inc();
}
var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
// cachePool.printSummary(logger);
cachePool.clear();
return results.stream()
.sorted(Comparator.comparing(EdgeSearchResultItem::getScore))
.filter(domainCountFilter::test)
.limit(specsSet.getLimitTotal()).toList();
}
private List<EdgeSearchResultItem> performSearch(EdgeIndexSearchTerms searchTerms,
EdgeSearchSubquery sq)
{
final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
final int remainingResults = QUERY_FETCH_SIZE;
for (int indexBucket : specsSet.buckets) {
if (!budget.hasTimeLeft()) {
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
continue;
}
if (remainingResults <= results.size())
break;
var query = getQuery(cachePool, indexBucket, sq.block, lv -> localFilter.filterRawValue(indexBucket, lv), searchTerms);
long[] buf = new long[8192];
while (query.hasMore() && results.size() < remainingResults && budget.hasTimeLeft()) {
int cnt = query.getMoreResults(buf, budget);
for (int i = 0; i < cnt && results.size() < remainingResults; i++) {
long id = buf[i];
final EdgeSearchResultItem ri = new EdgeSearchResultItem(indexBucket, id);
if (!seenResults.add(ri.getUrlId().id()) || !localFilter.test(ri)) {
continue;
}
results.add(ri);
}
}
}
return results;
}
private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return new IndexQuery(Collections.emptyList());
}
return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
}
private void addResultScores(EdgeSearchResultItem searchResult) {
final var reader = Objects.requireNonNull(indexes.getDictionaryReader());
List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
// Memoize calls to getTermData, as they're redundant and cause disk reads
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
double bestScore = 0;
for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
double setScore = 0;
int setSize = 0;
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
final int termId = reader.get(searchTerm);
ResultTermData data = termMetadata.computeIfAbsent(
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
var score = data.asScore(searchTermListIdx, searchTerm);
searchResult.scores.add(score);
setScore += score.value();
setSize++;
}
bestScore = Math.min(bestScore, setScore/setSize);
}
searchResult.setScore(bestScore);
}
private ResultTermData getTermData(ResultTerm resultTerm) {
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
final int termId = resultTerm.termId;
final long combinedUrlId = resultTerm.combinedUrlId;
return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
);
}
record ResultTerm (int bucket, int termId, long combinedUrlId) {}
record ResultTermData (IndexBlock index,
boolean title,
boolean link,
boolean site,
boolean subject,
boolean name,
boolean high,
boolean mid,
boolean low
) {
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
}
}
}
private Optional<EdgeIndexSearchTerms> getSearchTerms(EdgeSearchSubquery request) {
final List<Integer> excludes = new ArrayList<>();
final List<Integer> includes = new ArrayList<>();
for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
return Optional.empty();
}
includes.add(word.getAsInt());
}
for (var exclude : request.searchTermsExclude) {
lookUpWord(exclude).ifPresent(excludes::add);
}
if (includes.isEmpty()) {
return Optional.empty();
}
return Optional.of(new EdgeIndexSearchTerms(includes, excludes));
}
private OptionalInt lookUpWord(String s) {
int ret = indexes.getDictionaryReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);
}
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.index.client;
import com.google.inject.Singleton;
import io.prometheus.client.Summary;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
@ -10,6 +11,7 @@ import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywor
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
@ -23,6 +25,8 @@ import java.util.concurrent.TimeUnit;
@Singleton
public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
public EdgeIndexClient() {
super(ServiceDescriptor.EDGE_INDEX);
setTimeout(30);
@ -52,20 +56,10 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW
@CheckReturnValue
public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) {
return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst();
}
@CheckReturnValue
public List<EdgeSearchResultSet> multiQuery(Context ctx, EdgeSearchSpecification... specs) {
return Observable.fromArray(specs)
.concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class)
.subscribeOn(Schedulers.io())
.timeout(1, TimeUnit.SECONDS)
.onErrorComplete())
.toList()
.blockingGet();
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
return wmsa_search_index_api_time.time(
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
);
}
@CheckReturnValue

View File

@ -9,4 +9,8 @@ import java.util.List;
/** Holder for the lexicon word-ids of a query's required and excluded search terms. */
public class EdgeIndexSearchTerms {
// word ids that must be present in a matching document
public List<Integer> includes = new ArrayList<>();
// word ids that must not be present in a matching document
public List<Integer> excludes = new ArrayList<>();
/** True when there are no required terms; excludes alone are treated as an empty query. */
public boolean isEmpty() {
return includes.isEmpty();
}
}

View File

@ -1,52 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
import java.util.List;
/**
 * Caps how many search results are admitted per (bucket, ranking-id) pair,
 * so a single domain cannot dominate the result set.
 */
public class ResultDomainDeduplicator {
    // occurrence count per (bucket, ranking) key; primitive trove map avoids boxing
    final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
    final int limitByDomain;

    public ResultDomainDeduplicator(int limitByDomain) {
        this.limitByDomain = limitByDomain;
    }

    /**
     * Pre-filter on the raw combined value; does not count the value,
     * only checks the current tally against the limit.
     */
    public boolean filterRawValue(int bucket, long value) {
        final int domain = (int) (value >>> 32);

        // Integer.MAX_VALUE appears to mark "no ranking id" -- such values always pass
        if (domain == Integer.MAX_VALUE) {
            return true;
        }

        return resultsByRankingId.get(getKey(bucket, domain)) <= limitByDomain;
    }

    long getKey(int bucketId, int rankingId) {
        return ((long) bucketId) << 32 | rankingId;
    }

    long getKey(EdgeSearchResultItem item) {
        return ((long) item.bucketId) << 32 | item.getRanking();
    }

    /** Counts the item and reports whether it is still within the per-domain limit. */
    public boolean test(EdgeSearchResultItem item) {
        if (item.getRanking() == Integer.MAX_VALUE) {
            return true;
        }

        final long key = getKey(item);
        return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
    }

    /** Counts every item without filtering. */
    public void addAll(List<EdgeSearchResultItem> items) {
        for (var item : items) {
            add(item);
        }
    }

    /** Counts a single item without filtering. */
    public void add(EdgeSearchResultItem item) {
        resultsByRankingId.adjustOrPutValue(getKey(item), 1, 1);
    }
}

View File

@ -9,8 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -77,25 +78,25 @@ public class SearchIndex implements AutoCloseable {
return rangeForWord(pool, wordId).numEntries();
}
public UrlIndexTree rangeForWord(IndexQueryCachePool pool, int wordId) {
UrlIndexTree range = pool.getRange(words, wordId);
public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
IndexBTreeRange range = pool.getRange(words, wordId);
if (range == null) {
range = new UrlIndexTree(words.positionForWord(wordId));
range = new IndexBTreeRange(words.positionForWord(wordId));
pool.cacheRange(words, wordId, range);
}
return range;
}
public UrlIndexTree rangeForWord(int wordId) {
return new UrlIndexTree(words.positionForWord(wordId));
public IndexBTreeRange rangeForWord(int wordId) {
return new IndexBTreeRange(words.positionForWord(wordId));
}
public class UrlIndexTree {
final long dataOffset;
public class IndexBTreeRange {
public final long dataOffset;
private BTreeHeader header;
public UrlIndexTree(long dataOffset) {
public IndexBTreeRange(long dataOffset) {
this.dataOffset = dataOffset;
}
@ -126,7 +127,7 @@ public class SearchIndex implements AutoCloseable {
return new AsEntrySource();
}
public QueryFilterStep asExcludeFilterStep(IndexQueryCachePool pool) {
public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
return new AsExcludeQueryFilterStep(pool);
}
@ -150,7 +151,7 @@ public class SearchIndex implements AutoCloseable {
}
}
public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
if (dataOffset < 0) return false;
return cachingBTreeReader.findEntry(cache, url) >= 0;
@ -160,12 +161,12 @@ public class SearchIndex implements AutoCloseable {
if (dataOffset < 0)
return false;
CachingBTreeReader.Cache cache = pool.getIndexCache(SearchIndex.this, this);
CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
return cachingBTreeReader.findEntry(cache, url) >= 0;
}
public CachingBTreeReader.Cache createIndexCache() {
public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
if (dataOffset < 0)
return null;
@ -213,11 +214,11 @@ public class SearchIndex implements AutoCloseable {
}
}
class AsExcludeQueryFilterStep implements QueryFilterStep {
private final CachingBTreeReader.Cache cache;
class AsExcludeQueryFilterStep implements QueryFilterStepIf {
private final CachingBTreeReader.BTreeCachedIndex cache;
public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
cache = pool.getIndexCache(SearchIndex.this, UrlIndexTree.this);
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
}
public SearchIndex getIndex() {

View File

@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -105,7 +105,7 @@ public class SearchIndexes {
}
@Nullable
public KeywordLexiconReadOnlyView getDictionaryReader() {
public KeywordLexiconReadOnlyView getLexiconReader() {
return keywordLexiconReadOnlyView;
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query;
import java.util.stream.LongStream;
/**
 * Fluent query over a stream of document ids, refined by requiring or
 * excluding words.
 */
public interface Query {
/** Null-object query: matches nothing and ignores all refinements. */
Query EMPTY = new Query() {
@Override
public Query also(int wordId) { return this; }
@Override
public Query alsoCached(int wordId) { return this; }
@Override
public Query not(int wordId) { return this; }
@Override
public LongStream stream() { return LongStream.empty(); }
};
/** Restrict to documents that also contain wordId. */
Query also(int wordId);
/** Restrict to documents that also contain wordId; presumably a cached variant of also() -- confirm with implementations. */
Query alsoCached(int wordId);
/** Exclude documents containing wordId. */
Query not(int wordId);
/** Stream of matching document ids. */
LongStream stream();
}

View File

@ -1,125 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import javax.annotation.Nullable;
import java.util.List;
import java.util.StringJoiner;
/**
 * One step in a query filter pipeline: tests candidate document ids and can
 * compact an array of candidates in place. Steps are ordered cheapest-first
 * via their cost estimate.
 */
public interface QueryFilterStep extends Comparable<QueryFilterStep> {
    /** The index this step reads from, or null for synthetic/composite steps. */
    @Nullable
    SearchIndex getIndex();

    /** @return true if the value passes this filter */
    boolean test(long value);

    /** Relative cost estimate used to order steps cheapest-first. */
    double cost();

    /**
     * Orders steps by ascending cost.
     * Fixed to use Double.compare: the previous {@code (int)(cost() - other.cost())}
     * truncated sub-integer cost differences to 0 (reporting unequal steps as equal)
     * and could overflow the int cast for very large costs.
     */
    default int compareTo(QueryFilterStep other) {
        return Double.compare(cost(), other.cost());
    }

    /** Human-readable description for diagnostics. */
    String describe();

    /**
     * Move each value passing the test to the beginning of the array,
     * and return the number of matching items.
     *
     * The remaining values are undefined.
     */
    default int retainDestructive(long[] items, int max) {
        int keep = 0;
        for (int i = 0; i < max; i++) {
            if (test(items[i])) {
                if (i != keep) {
                    items[keep] = items[i];
                }
                keep++;
            }
        }
        return keep;
    }

    /**
     * Move each value passing the test to the beginning of the array,
     * and return the number of matching items. The values that do
     * not pass the test are moved to the end of the array.
     *
     * NOTE(review): keep starts at 0 even when start &gt; 0, so passing values
     * are swapped toward the front of the whole array rather than the
     * [start, max) window -- confirm this is the contract callers rely on.
     */
    default int retainReorder(long[] items, int start, int max) {
        int keep = 0;
        for (int i = start; i < max; i++) {
            if (test(items[i])) {
                if (i != keep) {
                    long tmp = items[keep];
                    items[keep] = items[i];
                    items[i] = tmp;
                }
                keep++;
            }
        }
        return keep;
    }

    /** A filter that rejects every value. */
    static QueryFilterStep noPass() {
        return NoPassFilter.instance;
    }

    /** A filter passing values accepted by any of the given steps. */
    static QueryFilterStep anyOf(List<? extends QueryFilterStep> steps) {
        return new AnyOfFilter(steps);
    }
}
/** Disjunction of filter steps: a value passes if any member step accepts it. */
class AnyOfFilter implements QueryFilterStep {
    private final List<? extends QueryFilterStep> alternatives;

    AnyOfFilter(List<? extends QueryFilterStep> steps) {
        this.alternatives = steps;
    }

    /** A composite step is not tied to any single index. */
    public SearchIndex getIndex() {
        return null;
    }

    /** Average of the member costs; zero when there are no members. */
    public double cost() {
        return alternatives.stream()
                .mapToDouble(QueryFilterStep::cost)
                .average()
                .orElse(0.);
    }

    @Override
    public boolean test(long value) {
        // anyMatch short-circuits on the first accepting step, like the original loop
        return alternatives.stream().anyMatch(step -> step.test(value));
    }

    public String describe() {
        StringJoiner description = new StringJoiner(",", "[Any Of: ", "]");
        for (QueryFilterStep step : alternatives) {
            description.add(step.describe());
        }
        return description.toString();
    }
}
/** Singleton filter step that rejects every value; used as the no-results sentinel. */
class NoPassFilter implements QueryFilterStep {
    static final QueryFilterStep instance = new NoPassFilter();

    /** Not tied to any index. */
    public SearchIndex getIndex() {
        return null;
    }

    /** Rejecting everything costs nothing. */
    public double cost() {
        return 0.;
    }

    @Override
    public boolean test(long value) {
        return false;
    }

    /** Nothing ever passes, so retention is always empty. */
    public int retainDestructive(long[] items, int max) {
        return 0;
    }

    public int retainReorder(long[] items, int start, int max) {
        return 0;
    }

    public String describe() {
        return "[NoPass]";
    }
}

View File

@ -0,0 +1,107 @@
package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.apache.http.HttpStatus;
import spark.Request;
import spark.Response;
import java.util.Arrays;
import java.util.List;
/**
 * Index service endpoints for the keyword lexicon: resolving words to
 * lexicon ids and inserting document keywords into the index journal.
 */
@Singleton
public class EdgeIndexLexiconService {
    private final SearchIndexes indexes;
    private final KeywordLexicon keywordLexicon;

    @Inject
    public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) {
        this.indexes = indexes;
        this.keywordLexicon = servicesFactory.getKeywordLexicon();
    }

    /**
     * Resolves a word (taken from the request's splat path segment) to its lexicon id.
     * Responds SC_FAILED_DEPENDENCY while the lexicon reader is unavailable,
     * and 404 for words the lexicon does not contain.
     */
    public Object getWordId(Request request, Response response) {
        final String word = request.splat()[0];

        var lr = indexes.getLexiconReader();
        if (null == lr) {
            response.status(HttpStatus.SC_FAILED_DEPENDENCY);
            return "";
        }

        final int wordId = lr.get(word);

        if (DictionaryHashMap.NO_VALUE == wordId) {
            response.status(404);
            return "";
        }

        return wordId;
    }

    /**
     * Accepts a protobuf IndexPutKeywordsReq and writes each contained word set
     * to the journal of the index writer the request selects. Responds SC_ACCEPTED.
     */
    public Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
        var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());

        EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
        EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
        int idx = req.getIndex();

        for (int ws = 0; ws < req.getWordSetCount(); ws++) {
            putWords(domainId, urlId, req.getWordSet(ws), idx);
        }

        response.status(HttpStatus.SC_ACCEPTED);
        return "";
    }

    /** Writes one word set for a document to the journal of index writer {@code idx}. */
    public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
                         IndexPutKeywordsReq.WordSet words, int idx
                         ) {
        SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
        IndexBlock block = IndexBlock.values()[words.getIndex()];

        // Journal entries have a fixed maximum length, so long word lists are chunked
        for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
            var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

            indexWriter.put(header, entry);
        }
    }

    /**
     * Maps words to lexicon ids, inserting unknown words. Words the lexicon
     * rejects (NO_VALUE) are dropped and the id array is trimmed accordingly.
     */
    private long[] getOrInsertWordIds(List<String> words) {
        long[] ids = new long[words.size()];
        int putIdx = 0;

        for (String word : words) {
            long id = keywordLexicon.getOrInsert(word);
            if (id != DictionaryHashMap.NO_VALUE) {
                ids[putIdx++] = id;
            }
        }

        if (putIdx != words.size()) {
            ids = Arrays.copyOf(ids, putIdx);
        }

        return ids;
    }
}

View File

@ -0,0 +1,44 @@
package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import spark.Request;
import spark.Response;
import spark.Spark;
/**
 * Operational endpoints for the index service: repartitioning, preconversion
 * and reindexing of a single index bucket.
 */
@Singleton
public class EdgeIndexOpsService {

    private final SearchIndexes indexes;

    @Inject
    public EdgeIndexOpsService(SearchIndexes indexes) {
        this.indexes = indexes;
    }

    public Object repartitionEndpoint(Request request, Response response) {
        return acceptedOrHalt(indexes.repartition());
    }

    public Object preconvertEndpoint(Request request, Response response) {
        return acceptedOrHalt(indexes.preconvert());
    }

    /** Reindexes the bucket named by the :id path parameter. */
    public Object reindexEndpoint(Request request, Response response) {
        int id = Integer.parseInt(request.params("id"));

        return acceptedOrHalt(indexes.reindex(id));
    }

    /**
     * Shared response logic for all three ops endpoints: halt with 503 when
     * the operation was rejected because another operation is in progress.
     */
    private Object acceptedOrHalt(boolean accepted) {
        if (!accepted) {
            Spark.halt(503, "Operations busy");
        }
        return "OK";
    }
}

View File

@ -0,0 +1,320 @@
package nu.marginalia.wmsa.edge.index.svc;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.*;
import java.util.function.LongPredicate;
import static spark.Spark.halt;
/** HTTP-facing query service for the edge index.
 *
 *  Exposes two spark endpoints (search, searchDomain) that deserialize a JSON
 *  specification, run the query under a Prometheus timer, and translate failures
 *  into 500 responses. Actual query evaluation lives in the nested SearchQuery.
 */
@Singleton
public class EdgeIndexQueryService {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    // Wall-clock budget for a single query; exceeded budgets are counted, not failed.
    private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
    // Upper bound on raw results gathered before scoring/sorting.
    private static final int QUERY_FETCH_SIZE = 8192;
    // Per-domain cap applied during the first (raw) result pass.
    private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;

    private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
    private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
    private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register();

    private final Gson gson = GsonFactory.get();

    private final SearchIndexes indexes;

    @Inject
    public EdgeIndexQueryService(SearchIndexes indexes) {
        this.indexes = indexes;
    }

    /** Spark endpoint: domain search. Body is a JSON EdgeDomainSearchSpecification.
     *  Replies 503 while the lexicon is still loading, 500 on unexpected errors. */
    public Object searchDomain(Request request, Response response) {
        if (indexes.getLexiconReader() == null) {
            logger.warn("Dictionary reader not yet initialized");
            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
        }

        String json = request.body();
        EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

        try {
            return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
        }
        catch (HaltException ex) {
            // Deliberate halts (e.g. from nested endpoints) must propagate unchanged.
            logger.warn("Halt", ex);
            throw ex;
        }
        catch (Exception ex) {
            logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
            logger.info("Error", ex);
            Spark.halt(500, "Error");
            return null;
        }
    }

    /** Spark endpoint: full search. Body is a JSON EdgeSearchSpecification.
     *  Replies 503 while the lexicon is still loading, 500 on unexpected errors. */
    public Object search(Request request, Response response) {
        if (indexes.getLexiconReader() == null) {
            logger.warn("Dictionary reader not yet initialized");
            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
        }

        String json = request.body();
        EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);

        try {
            return wmsa_edge_index_query_time.time(() -> query(specsSet));
        }
        catch (HaltException ex) {
            logger.warn("Halt", ex);
            throw ex;
        }
        catch (Exception ex) {
            logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
            logger.info("Error", ex);
            Spark.halt(500, "Error");
            return null;
        }
    }

    /** Runs a full search query; one SearchQuery instance per invocation. */
    public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) {
        List<EdgeSearchResultItem> results = new SearchQuery(specsSet).execute();
        return new EdgeSearchResultSet(results);
    }

    /** Finds "hot" urls for a single keyword within one bucket.
     *  An unknown keyword yields an empty id list rather than an error. */
    public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {

        final OptionalInt wordId = lookUpWord(specsSet.keyword);

        EdgeIdList<EdgeUrl> urlIds;

        if (wordId.isEmpty()) {
            urlIds = new EdgeIdList<>();
        } else {
            // Lower 32 bits of the combined id are the url id.
            urlIds = indexes
                    .getBucket(specsSet.bucket)
                    .findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
                    .mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL))
                    .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
        }

        return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
    }

    /** Per-request query evaluator: gathers raw results from all subqueries and
     *  buckets, scores them, deduplicates by domain, and returns the sorted top slice. */
    private class SearchQuery {
        // De-duplicates results across subqueries by the low 32 bits of the combined id.
        private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
        private final EdgeSearchSpecification specsSet;
        private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
        // Shared btree-read cache for the lifetime of this query.
        private final IndexQueryCachePool cachePool = new IndexQueryCachePool();

        public SearchQuery(EdgeSearchSpecification specsSet) {
            this.specsSet = specsSet;
        }

        private List<EdgeSearchResultItem> execute() {
            final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);

            for (var sq : specsSet.subqueries) {
                results.addAll(performSearch(sq));
            }

            for (var result : results) {
                addResultScores(result);
            }

            if (!budget.hasTimeLeft()) {
                wmsa_edge_index_query_timeouts.inc();
            }

            var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);

            if (WmsaHome.isDebug()) {
                cachePool.printSummary(logger);
            }
            cachePool.clear();

            // Ascending sort: lower score is better (see addResultScores).
            return results.stream()
                    .sorted(Comparator.comparing(EdgeSearchResultItem::getScore))
                    .filter(domainCountFilter::test)
                    .limit(specsSet.getLimitTotal()).toList();
        }

        /** Gathers up to QUERY_FETCH_SIZE raw results for one subquery across all buckets,
         *  applying a per-domain first-pass cap and the global time budget. */
        private List<EdgeSearchResultItem> performSearch(EdgeSearchSubquery sq)
        {
            final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
            final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);

            if (searchTerms.isEmpty())
                return Collections.emptyList();

            for (int indexBucket : specsSet.buckets) {
                final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);

                if (!budget.hasTimeLeft()) {
                    logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
                    continue;
                }

                if (QUERY_FETCH_SIZE <= results.size())
                    break;

                IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
                long[] buf = new long[8192];

                while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) {
                    int cnt = query.getMoreResults(buf, budget);

                    for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) {
                        final long id = buf[i];

                        // Skip results already seen in an earlier subquery/bucket,
                        // or over the per-domain first-pass limit.
                        if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
                            continue;
                        }

                        results.add(new EdgeSearchResultItem(indexBucket, id));
                    }
                }
            }

            return results;
        }

        private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
                                    LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
            if (!indexes.isValidBucket(bucket)) {
                logger.warn("Invalid bucket {}", bucket);
                return new IndexQuery(Collections.emptyList());
            }

            return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
        }

        /** Scores one result against every distinct include-term set and keeps the best
         *  (lowest) average set score.
         *
         *  NOTE(review): bestScore starts at 0 and only decreases via Math.min(), so a
         *  set with a positive average score can never become the best — this assumes
         *  set scores are non-positive (lower = better); confirm against
         *  EdgeSearchResultKeywordScore.value(). */
        private void addResultScores(EdgeSearchResultItem searchResult) {
            final var reader = Objects.requireNonNull(indexes.getLexiconReader());

            List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();

            // Memoize calls to getTermData, as they're somewhat expensive and highly redundant
            Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);

            double bestScore = 0;

            for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
                double setScore = 0;
                int setSize = 0;
                for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
                    final int termId = reader.get(searchTerm);

                    ResultTermData data = termMetadata.computeIfAbsent(
                            new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);

                    var score = data.asScore(searchTermListIdx, searchTerm);
                    searchResult.scores.add(score);
                    setScore += score.value();
                    setSize++;
                }
                bestScore = Math.min(bestScore, setScore/setSize);
            }

            searchResult.setScore(bestScore);
        }

        /** Looks up term/document metadata: the best block the term appears in,
         *  plus per-block membership flags consumed by ResultTermData.asScore. */
        private ResultTermData getTermData(ResultTerm resultTerm) {
            final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
            final int termId = resultTerm.termId;
            final long combinedUrlId = resultTerm.combinedUrlId;

            return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
            );
        }

        // Memoization key for getTermData.
        record ResultTerm (int bucket, int termId, long combinedUrlId) {}

        // Term/document metadata used to derive a keyword score.
        record ResultTermData (IndexBlock index,
                               boolean title,
                               boolean link,
                               boolean site,
                               boolean subject,
                               boolean name,
                               boolean high,
                               boolean mid,
                               boolean low
        ) {
            public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
                return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
            }
        }
    }

    /** Translates a subquery's words into lexicon term ids.
     *
     *  NOTE(review): when an include term is unknown, this returns the partially built
     *  term set (whatever includes were resolved so far) rather than an empty one; the
     *  caller only checks isEmpty(), so a subquery with one resolved include and one
     *  unknown include would still run with fewer required terms — verify intended. */
    private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
        final List<Integer> excludes = new ArrayList<>();
        final List<Integer> includes = new ArrayList<>();

        for (var include : request.searchTermsInclude) {
            var word = lookUpWord(include);
            if (word.isEmpty()) {
                logger.debug("Unknown search term: " + include);
                return new EdgeIndexSearchTerms(includes, excludes);
            }
            includes.add(word.getAsInt());
        }

        for (var exclude : request.searchTermsExclude) {
            lookUpWord(exclude).ifPresent(excludes::add);
        }

        return new EdgeIndexSearchTerms(includes, excludes);
    }

    /** Lexicon lookup; empty when the word is not in the dictionary. */
    private OptionalInt lookUpWord(String s) {
        int ret = indexes.getLexiconReader().get(s);
        if (ret == DictionaryHashMap.NO_VALUE) {
            return OptionalInt.empty();
        }
        return OptionalInt.of(ret);
    }
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.wmsa.edge.index.reader.query;
package nu.marginalia.wmsa.edge.index.svc.query;
import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import java.util.ArrayList;
import java.util.List;
@ -10,18 +10,18 @@ import static java.lang.Math.min;
public class IndexQuery {
private final List<EntrySource> sources;
private final List<QueryFilterStep> inclusionFilter = new ArrayList<>(10);
private final List<QueryFilterStep> priorityFilter = new ArrayList<>(10);
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
private final List<QueryFilterStepIf> priorityFilter = new ArrayList<>(10);
public IndexQuery(List<EntrySource> sources) {
this.sources = sources;
}
public void addInclusionFilter(QueryFilterStep filter) {
public void addInclusionFilter(QueryFilterStepIf filter) {
inclusionFilter.add(filter);
}
public void addPriorityFilter(QueryFilterStep filter) {
public void addPriorityFilter(QueryFilterStepIf filter) {
priorityFilter.add(filter);
}

View File

@ -1,17 +1,19 @@
package nu.marginalia.wmsa.edge.index.reader;
package nu.marginalia.wmsa.edge.index.svc.query;
import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import org.slf4j.Logger;
import java.util.HashMap;
import java.util.Map;
public class IndexQueryCachePool {
private final Map<PoolKey, CachingBTreeReader.Cache> indexCaches = new HashMap<>();
private final Map<RangeKey, SearchIndex.UrlIndexTree> rangeCache = new HashMap<>();
private final Map<PoolKey, CachingBTreeReader.BTreeCachedIndex> indexCaches = new HashMap<>();
private final Map<RangeKey, SearchIndex.IndexBTreeRange> rangeCache = new HashMap<>();
private final Map<PoolKey, Integer> savedCounts = new HashMap<>();
public CachingBTreeReader.Cache getIndexCache(SearchIndex index, SearchIndex.UrlIndexTree range) {
public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) {
var key = new PoolKey(index, range.dataOffset);
var entry = indexCaches.get(key);
@ -33,10 +35,10 @@ public class IndexQueryCachePool {
}
public void printSummary(Logger logger) {
long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.Cache::sizeBytes).sum();
long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum();
long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();
long loaded = indexCaches.values().stream().filter(CachingBTreeReader.Cache::isLoaded).count();
long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count();
logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
}
@ -45,11 +47,11 @@ public class IndexQueryCachePool {
indexCaches.clear();
}
public SearchIndex.UrlIndexTree getRange(IndexWordsTable words, int wordId) {
public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) {
return rangeCache.get(new RangeKey(words, wordId));
}
public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.UrlIndexTree range) {
public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) {
rangeCache.put(new RangeKey(words, wordId), range);
}

View File

@ -1,11 +1,10 @@
package nu.marginalia.wmsa.edge.index.reader.query;
package nu.marginalia.wmsa.edge.index.svc.query;
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.reader.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStep;
import nu.marginalia.wmsa.edge.index.reader.query.types.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.reader.query.types.UrlRangeSubFilter;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import java.util.*;
import java.util.function.LongPredicate;
@ -57,21 +56,21 @@ public class IndexQueryFactory {
}
public IndexQueryBuilder also(int termId) {
List<QueryFilterStep> filters = new ArrayList<>(requiredIndices.size());
List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());
for (var ri : requiredIndices) {
var range = ri.rangeForWord(cachePool, termId);
if (range.isPresent()) {
filters.add(new UrlRangeSubFilter(ri, range, cachePool));
filters.add(new QueryFilterBTreeRange(ri, range, cachePool));
}
else {
filters.add(QueryFilterStep.noPass());
filters.add(QueryFilterStepIf.noPass());
}
}
filters.sort(Comparator.naturalOrder());
query.addInclusionFilter(QueryFilterStep.anyOf(filters));
query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));
return this;
}
@ -92,7 +91,7 @@ public class IndexQueryFactory {
for (var idx : priortyIndices) {
var range = idx.rangeForWord(cachePool, termId);
if (range.isPresent()) {
query.addPriorityFilter(new UrlRangeSubFilter(idx, range, cachePool));
query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool));
}
}
}

View File

@ -0,0 +1,26 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import java.util.stream.LongStream;
/** Fluent interface for building and streaming an index query.
 *
 *  also/alsoCached/not narrow the query by requiring or excluding a term id;
 *  stream() yields the matching entries as longs.
 */
public interface IndexQueryIf {

    /** Null-object query: all refinements are no-ops and stream() is empty. */
    IndexQueryIf EMPTY = new IndexQueryIf() {
        @Override
        public IndexQueryIf also(int wordId) { return this; }

        @Override
        public IndexQueryIf alsoCached(int wordId) { return this; }

        @Override
        public IndexQueryIf not(int wordId) { return this; }

        @Override
        public LongStream stream() { return LongStream.empty(); }
    };

    /** Requires the given term id to be present. */
    IndexQueryIf also(int wordId);

    /** Requires the given term id to be present, using cached index data. */
    IndexQueryIf alsoCached(int wordId);

    /** Requires the given term id to be absent. */
    IndexQueryIf not(int wordId);

    /** Streams the matching entries. */
    LongStream stream();
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.wmsa.edge.index.reader.query;
package nu.marginalia.wmsa.edge.index.svc.query;
public class IndexSearchBudget {

View File

@ -0,0 +1,45 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
/** Caps the number of results admitted per ranking id (domain ranking).
 *
 *  The ranking id is the upper 32 bits of a combined result id; the sentinel
 *  Integer.MAX_VALUE ranking is never limited. filterRawValue() is a cheap
 *  non-counting pre-check; test() counts and enforces the limit.
 */
public class ResultDomainDeduplicator {
    // noEntryKey = -1, noEntryValue = 0: absent rankings count as zero.
    final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
    final int limitByDomain;

    public ResultDomainDeduplicator(int limitByDomain) {
        this.limitByDomain = limitByDomain;
    }

    /** Non-mutating check against the count accumulated so far. */
    public boolean filterRawValue(long value) {
        final int rankingId = (int) (value >>> 32);

        return rankingId == Integer.MAX_VALUE
            || resultsByRankingId.get(getKey(rankingId)) <= limitByDomain;
    }

    long getKey(int rankingId) {
        return rankingId;
    }

    /** Counts the value's ranking and admits it while under the limit. */
    public boolean test(long value) {
        return countAndAdmit((int) (value >>> 32));
    }

    /** Counts the item's ranking and admits it while under the limit. */
    public boolean test(EdgeSearchResultItem item) {
        return countAndAdmit(item.getRanking());
    }

    // Shared increment-and-check; the sentinel ranking bypasses the limit.
    private boolean countAndAdmit(int rankingId) {
        if (rankingId == Integer.MAX_VALUE) {
            return true;
        }

        return resultsByRankingId.adjustOrPutValue(rankingId, 1, 1) <= limitByDomain;
    }
}

View File

@ -1,8 +1,9 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
package nu.marginalia.wmsa.edge.index.svc.query.types;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
public interface EntrySource {
SearchIndex getIndex();
int read(long[] buffer, int n);
}

View File

@ -0,0 +1,39 @@
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import java.util.List;
import java.util.StringJoiner;
/** Composite filter that passes a value if ANY of its component steps passes it.
 *  Cost is the average of the component costs; getIndex() is null since the
 *  composite is not tied to a single index.
 */
class QueryFilterAnyOf implements QueryFilterStepIf {
    private final List<? extends QueryFilterStepIf> steps;

    QueryFilterAnyOf(List<? extends QueryFilterStepIf> steps) {
        this.steps = steps;
    }

    public SearchIndex getIndex() {
        return null;
    }

    public double cost() {
        // Average cost of components; an empty composite costs nothing.
        if (steps.isEmpty()) {
            return 0.;
        }

        double totalCost = 0.;
        for (var step : steps) {
            totalCost += step.cost();
        }
        return totalCost / steps.size();
    }

    @Override
    public boolean test(long value) {
        // Short-circuit on the first component that accepts the value.
        for (int i = 0; i < steps.size(); i++) {
            if (steps.get(i).test(value)) {
                return true;
            }
        }
        return false;
    }

    public String describe() {
        final var joiner = new StringJoiner(",", "[Any Of: ", "]");
        steps.forEach(step -> joiner.add(step.describe()));
        return joiner.toString();
    }
}

View File

@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.wmsa.edge.index.reader.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import org.jetbrains.annotations.Nullable;
public record UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, CachingBTreeReader.Cache cache) implements QueryFilterStep {
public record QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, CachingBTreeReader.BTreeCachedIndex cache) implements QueryFilterStepIf {
public UrlRangeSubFilter(SearchIndex source, SearchIndex.UrlIndexTree range, IndexQueryCachePool pool) {
public QueryFilterBTreeRange(SearchIndex source, SearchIndex.IndexBTreeRange range, IndexQueryCachePool pool) {
this(source, range, pool.getIndexCache(source, range));
}

View File

@ -0,0 +1,33 @@
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
/** Filter that rejects everything; shared via {@link #instance} and obtained
 *  through QueryFilterStepIf.noPass(). Used where a term has no index range,
 *  so no value can possibly match.
 */
class QueryFilterNoPass implements QueryFilterStepIf {
    // Stateless, so a single shared instance suffices.
    static final QueryFilterStepIf instance = new QueryFilterNoPass();

    @Override
    public boolean test(long value) {
        return false;
    }

    // Not associated with any index.
    public SearchIndex getIndex() {
        return null;
    }

    // Rejecting is free.
    public double cost() {
        return 0.;
    }

    // Nothing is ever retained.
    public int retainDestructive(long[] items, int max) {
        return 0;
    }

    public int retainReorder(long[] items, int start, int max) {
        return 0;
    }

    public String describe() {
        return "[NoPass]";
    }
}

View File

@ -1,11 +1,11 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import org.jetbrains.annotations.Nullable;
import java.util.function.LongPredicate;
public class QueryFilterStepFromPredicate implements QueryFilterStep {
public class QueryFilterStepFromPredicate implements QueryFilterStepIf {
private final LongPredicate pred;
public QueryFilterStepFromPredicate(LongPredicate pred) {

View File

@ -0,0 +1,71 @@
package nu.marginalia.wmsa.edge.index.svc.query.types.filter;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import javax.annotation.Nullable;
import java.util.List;
/** One filter stage of an index query pipeline.
 *
 *  Filters are ordered by cost() so the cheapest run first; implementations
 *  provide test(), and the retain* defaults apply the test over a buffer of
 *  candidate values.
 */
public interface QueryFilterStepIf extends Comparable<QueryFilterStepIf> {
    /** The index this filter reads from, or null for synthetic filters
     *  (see noPass()/anyOf()). */
    @Nullable
    SearchIndex getIndex();

    /** @return true if the value passes this filter */
    boolean test(long value);

    /** Relative cost of applying this filter; used for ordering. */
    double cost();

    /** Orders filters by ascending cost.
     *
     *  Fixed: the previous implementation cast the double difference to int,
     *  which truncated any sub-integer cost difference to 0 ("equal") and could
     *  overflow for large costs, violating the Comparable contract and making
     *  cost-based sorting unreliable. Double.compare handles both correctly.
     */
    default int compareTo(QueryFilterStepIf other) {
        return Double.compare(cost(), other.cost());
    }

    /** Human-readable description for diagnostics. */
    String describe();

    /**
     * Move each value in items to the beginning of the array,
     * and return the number of matching items.
     *
     * The remaining values are undefined.
     */
    default int retainDestructive(long[] items, int max) {
        int keep = 0;
        for (int i = 0; i < max; i++) {
            if (test(items[i])) {
                if (i != keep) {
                    items[keep] = items[i];
                }
                keep++;
            }
        }
        return keep;
    }

    /**
     * Move each value in items to the beginning of the array,
     * and return the number of matching items. The values that do
     * not pass the test are moved to the end of the array.
     *
     * NOTE(review): keep starts at 0 while iteration starts at {@code start},
     * so with start &gt; 0 matching items are swapped toward the head of the
     * array, ahead of the untested prefix — confirm callers rely on this.
     */
    default int retainReorder(long[] items, int start, int max) {
        int keep = 0;
        for (int i = start; i < max; i++) {
            if (test(items[i])) {
                if (i != keep) {
                    long tmp = items[keep];
                    items[keep] = items[i];
                    items[i] = tmp;
                }
                keep++;
            }
        }
        return keep;
    }

    /** Shared filter that rejects all values. */
    static QueryFilterStepIf noPass() {
        return QueryFilterNoPass.instance;
    }

    /** Composite filter passing values accepted by any of the given steps. */
    static QueryFilterStepIf anyOf(List<? extends QueryFilterStepIf> steps) {
        return new QueryFilterAnyOf(steps);
    }
}

View File

@ -12,14 +12,14 @@ import java.util.List;
@AllArgsConstructor @ToString @Getter
public class EdgeSearchResultItem {
public final int bucketId;
public final long combinedId; // this isn't the external domain ID, but a ranking
public final long combinedId;
public final List<EdgeSearchResultKeywordScore> scores;
public EdgeSearchResultItem(int bucketId, long val) {
this.bucketId = bucketId;
combinedId = val;
scores = new ArrayList<>(16);
this.combinedId = val;
this.scores = new ArrayList<>(16);
}
public EdgeId<EdgeUrl> getUrlId() {
@ -33,6 +33,7 @@ public class EdgeSearchResultItem {
return (int)(combinedId >>> 32);
}
/* Used for evaluation */
private transient double scoreValue = 1;
public void setScore(double score) {
scoreValue = score;

View File

@ -4,10 +4,10 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdArray;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
@AllArgsConstructor @Getter @ToString
public class EdgeDomainSearchResults {
public final String keyword;
public final EdgeIdArray<EdgeUrl> results;
public final EdgeIdList<EdgeUrl> results;
}

View File

@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.search;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Summary;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.wmsa.configuration.server.Context;
@ -18,7 +17,6 @@ import nu.marginalia.wmsa.edge.model.id.EdgeIdSet;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResults;
import nu.marginalia.wmsa.edge.search.query.QueryFactory;
import nu.marginalia.wmsa.edge.search.query.model.EdgeSearchQuery;
@ -50,8 +48,6 @@ public class EdgeSearchOperator {
private final SearchResultDecorator resultDecorator;
private final Comparator<EdgeUrlDetails> resultListComparator;
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
@Inject
public EdgeSearchOperator(AssistantClient assistantClient,
EncyclopediaClient encyclopediaClient,
@ -81,9 +77,7 @@ public class EdgeSearchOperator {
logger.info("Human terms (API): {}", Strings.join(processedQuery.searchTermsHuman, ','));
DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery);
return queryResults.resultSet;
return performQuery(ctx, processedQuery);
}
public DecoratedSearchResults doSearch(Context ctx, EdgeUserSearchParameters params, @Nullable Future<String> eval) {
@ -91,23 +85,25 @@ public class EdgeSearchOperator {
Observable<WikiArticles> definitions = getWikiArticle(ctx, params.humanQuery());
EdgeSearchQuery processedQuery = queryFactory.createQuery(params);
logger.info("Human terms: {}", Strings.join(processedQuery.searchTermsHuman, ','));
DecoratedSearchResultSet queryResults = performQuery(ctx, processedQuery);
List<EdgeUrlDetails> queryResults = performQuery(ctx, processedQuery);
String evalResult = getEvalResult(eval);
List<BrowseResult> domainResults = getDomainResults(ctx, processedQuery.specs);
WikiArticles wikiArticles = definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst();
return new DecoratedSearchResults(params,
getProblems(ctx, evalResult, queryResults, processedQuery),
evalResult,
definitions.onErrorReturn((e) -> new WikiArticles()).blockingFirst(),
queryResults.resultSet,
domainResults,
processedQuery.domain,
getDomainId(processedQuery.domain));
return DecoratedSearchResults.builder()
.params(params)
.problems(getProblems(ctx, evalResult, queryResults, processedQuery))
.evalResult(evalResult)
.wiki(wikiArticles)
.results(queryResults)
.domainResults(domainResults)
.focusDomain(processedQuery.domain)
.focusDomainId(getDomainId(processedQuery.domain))
.build();
}
private List<BrowseResult> getDomainResults(Context ctx, EdgeSearchSpecification specs) {
@ -169,7 +165,7 @@ public class EdgeSearchOperator {
return domainId;
}
public DecoratedSearchResultSet performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
public List<EdgeUrlDetails> performDumbQuery(Context ctx, EdgeSearchProfile profile, IndexBlock block, int limitPerDomain, int limitTotal, String... termsInclude) {
List<EdgeSearchSubquery> sqs = new ArrayList<>();
sqs.add(new EdgeSearchSubquery(Arrays.asList(termsInclude), Collections.emptyList(), block));
@ -179,11 +175,13 @@ public class EdgeSearchOperator {
return performQuery(ctx, new EdgeSearchQuery(specs));
}
private DecoratedSearchResultSet performQuery(Context ctx, EdgeSearchQuery processedQuery) {
private List<EdgeUrlDetails> performQuery(Context ctx, EdgeSearchQuery processedQuery) {
List<EdgeUrlDetails> resultList = new ArrayList<>(processedQuery.specs.limitTotal);
final List<EdgeSearchResultItem> results = indexClient.query(ctx, processedQuery.specs);
for (var details : wmsa_search_index_api_time.time(()->fetchResultsSimple(ctx, processedQuery))) {
final List<EdgeUrlDetails> resultList = new ArrayList<>(results.size());
for (var details : resultDecorator.getAllUrlDetails(results)) {
if (details.getUrlQuality() <= -100) {
continue;
}
@ -208,10 +206,10 @@ public class EdgeSearchOperator {
}
}
return new DecoratedSearchResultSet(retList);
return retList;
}
private List<String> getProblems(Context ctx, String evalResult, DecoratedSearchResultSet queryResults, EdgeSearchQuery processedQuery) {
private List<String> getProblems(Context ctx, String evalResult, List<EdgeUrlDetails> queryResults, EdgeSearchQuery processedQuery) {
final List<String> problems = new ArrayList<>(processedQuery.problems);
boolean siteSearch = processedQuery.domain != null;
@ -305,15 +303,6 @@ public class EdgeSearchOperator {
;
}
private Set<EdgeUrlDetails> fetchResultsSimple(Context ctx, EdgeSearchQuery processedQuery) {
EdgeSearchResultSet resultSet = indexClient.query(ctx, processedQuery.specs);
var results = resultSet.getResults();
Set<EdgeUrlDetails> ret = new HashSet<>(resultDecorator.getAllUrlDetails(results));
return ret;
}
private Iterable<String> spellCheckTerms(Context ctx, EdgeSearchQuery disjointedQuery) {
return Observable.fromIterable(disjointedQuery.searchTermsHuman)
.subscribeOn(Schedulers.io())

View File

@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.search.command.SearchJsParameter;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.exceptions.RedirectException;
import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
import nu.marginalia.wmsa.edge.search.svc.EdgeSearchErrorPageService;
import nu.marginalia.wmsa.resource_store.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -40,6 +41,7 @@ public class EdgeSearchService extends Service {
private final WebsiteUrl websiteUrl;
private StaticResources staticResources;
private final EdgeSearchErrorPageService errorPageService;
private static final Logger logger = LoggerFactory.getLogger(EdgeSearchService.class);
@SneakyThrows
@ -53,7 +55,8 @@ public class EdgeSearchService extends Service {
CommandEvaluator searchCommandEvaulator,
WebsiteUrl websiteUrl,
StaticResources staticResources,
IndexCommand indexCommand) {
IndexCommand indexCommand,
EdgeSearchErrorPageService errorPageService) {
super(ip, port, initialization, metricsServer);
this.indexClient = indexClient;
@ -61,6 +64,7 @@ public class EdgeSearchService extends Service {
this.searchCommandEvaulator = searchCommandEvaulator;
this.websiteUrl = websiteUrl;
this.staticResources = staticResources;
this.errorPageService = errorPageService;
Spark.staticFiles.expireTime(600);
@ -79,7 +83,7 @@ public class EdgeSearchService extends Service {
Spark.exception(Exception.class, (e,p,q) -> {
logger.error("Error during processing", e);
serveError(Context.fromRequest(p), q);
errorPageService.serveError(Context.fromRequest(p), q);
});
Spark.awaitInitialization();
@ -104,26 +108,6 @@ public class EdgeSearchService extends Service {
}
private void serveError(Context ctx, Response rsp) {
boolean isIndexUp = indexClient.isAlive();
try {
if (!isIndexUp) {
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">offline</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
} else if (indexClient.isBlocked(ctx).blockingFirst()) {
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">starting up</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
}
else {
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"></head><body><article><h1>Error</h1><p>Oops! An unknown error occurred. The index server seems to be up, so I don't know why this is. Please send an email to kontakt@marginalia.nu telling me what you did :-) </p></body></html>");
}
}
catch (Exception ex) {
logger.error("Error", ex);
rsp.body("<html><head><title>Error</title><link rel=\"stylesheet\" href=\"https://www.marginalia.nu/style.css\"> <meta http-equiv=\"refresh\" content=\"5\"> </head><body><article><h1>Error</h1><p>Oops! It appears the index server is <span class=\"headline\">unresponsive</span>.</p> <p>The server was probably restarted to bring online some changes. Restarting the index typically takes a few minutes, during which searches can't be served. </p><p>This page will attempt to refresh automatically every few seconds.</p></body></html>");
}
}
@SneakyThrows
private Object apiSearch(Request request, Response response) {
@ -180,7 +164,7 @@ public class EdgeSearchService extends Service {
}
catch (Exception ex) {
logger.error("Error", ex);
serveError(ctx, response);
errorPageService.serveError(ctx, response);
}
return "";

View File

@ -5,11 +5,11 @@ import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.EdgeSearchOperator;
import nu.marginalia.wmsa.edge.search.EdgeSearchProfile;
import nu.marginalia.wmsa.edge.search.command.SearchCommandInterface;
import nu.marginalia.wmsa.edge.search.command.SearchParameters;
import nu.marginalia.wmsa.edge.search.model.DecoratedSearchResultSet;
import nu.marginalia.wmsa.edge.search.model.DomainInformation;
import nu.marginalia.wmsa.edge.search.siteinfo.DomainInformationService;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
@ -19,10 +19,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Collections;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
@ -59,7 +56,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
var results = siteInfo(ctx, query);
var domain = results.getDomain();
DecoratedSearchResultSet resultSet;
List<EdgeUrlDetails> resultSet;
Path screenshotPath = null;
if (null != domain) {
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);
@ -67,10 +64,10 @@ public class SiteSearchCommand implements SearchCommandInterface {
screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
}
else {
resultSet = new DecoratedSearchResultSet(Collections.emptyList());
resultSet = Collections.emptyList();
}
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet.resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
return Optional.of(siteInfoRenderer.render(results, Map.of("query", query, "focusDomain", Objects.requireNonNullElse(domain, ""), "profile", parameters.profileStr(), "results", resultSet, "screenshot", screenshotPath == null ? "" : screenshotPath.toString())));
}

View File

@ -1,22 +0,0 @@
package nu.marginalia.wmsa.edge.search.model;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import java.util.List;
import java.util.Objects;
/** Immutable holder for a list of decorated search result details. */
@ToString @Getter
public class DecoratedSearchResultSet {
    // Never null; enforced in the constructor.
    public final List<EdgeUrlDetails> resultSet;

    public DecoratedSearchResultSet(List<EdgeUrlDetails> resultSet) {
        this.resultSet = Objects.requireNonNull(resultSet);
    }

    /** @return the number of results in this set */
    public int size() {
        return resultSet.size();
    }
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.search.model;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import nu.marginalia.wmsa.edge.assistant.dict.WikiArticles;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
@ -8,7 +9,7 @@ import nu.marginalia.wmsa.edge.search.query.model.EdgeUserSearchParameters;
import java.util.List;
@AllArgsConstructor @Getter
@AllArgsConstructor @Getter @Builder
public class DecoratedSearchResults {
private final EdgeUserSearchParameters params;
private final List<String> problems;

View File

@ -13,7 +13,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
public class SearchResultDecorator {
@ -67,8 +66,7 @@ public class SearchResultDecorator {
if (!missedIds.isEmpty()) {
logger.debug("Could not look up documents: {}", missedIds.toArray());
}
retList.sort(Comparator.comparing(EdgeUrlDetails::getTermScore)
.thenComparing(url -> url.url.path.length()));
return retList;
}

View File

@ -0,0 +1,125 @@
package nu.marginalia.wmsa.edge.search.svc;
import com.google.inject.Inject;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
/**
 * Serves human-readable HTML error pages for the search front-end.
 * <p>
 * The page rendered depends on the state of the index server: down,
 * starting up (blocked), or up-but-erroring. This service must never
 * itself throw from {@link #serveError}, since it is the last line of
 * defense in the request error handling.
 */
public class EdgeSearchErrorPageService {
    private final EdgeIndexClient indexClient;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public EdgeSearchErrorPageService(EdgeIndexClient indexClient) {
        this.indexClient = indexClient;
    }

    /**
     * Populates {@code rsp}'s body with an error page appropriate for the
     * current state of the index server.
     *
     * @param ctx request context, used to probe whether the index is blocked
     * @param rsp response whose body will be set to the rendered error page
     */
    public void serveError(Context ctx, Response rsp) {
        try {
            // Probe inside the try: if the liveness check itself fails,
            // we still want to serve the fallback page below rather than
            // propagate an exception out of the error handler.
            boolean isIndexUp = indexClient.isAlive();

            if (!isIndexUp) {
                rsp.body(renderError("The index is down",
                        """
                        The search index server appears to be down.
                        <p>
                        The server was possibly restarted to bring online some changes.
                        Restarting the index typically takes a few minutes, during which
                        searches can't be served.
                        """));
            } else if (indexClient.isBlocked(ctx).blockingFirst()) {
                rsp.body(renderError("The index is starting up",
                        """
                        The search index server appears to be in the process of starting up.
                        This typically takes a few minutes. Be patient.
                        """));
            }
            else {
                rsp.body(renderError("Error processing request",
                        """
                        The search index appears to be up and running, so the problem may be related
                        to some wider general error, or pertain to an error handling your query.
                        """));
            }
        }
        catch (Exception ex) {
            // Don't swallow the secondary failure silently; this logger was
            // previously unused and errors here were impossible to diagnose.
            logger.error("Error while serving error page", ex);

            rsp.body(renderError("Error processing error",
                    """
                    An error has occurred, additionally, an error occurred while handling that error
                    <p>
                    <a href="https://www.youtube.com/watch?v=dsx2vdn7gpY">https://www.youtube.com/watch?v=dsx2vdn7gpY</a>.
                    """));
        }
    }

    /**
     * Renders the full error page skeleton (search form, nav, info cards)
     * with the given title and message spliced into the "problems" card.
     * The message may contain HTML.
     */
    private String renderError(String title, String message) {
        return """
               <!DOCTYPE html>
               <title>Error</title>
               <meta name="viewport" content="width=device-width, initial-scale=1.0">
               <link rel="stylesheet" href="https://search.marginalia.nu/style-new.css">
               <header>
                 <nav>
                   <a href="https://www.marginalia.nu/">Marginalia</a>
                   <a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
                   <a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Support</a>
                 </nav>
               </header>
               <article>
               <form method="get" action="/search">
                 <section class="search-box">
                   <h1>Search the Internet</h1>
                   <div class="input">
                     <input id="query" name="query" placeholder="Search terms" value="" autocomplete="off">
                     <input value="Go" type="submit">
                   </div>
                   <div class="settings">
                     <select name="profile" id="profile">
                       <option value="default">Popular Sites</option>
                       <option value="modern">Blogs and Personal Websites</option>
                       <option value="academia">Academia, Forums, Big Websites</option>
                       <option value="yolo">Default Ranking Algorithm</option>
                       <option value="food">Recipes 🍳</option>
                       <option value="corpo">Experimental</option>
                     </select>
                     <select name="js" id="js">
                       <option value="default">Allow JS</option>
                       <option value="no-js">Deny JS</option>
                       <option value="yes-js">Require JS</option>
                     </select>
                   </div>
                   <div class="extra">
                     <a href="https://search.marginalia.nu/explore/random">Random Websites</a>
                   </div>
                 </section>
               </form>
               <div class="cards big">
                 <div class="card problems">
                   <h2>
               """
                + title +
               """
                   </h2>
                   <div class="info">
               """
                +message+
               """
                   </div>
                 </div>
                 <div class="card">
                   <h2>More Info</h2>
                   <div class="info">
                     You may be able to find more information here:
                     <ul>
                       <li><a href="https://status.marginalia.nu/">Maintenance Messages</a></li>
                       <li><a href="https://twitter.com/MarginaliaNu">Twitter Account</a></li>
                       <li>Email Me: <tt>kontakt@marginalia.nu</tt></li>
                     </ul>
                   </div>
                 </div>
               """;
    }
}

View File

@ -5,7 +5,7 @@ import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
class MicroCacheTest {
class MicroBTreeCachedIndexTest {
MicroCache mc;
@BeforeEach

View File

@ -1,13 +1,15 @@
package nu.marginalia.wmsa.edge.index.reader.query.types;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import org.junit.jupiter.api.Test;
import java.util.List;
class QueryFilterStepTest {
QueryFilterStep even = new QueryFilterStepFromPredicate(l -> (l%2) == 0);
QueryFilterStep divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0);
QueryFilterStep either = QueryFilterStep.anyOf(List.of(even, divisibleByThree));
class QueryFilterStepIfTest {
QueryFilterStepIf even = new QueryFilterStepFromPredicate(l -> (l%2) == 0);
QueryFilterStepIf divisibleByThree = new QueryFilterStepFromPredicate(l -> (l%3) == 0);
QueryFilterStepIf either = QueryFilterStepIf.anyOf(List.of(even, divisibleByThree));
@Test
public void test() {
long[] values = new long[100];