diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle index 304af5c8..8a8f3e0a 100644 --- a/marginalia_nu/build.gradle +++ b/marginalia_nu/build.gradle @@ -58,7 +58,7 @@ jmhJar { } dependencies { implementation project(':third_party') - + implementation project(':protocol') implementation 'org.projectlombok:lombok:1.18.24' annotationProcessor 'org.projectlombok:lombok:1.18.24' @@ -157,6 +157,9 @@ dependencies { jmh 'org.openjdk.jmh:jmh-core:1.35' jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35' + + implementation 'com.dslplatform:dsl-json:1.9.9' + annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9' } configurations { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java index 9aa953dc..bbb17c51 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/dict/DictionaryData.java @@ -1,18 +1,18 @@ package nu.marginalia.util.dict; -import nu.marginalia.util.SeekDictionary; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.nio.ByteBuffer; import java.nio.LongBuffer; +import java.util.ArrayList; public class DictionaryData { private final int DICTIONARY_BANK_SIZE; private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class); - private final SeekDictionary banks = SeekDictionary.of(DictionaryDataBank::getSize); + private final ArrayList banks = new ArrayList(100); public DictionaryData(int bankSize) { DICTIONARY_BANK_SIZE = bankSize; @@ -20,12 +20,8 @@ public class DictionaryData { banks.add(new DictionaryDataBank(0, bankSize)); } - public int size() { - return banks.end(); - } - public int add(long key) { - var activeBank = banks.last(); + var activeBank = banks.get(banks.size()-1); int rb = activeBank.add(key); if (rb == -1) { @@ -42,10 +38,10 @@ public class DictionaryData { public long getKey(int offset) { - return 
banks.bankForOffset(offset).getKey(offset); + return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset); } public boolean keyEquals(int offset, long otherKey) { - return banks.bankForOffset(offset).keyEquals(offset, otherKey); + return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey); } private static class DictionaryDataBank { diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java index 3a95072b..b7a588db 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java +++ b/marginalia_nu/src/main/java/nu/marginalia/util/language/WordPatterns.java @@ -19,7 +19,12 @@ public class WordPatterns { public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))"); public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$"); + public static final Pattern singleWordAdditionalPattern = + Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}"); + + public static final Predicate singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate(); public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); + public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); public static final Predicate characterNoisePredicate = characterNoisePattern.asMatchPredicate(); diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java index 33b88671..479dcd4c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/util/language/processing/DocumentKeywordExtractor.java @@ -8,7 +8,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; -import org.jetbrains.annotations.NotNull; import javax.inject.Inject; import java.util.*; @@ -45,7 +44,6 @@ public class DocumentKeywordExtractor { List wordsNamesRepeated = nameCounter.count(documentLanguageData, 2); List wordsNamesAll = nameCounter.count(documentLanguageData, 1); List subjects = subjectCounter.count(documentLanguageData); - List wordsLongName = longNameCounter.count(documentLanguageData); int totalSize = wordsTfIdf.size(); @@ -61,17 +59,6 @@ public class DocumentKeywordExtractor { var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects); - var words = getSimpleWords(documentLanguageData); - - for (var w : wordsLongName) - words.add(w.word); - for (var w : lowKeywords) - words.remove(w.word); - for (var w : midKeywords) - words.remove(w.word); - for (var w : topKeywords) - words.remove(w.word); - Collection artifacts = getArtifacts(documentLanguageData); var wordSet = new EdgePageWordSet( @@ -85,15 +72,81 @@ public class DocumentKeywordExtractor { new EdgePageWords(IndexBlock.Artifacts, artifacts) ); - wordSet.append(IndexBlock.Words, words); + getSimpleWords(wordSet, documentLanguageData, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus); return wordSet; } + private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... 
blocks) { + + int start = 0; + int lengthGoal = 32; + + for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) { + IndexBlock block = blocks[blockIdx]; + Set words = new HashSet<>(lengthGoal+100); + + int pos; + int length = 0; + for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) { + var sent = documentLanguageData.sentences[pos]; + length += sent.length(); + + for (var word : sent) { + if (!word.isStopWord()) { + String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); + if (WordPatterns.singleWordQualitiesPredicate.test(w)) { + words.add(w); + } + } + } + } + wordSet.append(block, words); + start = pos; + lengthGoal+=32; + } + + if (start < documentLanguageData.sentences.length) { + + Map counts = new HashMap<>(documentLanguageData.totalNumWords()); + for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) { + var sent = documentLanguageData.sentences[pos]; + for (var word : sent) { + if (!word.isStopWord()) { + String w = AsciiFlattener.flattenUnicode(word.wordLowerCase()); + if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) { + counts.merge(w, 1, Integer::sum); + } + } + } + } + + Set lastSet; + if (counts.size() < 1024) { + lastSet = counts.keySet(); + } + else { + lastSet = counts.entrySet().stream() + .sorted(Comparator.comparing(e -> { + double N = 11820118.; // Number of documents in term freq dictionary + + // Caveat: This is actually the *negated* term score, because the second logarithm has + // its parameter inverted (log(a^b) = b log(a); here b = -1) + return (1 + Math.log(e.getValue())) * Math.log((1. 
+ dict.getTermFreq(e.getKey())) / N); + })) + .map(Map.Entry::getKey) + .limit(1024) + .collect(Collectors.toCollection(LinkedHashSet::new)); + } + + wordSet.append(blocks[blocks.length - 1], lastSet); + } + } + private Collection getArtifacts(DocumentLanguageData documentLanguageData) { Set reps = new HashSet<>(); - for (var sent : documentLanguageData.sentences) { for (var word : sent) { String lc = word.wordLowerCase(); @@ -138,33 +191,6 @@ public class DocumentKeywordExtractor { return ret; } - @NotNull - private Set getSimpleWords(DocumentLanguageData documentLanguageData) { - Map counts = new HashMap<>(documentLanguageData.totalNumWords()); - - for (var sent : documentLanguageData.sentences) { - for (int i = 0; i < sent.length(); i++) { - if (!sent.isStopWord(i)) { - String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]); - if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) { - counts.merge(w, 1, Integer::sum); - } - } - } - } - - return counts.entrySet().stream() - .sorted(Comparator.comparing(e -> { - double N = 11820118.; // Number of documents in term freq dictionary - - // Caveat: This is actually the *negated* term score, because the second logarithm has - // its parameter inverted (log(a^b) = b log(a); here b = -1) - return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N); - })) - .map(Map.Entry::getKey) - .limit(512).collect(Collectors.toCollection(LinkedHashSet::new)); - } - public EdgePageWords createWords(IndexBlock block, Collection words) { return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet())); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java index 603f57e5..569b7eaa 100644 --- 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/AbstractClient.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client; import com.google.gson.Gson; import com.google.gson.GsonBuilder; +import com.google.protobuf.GeneratedMessageV3; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.core.ObservableSource; import io.reactivex.rxjava3.plugins.RxJavaPlugins; @@ -17,8 +18,6 @@ import org.apache.http.HttpHost; import org.apache.logging.log4j.ThreadContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.slf4j.Marker; -import org.slf4j.MarkerFactory; import java.io.ByteArrayOutputStream; import java.io.IOException; @@ -186,6 +185,31 @@ public abstract class AbstractClient implements AutoCloseable { .doFinally(() -> ThreadContext.remove("outbound-request")); } + @SneakyThrows + protected synchronized Observable post(Context ctx, String endpoint, GeneratedMessageV3 data) { + + ensureAlive(); + + RequestBody body = RequestBody.create( + MediaType.parse("application/protobuf"), + data.toByteArray()); + + var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build(); + var call = client.newCall(req); + + logInbound(call); + ThreadContext.put("outbound-request", url + endpoint); + try (var rsp = call.execute()) { + logOutbound(rsp); + int code = rsp.code(); + + return validateStatus(code, req).map(HttpStatusCode::new); + } + finally { + ThreadContext.remove("outbound-request"); + } + } + @SneakyThrows protected synchronized Observable postGet(Context ctx, String endpoint, Object data, Class returnType) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java index b68ee68c..02f502d2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java @@ -76,7 +76,7 @@ public class LinkKeywordLoaderMain { // System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords); - indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet( + indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet( new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0 ).blockingSubscribe(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java index 46d71505..486eb343 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java @@ -39,7 +39,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe(); + client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe(); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java index ee106cce..ef88c831 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/DocumentProcessor.java @@ -147,7 +147,10 @@ public class DocumentProcessor { throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS); } - var dld 
= sentenceExtractor.extractSentences(doc.clone()); + DomPruner domPruner = new DomPruner(); + Document prunedDoc = doc.clone(); + domPruner.prune(prunedDoc, 0.5); + var dld = sentenceExtractor.extractSentences(prunedDoc); checkDocumentLanguage(dld); @@ -192,7 +195,7 @@ public class DocumentProcessor { ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add); words.append(IndexBlock.Meta, tagWords); - words.append(IndexBlock.Words, tagWords); + words.append(IndexBlock.Words_1, tagWords); } private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java new file mode 100644 index 00000000..ebe3de66 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPruner.java @@ -0,0 +1,111 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; + +import java.util.HashMap; +import java.util.Map; + +public class DomPruner { + + public void prune(Document document, double pruneThreshold) { + PruningVisitor pruningVisitor = new PruningVisitor(); + document.traverse(pruningVisitor); + + pruningVisitor.data.forEach((node, data) -> { + if (data.depth <= 1) { + return; + } + if (data.signalNodeSize == 0) node.remove(); + else if (data.noiseNodeSize > 0 + && data.signalRate() < pruneThreshold + && data.treeSize > 3) { + node.remove(); + } + }); + } + + + + private static class PruningVisitor implements NodeVisitor { + + private final Map data = new HashMap<>(); + private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); + + @Override + public void head(Node node, int depth) {} + + @Override + 
public void tail(Node node, int depth) { + final NodeData dataForNode; + + if (node instanceof TextNode tn) { + dataForNode = new NodeData(depth, tn.text().length(), 0); + } + else if (isSignal(node)) { + dataForNode = new NodeData(depth, 0,0); + for (var childNode : node.childNodes()) { + dataForNode.add(data.getOrDefault(childNode, dummy)); + } + } + else { + dataForNode = new NodeData(depth, 0,0); + for (var childNode : node.childNodes()) { + dataForNode.addAsNoise(data.getOrDefault(childNode, dummy)); + } + } + + + + data.put(node, dataForNode); + } + + public boolean isSignal(Node node) { + + if (node instanceof Element e) { + if ("a".equalsIgnoreCase(e.tagName())) + return false; + if ("nav".equalsIgnoreCase(e.tagName())) + return false; + if ("footer".equalsIgnoreCase(e.tagName())) + return false; + if ("header".equalsIgnoreCase(e.tagName())) + return false; + } + + return true; + } + } + + private static class NodeData { + int signalNodeSize; + int noiseNodeSize; + int treeSize = 1; + int depth; + + private NodeData(int depth, int signalNodeSize, int noiseNodeSize) { + this.depth = depth; + this.signalNodeSize = signalNodeSize; + this.noiseNodeSize = noiseNodeSize; + } + + public void add(NodeData other) { + signalNodeSize += other.signalNodeSize; + noiseNodeSize += other.noiseNodeSize; + treeSize += other.treeSize; + } + + public void addAsNoise(NodeData other) { + noiseNodeSize += other.noiseNodeSize + other.signalNodeSize; + treeSize += other.treeSize; + } + + + public double signalRate() { + return signalNodeSize / (double)(signalNodeSize + noiseNodeSize); + } + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index f9ed8ecc..939b625b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -4,11 +4,11 
@@ import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.inject.Inject; import com.google.inject.name.Named; +import com.google.protobuf.InvalidProtocolBufferException; import gnu.trove.map.TLongIntMap; import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.set.hash.TIntHashSet; -import io.prometheus.client.Counter; import io.prometheus.client.Histogram; import io.reactivex.rxjava3.schedulers.Schedulers; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; @@ -22,18 +22,16 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms; -import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.search.*; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; import org.apache.http.HttpStatus; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; @@ -52,7 +50,7 @@ import static spark.Spark.get; import static spark.Spark.halt; public class EdgeIndexService extends Service { - private static final int SEARCH_BUDGET_TIMEOUT_MS = 100; + private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000; private final Logger 
logger = LoggerFactory.getLogger(getClass()); @@ -66,11 +64,9 @@ public class EdgeIndexService extends Service { .create(); private static final Histogram wmsa_edge_index_query_time - = Histogram.build().name("wmsa_edge_index_query_time").help("-").register(); - private static final Counter wmsa_edge_index_query_count - = Counter.build().name("wmsa_edge_index_query_count").help("-").register(); - private static final Histogram wmsa_edge_index_put_words_time - = Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register(); + = Histogram.build().name("wmsa_edge_index_query_time") + .linearBuckets(50, 50, 15) + .help("-").register(); public static final int DYNAMIC_BUCKET_LENGTH = 7; @@ -162,12 +158,15 @@ public class EdgeIndexService extends Service { indexes.initialize(init); } - private Object putWords(Request request, Response response) { - var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class); + private Object putWords(Request request, Response response) throws InvalidProtocolBufferException { + var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes()); - synchronized (this) { - putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(), - putWordsRequest.wordSet, putWordsRequest.getIndex()); + EdgeId domainId = new EdgeId<>(req.getDomain()); + EdgeId urlId = new EdgeId<>(req.getUrl()); + int idx = req.getIndex(); + + for (int ws = 0; ws < req.getWordSetCount(); ws++) { + putWords(domainId, urlId, req.getWordSet(ws), idx); } response.status(HttpStatus.SC_ACCEPTED); @@ -175,26 +174,16 @@ public class EdgeIndexService extends Service { } public void putWords(EdgeId domainId, EdgeId urlId, - EdgePageWordSet wordSet, int idx - ) { - - wmsa_edge_index_put_words_time.time(() -> { - for (EdgePageWords words : wordSet.values()) { - putWords(domainId, urlId, words, idx); - } - }); - - } - - public void putWords(EdgeId domainId, EdgeId urlId, - EdgePageWords words, int idx + IndexPutKeywordsReq.WordSet words, int idx ) 
{ SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx); - for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) { + IndexBlock block = IndexBlock.values()[words.getIndex()]; + + for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) { var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); - var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block); + var header = new SearchIndexJournalEntryHeader(domainId, urlId, block); indexWriter.put(header, entry); }; @@ -257,7 +246,6 @@ public class EdgeIndexService extends Service { } finally { wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start); - wmsa_edge_index_query_count.inc(); } } @@ -410,16 +398,6 @@ public class EdgeIndexService extends Service { } - public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId, - int queryDepth, int minHitCount, int maxResults) { - if (!indexes.isValidBucket(bucket)) { - logger.warn("Invalid bucket {}", bucket); - return LongStream.empty(); - } - - return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults); - } - private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) { if (!indexes.isValidBucket(bucket)) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java index 36cd966e..48b1b3ee 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java @@ -9,7 +9,6 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient; import nu.marginalia.wmsa.client.HttpStatusCode; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import 
nu.marginalia.wmsa.configuration.server.Context; -import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; @@ -18,6 +17,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification; +import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,13 +37,27 @@ public class EdgeIndexClient extends AbstractDynamicClient { } @CheckReturnValue - public Observable putWords(Context ctx, EdgeId domain, EdgeId url, double quality, + public Observable putWords(Context ctx, EdgeId domain, EdgeId url, EdgePageWordSet wordSet, int writer ) { - EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer); - return this.post(ctx, "/words/", request); + var keywordBuilder = + IndexPutKeywordsReq.newBuilder() + .setDomain(domain.id()) + .setUrl(url.id()) + .setIndex(writer); + + for (var set : wordSet.wordSets.values()) { + var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder(); + wordSetBuilder.setIndex(set.block.ordinal()); + wordSetBuilder.addAllWords(set.words); + keywordBuilder.addWordSet(wordSetBuilder.build()); + } + + var req = keywordBuilder.build(); + + return this.post(ctx, "/words/", req); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java index 23c4b481..a11ee5d0 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java +++ 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java @@ -15,6 +15,7 @@ import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { @@ -36,6 +37,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE); + new Thread(this::journalWriterThread, "Journal Writer").start(); + writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS); Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite)); } @@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { } } + private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {} + private final LinkedBlockingQueue writeQueue = new LinkedBlockingQueue<>(512); + @Override @SneakyThrows - public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { + public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { + writeQueue.put(new WriteJob(header, entryData)); + } - byteBuffer.clear(); + @SneakyThrows + public void journalWriterThread() { - byteBuffer.putInt(entryData.size()); - byteBuffer.putInt(header.block().id); - byteBuffer.putLong(header.documentId()); + while (true) { + var job = writeQueue.take(); - entryData.write(byteBuffer); + writeEntry(job.header, job.entryData); + } + } + private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) { - byteBuffer.limit(byteBuffer.position()); - byteBuffer.rewind(); + try { + byteBuffer.clear(); - while (byteBuffer.position() < byteBuffer.limit()) - channel.write(byteBuffer); + 
byteBuffer.putInt(entryData.size()); + byteBuffer.putInt(header.block().id); + byteBuffer.putLong(header.documentId()); - writePositionMarker(); + entryData.write(byteBuffer); + + byteBuffer.limit(byteBuffer.position()); + byteBuffer.rewind(); + + while (byteBuffer.position() < byteBuffer.limit()) + channel.write(byteBuffer); + + writePositionMarker(); + } catch (IOException e) { + throw new RuntimeException(e); + } } @Override @@ -94,13 +117,11 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { } private void writePositionMarker() throws IOException { - var lock = channel.lock(0, 16, false); pos = channel.size(); raf.seek(0); raf.writeLong(pos); raf.writeLong(dictionaryWriter.size()); raf.seek(pos); - lock.release(); } public synchronized void close() throws IOException { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java index dc541c5b..2494b99c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/EdgePutWordsRequest.java @@ -5,16 +5,16 @@ import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; @AllArgsConstructor @Getter @ToString public class EdgePutWordsRequest { - public final EdgeId domainId; - public final EdgeId urlId; - public final double quality; + public EdgeId domainId; + public EdgeId urlId; + public double quality; - public final EdgePageWordSet wordSet; + public EdgePageWordSet wordSet; private int index = 0; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java index 819706fd..f35fcf3a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/model/IndexBlock.java @@ -5,14 +5,18 @@ public enum IndexBlock { Title(1, 1), Link(2, 1.25), Top(3, 2), - Middle(4, 3), - Low(5, 4), - Words(6, 6), + Middle(4, 2.5), + Low(5, 3.0), + Words_1(6, 3.0), Meta(7, 7), - PositionWords(8, 4.5), + Words_2(8, 3.5), NamesWords(9, 5), Artifacts(10, 10), - Topic(11, 0.5); + Topic(11, 0.5), + Words_4(12, 4.0), + Words_8(13, 4.5), + Words_16Plus(14, 7.0), + ; public final int id; public final double sortOrder; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java index 65a4dafe..cc2927ca 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/SearchIndexReader.java @@ -29,8 +29,12 @@ public class SearchIndexReader implements AutoCloseable { IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, - IndexBlock.Words, IndexBlock.NamesWords, + IndexBlock.Words_1, + IndexBlock.Words_2, + IndexBlock.Words_4, + IndexBlock.Words_8, + IndexBlock.Words_16Plus, }; @Inject @@ -44,24 +48,29 @@ public class SearchIndexReader implements AutoCloseable { var linkIndex = indices.get(IndexBlock.Link); var titleIndex = indices.get(IndexBlock.Title); var namesIndex = indices.get(IndexBlock.NamesWords); - var positionIndex = indices.get(IndexBlock.PositionWords); var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords); - var wordsIndex = indices.get(IndexBlock.Words); var metaIndex = indices.get(IndexBlock.Meta); var topicIndex = indices.get(IndexBlock.Topic); + var words1 = indices.get(IndexBlock.Words_1); + var words2 = 
indices.get(IndexBlock.Words_2); + var words4 = indices.get(IndexBlock.Words_4); + var words8 = indices.get(IndexBlock.Words_8); + var words16 = indices.get(IndexBlock.Words_16Plus); + var artifacts = indices.get(IndexBlock.Artifacts); + queryBuilders = new EnumMap<>(IndexBlock.class); underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class); - queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex)); - queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex)); - queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex)); - queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex)); - queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex)); + queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1)); + queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1)); + queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1)); + queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1)); + queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1)); - 
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); - underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); - underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex)); + underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); + underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1)); } @SafeVarargs @@ -157,7 +166,7 @@ public class SearchIndexReader implements AutoCloseable { return block; } } - return IndexBlock.Words; + return IndexBlock.Words_1; } public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java index 1b27ddd0..78e132b3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/reader/query/IndexQueryBuilder.java @@ -27,7 +27,8 @@ public class IndexQueryBuilder { public Query build(IndexSearchBudget budget, LongPredicate filter, - int 
wordId) { + int wordId) + { return new QueryForIndices(budget, filter, wordId); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java index 1338473a..a5fb0656 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/stackoverflow/StackOverflowPostProcessor.java @@ -1,13 +1,13 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow; import com.google.inject.Inject; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost; -import nu.marginalia.wmsa.edge.model.*; +import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; @@ -46,8 +46,8 @@ public class StackOverflowPostProcessor { var keywords = documentKeywordExtractor.extractKeywords(dld); keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("special:wikipedia"); + keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJust("special:wikipedia"); keywords.get(IndexBlock.Meta).addJust("special:wikipedia"); keywords.get(IndexBlock.Meta).addJust("js:true"); diff --git 
a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java index df066adb..22536b90 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/integration/wikipedia/WikipediaProcessor.java @@ -1,13 +1,13 @@ package nu.marginalia.wmsa.edge.integration.wikipedia; -import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.SentenceExtractor; +import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData; import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle; -import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import nu.marginalia.wmsa.edge.model.EdgeUrl; +import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -42,8 +42,8 @@ public class WikipediaProcessor { var keywords = documentKeywordExtractor.extractKeywords(dld); keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain); - keywords.get(IndexBlock.Words).addJust("special:stackoverflow"); + keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain); + keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow"); keywords.get(IndexBlock.Meta).addJust("special:stackoverflow"); keywords.get(IndexBlock.Meta).addJust("js:true"); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java 
b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java index c4355ae3..6fdaf059 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWordSet.java @@ -1,13 +1,15 @@ package nu.marginalia.wmsa.edge.model.crawl; +import com.dslplatform.json.JsonObject; +import com.dslplatform.json.JsonWriter; import lombok.Data; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import java.util.*; @Data -public class EdgePageWordSet { - public final Map wordSets; +public class EdgePageWordSet implements JsonObject { + public Map wordSets; public EdgePageWordSet(EdgePageWords... words) { wordSets = new EnumMap<>(IndexBlock.class); @@ -45,4 +47,18 @@ public class EdgePageWordSet { }); return sj.toString(); } + + @Override + public void serialize(JsonWriter writer, boolean minimal) { + writer.writeAscii("["); + boolean first = false; + for (var w : wordSets.values()) { + if (!first) first = true; + else writer.writeAscii(", "); + + w.serialize(writer, minimal); + } + writer.writeAscii("]"); + + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java index efb20dcc..4a158c25 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/crawl/EdgePageWords.java @@ -1,4 +1,7 @@ package nu.marginalia.wmsa.edge.model.crawl; +import com.dslplatform.json.JsonObject; +import com.dslplatform.json.JsonWriter; +import com.dslplatform.json.NumberConverter; import lombok.Getter; import lombok.ToString; import nu.marginalia.wmsa.edge.index.model.IndexBlock; @@ -8,7 +11,7 @@ import java.util.Collection; import java.util.List; @ToString @Getter -public class EdgePageWords { +public class EdgePageWords implements JsonObject
{ public final IndexBlock block; public final List words = new ArrayList<>(); @@ -31,4 +34,19 @@ public class EdgePageWords { return words.size(); } public void addJust(String word) { words.add(word); } + + @Override + public void serialize(JsonWriter writer, boolean minimal) { + writer.writeAscii("{\"b\":"); + NumberConverter.serialize(block.ordinal(), writer); + writer.writeAscii(", \"w\": ["); + boolean first = false; + for (var word : words) { + if (!first) first = true; + else { writer.writeAscii(","); } + + writer.writeString(word); + } + writer.writeAscii("]}"); + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java index 64f8f8b1..f9deecd2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/EdgeSearchProfile.java @@ -10,25 +10,31 @@ import java.util.stream.Collectors; public enum EdgeSearchProfile { DEFAULT("default", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus + ), 0, 1), MODERN("modern", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus + ), 2), CORPO("corpo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, 
IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), - 4, 5, 6, 7), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), + 4, 5, 7), YOLO("yolo", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords, + IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus), 0, 2, 1, 3, 4, 6), CORPO_CLEAN("corpo-clean", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords), 4, 5), ACADEMIA("academia", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords), 3), FOOD("food", - List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords), + List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords), 2, 0), ; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java index 
fafcaa4b..22969872 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/command/commands/SiteSearchCommand.java @@ -62,7 +62,7 @@ public class SiteSearchCommand implements SearchCommandInterface { DecoratedSearchResultSet resultSet; Path screenshotPath = null; if (null != domain) { - resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain); + resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain); screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java index 8bcd93a6..e82153ed 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/search/results/SearchResultValuator.java @@ -30,7 +30,7 @@ public class SearchResultValuator { EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new); if (scores.length == 0) { - return IndexBlock.Words.sortOrder; + return IndexBlock.Words_1.sortOrder; } final double[] weights = getTermWeights(scores); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java index 2e3398da..d3ed948e 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java @@ -51,7 +51,7 @@ public class FeaturesLoaderTool { throw new RuntimeException(ex); } - 
client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0) + client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0) .blockingSubscribe(); }); diff --git a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql index c1dc9aa9..d2def737 100644 --- a/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql +++ b/marginalia_nu/src/main/resources/sql/edge-crawler-cache.sql @@ -141,7 +141,7 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS EC_PAGE_DATA.FEATURES AS FEATURES, EC_DOMAIN.IP AS IP, - EC_DOMAIN.STATE AS STATE, + EC_URL.STATE AS STATE, EC_DOMAIN.RANK AS RANK, EC_DOMAIN.STATE AS DOMAIN_STATE FROM EC_URL diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java new file mode 100644 index 00000000..a0eb5ba5 --- /dev/null +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/converting/processor/logic/DomPrunerTest.java @@ -0,0 +1,12 @@ +package nu.marginalia.wmsa.edge.converting.processor.logic; + +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +class DomPrunerTest { + @Test + public void test() throws IOException { + + } +} \ No newline at end of file diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java index f78fb757..ea742a93 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/crawling/SentenceExtractorTest.java @@ -1,10 +1,8 @@ package nu.marginalia.wmsa.edge.crawling; -import com.zaxxer.hikari.HikariConfig; -import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; import 
nu.marginalia.util.TestLanguageModels; -import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; +import nu.marginalia.util.language.WordPatterns; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.util.language.processing.DocumentKeywordExtractor; import nu.marginalia.util.language.processing.KeywordExtractor; @@ -12,11 +10,9 @@ import nu.marginalia.util.language.processing.SentenceExtractor; import nu.marginalia.util.language.processing.model.WordRep; import nu.marginalia.util.language.processing.model.WordSpan; import nu.marginalia.util.language.processing.model.tag.WordSeparator; -import nu.marginalia.util.ranking.BuggyReversePageRank; -import nu.marginalia.util.ranking.BuggyStandardPageRank; +import nu.marginalia.wmsa.edge.assistant.dict.NGramDict; import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader; import nu.marginalia.wmsa.edge.model.EdgeDomain; -import nu.marginalia.wmsa.edge.model.EdgeUrl; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -103,6 +99,11 @@ class SentenceExtractorTest { }); reader.join(); } + + @Test + public void testPattern() { + org.junit.jupiter.api.Assertions.assertTrue(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches()); + } @Test void extractSentences() throws IOException { var data = Path.of("/home/vlofgren/Code/tmp-data/"); diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java index 55015d13..da9206bf 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/EdgeIndexClientTest.java @@ -11,12 +11,15 @@ import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner; import
nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.reader.SearchIndexes; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import org.junit.jupiter.api.*; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; +import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; +import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; import org.junit.jupiter.api.parallel.ResourceAccessMode; @@ -141,7 +144,7 @@ public class EdgeIndexClientTest { void putWords(int didx, int idx, double quality, String... 
words) { EdgePageWords epw = new EdgePageWords(IndexBlock.Title); epw.addAll(Arrays.asList(words)); - client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), quality, + client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), new EdgePageWordSet(epw), 0).blockingSubscribe(); } diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java index c900f0f6..f0e6ecc0 100644 --- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java +++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/index/service/SearchIndexJournalWriterTest.java @@ -61,7 +61,7 @@ class SearchIndexJournalWriterTest { void put() throws IOException { writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link), new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 })); - writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words), + writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1), new SearchIndexJournalEntry(new long[] { 5, 6, 7 })); writer.forceWrite(); diff --git a/protocol/build.gradle b/protocol/build.gradle new file mode 100644 index 00000000..210a7612 --- /dev/null +++ b/protocol/build.gradle @@ -0,0 +1,27 @@ +plugins { + id "com.google.protobuf" version "0.8.19" + id "java" +} +repositories { + gradlePluginPortal() +} +protobuf { + protoc { + artifact = 'com.google.protobuf:protoc:3.0.0' + } +} + +sourceSets { + main { + java { + srcDirs 'build/generated/source/proto/main/grpc' + srcDirs 'build/generated/source/proto/main/java' + } + } +} + +dependencies { + protobuf files ("def/") + + implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.0.0' +} \ No newline at end of file diff --git a/protocol/def/index.proto b/protocol/def/index.proto new 
file mode 100644 index 00000000..30cf916d --- /dev/null +++ b/protocol/def/index.proto @@ -0,0 +1,21 @@ +syntax = "proto3"; + +option java_package = "nu.wmsa.wmsa.edge.index.proto"; +option java_outer_classname = "IndexProto"; +option java_multiple_files = true; + +message IndexPutKeywordsReq { + int32 domain = 1; + int32 url = 2; + int32 index = 3; + repeated WordSet wordSet = 4; + + message WordSet { + int32 index = 1; + repeated string words = 2; + } +} + +message IndexSearchQueryRsp { + +} \ No newline at end of file diff --git a/settings.gradle b/settings.gradle index cb3868c8..149ff1ea 100644 --- a/settings.gradle +++ b/settings.gradle @@ -1,4 +1,5 @@ rootProject.name = 'wmsa' include 'marginalia_nu' -include 'third_party' \ No newline at end of file +include 'third_party' +include 'protocol' \ No newline at end of file