Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-22 20:48:59 +00:00)

Experimental changes for 22-08/09 update.

Commit 3200c36072 (parent db056be06a)
@@ -58,7 +58,7 @@ jmhJar {
}
dependencies {
implementation project(':third_party')

implementation project(':protocol')

implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
@@ -157,6 +157,9 @@ dependencies {

jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'

implementation 'com.dslplatform:dsl-json:1.9.9'
annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9'
}

configurations {
@@ -1,18 +1,18 @@
package nu.marginalia.util.dict;

import nu.marginalia.util.SeekDictionary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;

public class DictionaryData {

private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);

private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
private final ArrayList<DictionaryDataBank> banks = new ArrayList(100);

public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
@@ -20,12 +20,8 @@ public class DictionaryData {
banks.add(new DictionaryDataBank(0, bankSize));
}

public int size() {
return banks.end();
}

public int add(long key) {
var activeBank = banks.last();
var activeBank = banks.get(banks.size()-1);
int rb = activeBank.add(key);

if (rb == -1) {
@@ -42,10 +38,10 @@ public class DictionaryData {


public long getKey(int offset) {
return banks.bankForOffset(offset).getKey(offset);
return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
}
public boolean keyEquals(int offset, long otherKey) {
return banks.bankForOffset(offset).keyEquals(offset, otherKey);
return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
}

private static class DictionaryDataBank {
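The change above swaps the SeekDictionary wrapper for a plain ArrayList of banks, so an entry's bank is now found by integer division on its offset. A minimal sketch of that arithmetic, with an assumed bank size (the real value is whatever is passed to the DictionaryData constructor):

```java
public class BankOffsetSketch {
    public static void main(String[] args) {
        final int bankSize = 8192; // assumed for illustration only

        int offset = 20_000;
        int bankIndex = offset / bankSize; // 2 -- replaces the SeekDictionary.bankForOffset(offset) lookup

        System.out.println("offset " + offset + " -> bank " + bankIndex);
    }
}
```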
@@ -19,7 +19,12 @@ public class WordPatterns {
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");

public static final Pattern singleWordAdditionalPattern =
Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");

public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();

public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
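For reference, a small self-contained check of the new singleWordAdditionalPattern; the first input is the same token exercised by the testPattern case added further down in this commit, the second is a hypothetical counter-example:

```java
import java.util.regex.Pattern;

public class SingleWordPatternCheck {
    // Same expression as WordPatterns.singleWordAdditionalPattern above
    private static final Pattern SINGLE_WORD =
            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");

    public static void main(String[] args) {
        // Up to four separator-delimited alphanumeric segments are accepted...
        System.out.println(SINGLE_WORD.matcher("2.6.18164.el5pae").matches()); // true
        // ...but a token with more than four separator groups is rejected.
        System.out.println(SINGLE_WORD.matcher("a.b.c.d.e.f").matches());      // false
    }
}
```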
@@ -8,7 +8,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import org.jetbrains.annotations.NotNull;

import javax.inject.Inject;
import java.util.*;
@@ -45,7 +44,6 @@ public class DocumentKeywordExtractor {
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);

int totalSize = wordsTfIdf.size();

@@ -61,17 +59,6 @@ public class DocumentKeywordExtractor {

var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);

var words = getSimpleWords(documentLanguageData);

for (var w : wordsLongName)
words.add(w.word);
for (var w : lowKeywords)
words.remove(w.word);
for (var w : midKeywords)
words.remove(w.word);
for (var w : topKeywords)
words.remove(w.word);

Collection<String> artifacts = getArtifacts(documentLanguageData);

var wordSet = new EdgePageWordSet(
@@ -85,15 +72,81 @@ public class DocumentKeywordExtractor {
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);

wordSet.append(IndexBlock.Words, words);
getSimpleWords(wordSet, documentLanguageData,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);

return wordSet;
}

private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {

int start = 0;
int lengthGoal = 32;

for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
IndexBlock block = blocks[blockIdx];
Set<String> words = new HashSet<>(lengthGoal+100);

int pos;
int length = 0;
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
length += sent.length();

for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
words.add(w);
}
}
}
}
wordSet.append(block, words);
start = pos;
lengthGoal+=32;
}

if (start < documentLanguageData.sentences.length) {

Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}

Set<String> lastSet;
if (counts.size() < 1024) {
lastSet = counts.keySet();
}
else {
lastSet = counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary

// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
}))
.map(Map.Entry::getKey)
.limit(1024)
.collect(Collectors.toCollection(LinkedHashSet::new));
}

wordSet.append(blocks[blocks.length - 1], lastSet);
}
}

private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();


for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
@@ -138,33 +191,6 @@ public class DocumentKeywordExtractor {
return ret;
}

@NotNull
private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());

for (var sent : documentLanguageData.sentences) {
for (int i = 0; i < sent.length(); i++) {
if (!sent.isStopWord(i)) {
String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}

return counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary

// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
}))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
}


public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
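To make the new partitioning easier to follow: the rewritten getSimpleWords walks the sentence array once, filling Words_1, Words_2, Words_4 and Words_8 with roughly 32, 64, 96 and 128 words' worth of sentences respectively, after which everything remaining is ranked by the (negated) TF-IDF score and capped at 1024 terms for Words_16Plus. A standalone sketch of just those bucket boundaries, using made-up sentence lengths:

```java
public class WordBlockPartitionSketch {
    public static void main(String[] args) {
        // Hypothetical sentence lengths (in words); the real extractor gets these
        // from DocumentLanguageData via sentence.length().
        int[] sentenceLengths = { 12, 20, 9, 35, 41, 28, 60, 77, 15, 33 };

        int start = 0;
        int lengthGoal = 32;

        // Four fixed blocks (Words_1 .. Words_8), mirroring the loop above.
        for (int block = 1; block <= 4 && start < sentenceLengths.length; block++) {
            int pos = start;
            int length = 0;
            while (pos < sentenceLengths.length && length < lengthGoal) {
                length += sentenceLengths[pos++];
            }
            System.out.printf("Words_%d: sentences [%d, %d), ~%d words%n",
                    block == 4 ? 8 : 1 << (block - 1), start, pos, length);
            start = pos;
            lengthGoal += 32;
        }

        // Anything left over would be TF-IDF ranked and land in Words_16Plus.
        System.out.printf("Words_16Plus: sentences [%d, %d)%n", start, sentenceLengths.length);
    }
}
```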
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.protobuf.GeneratedMessageV3;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
@@ -17,8 +18,6 @@ import org.apache.http.HttpHost;
import org.apache.logging.log4j.ThreadContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -186,6 +185,31 @@ public abstract class AbstractClient implements AutoCloseable {
.doFinally(() -> ThreadContext.remove("outbound-request"));
}

@SneakyThrows
protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {

ensureAlive();

RequestBody body = RequestBody.create(
MediaType.parse("application/protobuf"),
data.toByteArray());

var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
var call = client.newCall(req);

logInbound(call);
ThreadContext.put("outbound-request", url + endpoint);
try (var rsp = call.execute()) {
logOutbound(rsp);
int code = rsp.code();

return validateStatus(code, req).map(HttpStatusCode::new);
}
finally {
ThreadContext.remove("outbound-request");
}
}


@SneakyThrows
protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {
@@ -76,7 +76,7 @@ public class LinkKeywordLoaderMain {

// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);

indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet(
new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
).blockingSubscribe();
}
@@ -39,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe();
}
}
}
@@ -147,7 +147,10 @@ public class DocumentProcessor {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}

var dld = sentenceExtractor.extractSentences(doc.clone());
DomPruner domPruner = new DomPruner();
Document prunedDoc = doc.clone();
domPruner.prune(prunedDoc, 0.5);
var dld = sentenceExtractor.extractSentences(prunedDoc);

checkDocumentLanguage(dld);

@@ -192,7 +195,7 @@ public class DocumentProcessor {
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
words.append(IndexBlock.Words_1, tagWords);
}

private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -0,0 +1,111 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import java.util.HashMap;
import java.util.Map;

public class DomPruner {

public void prune(Document document, double pruneThreshold) {
PruningVisitor pruningVisitor = new PruningVisitor();
document.traverse(pruningVisitor);

pruningVisitor.data.forEach((node, data) -> {
if (data.depth <= 1) {
return;
}
if (data.signalNodeSize == 0) node.remove();
else if (data.noiseNodeSize > 0
&& data.signalRate() < pruneThreshold
&& data.treeSize > 3) {
node.remove();
}
});
}



private static class PruningVisitor implements NodeVisitor {

private final Map<Node, NodeData> data = new HashMap<>();
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);

@Override
public void head(Node node, int depth) {}

@Override
public void tail(Node node, int depth) {
final NodeData dataForNode;

if (node instanceof TextNode tn) {
dataForNode = new NodeData(depth, tn.text().length(), 0);
}
else if (isSignal(node)) {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.add(data.getOrDefault(childNode, dummy));
}
}
else {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
}
}



data.put(node, dataForNode);
}

public boolean isSignal(Node node) {

if (node instanceof Element e) {
if ("a".equalsIgnoreCase(e.tagName()))
return false;
if ("nav".equalsIgnoreCase(e.tagName()))
return false;
if ("footer".equalsIgnoreCase(e.tagName()))
return false;
if ("header".equalsIgnoreCase(e.tagName()))
return false;
}

return true;
}
}

private static class NodeData {
int signalNodeSize;
int noiseNodeSize;
int treeSize = 1;
int depth;

private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
this.depth = depth;
this.signalNodeSize = signalNodeSize;
this.noiseNodeSize = noiseNodeSize;
}

public void add(NodeData other) {
signalNodeSize += other.signalNodeSize;
noiseNodeSize += other.noiseNodeSize;
treeSize += other.treeSize;
}

public void addAsNoise(NodeData other) {
noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
treeSize += other.treeSize;
}


public double signalRate() {
return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
}
}
}
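A minimal usage sketch for the new DomPruner, following the same clone-then-prune pattern the DocumentProcessor change above adopts; the HTML snippet and the 0.5 threshold are illustrative only:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;

public class DomPrunerSketch {
    public static void main(String[] args) {
        // Hypothetical page: a link-heavy nav block next to a paragraph of real text.
        String html = """
                <html><body>
                  <nav><a href="/a">a</a> <a href="/b">b</a> <a href="/c">c</a></nav>
                  <article><p>A paragraph of actual page text that should survive pruning.</p></article>
                </body></html>
                """;

        Document doc = Jsoup.parse(html);

        // Same pattern as DocumentProcessor: prune a clone, keep the original intact.
        Document pruned = doc.clone();
        new DomPruner().prune(pruned, 0.5);

        System.out.println(pruned.body().text());
    }
}
```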
@@ -4,11 +4,11 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.google.protobuf.InvalidProtocolBufferException;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
@@ -22,18 +22,16 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.apache.http.HttpStatus;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -52,7 +50,7 @@ import static spark.Spark.get;
import static spark.Spark.halt;

public class EdgeIndexService extends Service {
private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;

private final Logger logger = LoggerFactory.getLogger(getClass());

@@ -66,11 +64,9 @@ public class EdgeIndexService extends Service {
.create();

private static final Histogram wmsa_edge_index_query_time
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
private static final Counter wmsa_edge_index_query_count
= Counter.build().name("wmsa_edge_index_query_count").help("-").register();
private static final Histogram wmsa_edge_index_put_words_time
= Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
= Histogram.build().name("wmsa_edge_index_query_time")
.linearBuckets(50, 50, 15)
.help("-").register();

public static final int DYNAMIC_BUCKET_LENGTH = 7;

@@ -162,12 +158,15 @@ public class EdgeIndexService extends Service {
indexes.initialize(init);
}

private Object putWords(Request request, Response response) {
var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());

synchronized (this) {
putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
putWordsRequest.wordSet, putWordsRequest.getIndex());
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
int idx = req.getIndex();

for (int ws = 0; ws < req.getWordSetCount(); ws++) {
putWords(domainId, urlId, req.getWordSet(ws), idx);
}

response.status(HttpStatus.SC_ACCEPTED);
@@ -175,26 +174,16 @@ public class EdgeIndexService extends Service {
}

public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWordSet wordSet, int idx
) {

wmsa_edge_index_put_words_time.time(() -> {
for (EdgePageWords words : wordSet.values()) {
putWords(domainId, urlId, words, idx);
}
});

}

public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWords words, int idx
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);

for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
IndexBlock block = IndexBlock.values()[words.getIndex()];

for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {

var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

indexWriter.put(header, entry);
};
@@ -257,7 +246,6 @@ public class EdgeIndexService extends Service {
}
finally {
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
wmsa_edge_index_query_count.inc();
}
}

@@ -410,16 +398,6 @@ public class EdgeIndexService extends Service {

}

public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
int queryDepth, int minHitCount, int maxResults) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return LongStream.empty();
}

return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
}

private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {
@@ -9,7 +9,6 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
@@ -18,6 +17,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -37,13 +37,27 @@ public class EdgeIndexClient extends AbstractDynamicClient {
}

@CheckReturnValue
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
EdgePageWordSet wordSet, int writer
)
{
EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);

return this.post(ctx, "/words/", request);
var keywordBuilder =
IndexPutKeywordsReq.newBuilder()
.setDomain(domain.id())
.setUrl(url.id())
.setIndex(writer);

for (var set : wordSet.wordSets.values()) {
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
wordSetBuilder.setIndex(set.block.ordinal());
wordSetBuilder.addAllWords(set.words);
keywordBuilder.addWordSet(wordSetBuilder.build());
}

var req = keywordBuilder.build();

return this.post(ctx, "/words/", req);
}

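A brief sketch of what a call against the new client signature looks like (compare the updated test further down); it is a fragment that assumes an already-constructed EdgeIndexClient and placeholder domain/url ids:

```java
// Assumes: EdgeIndexClient client, int domainId, int urlId are available in scope.
EdgePageWords titleWords = new EdgePageWords(IndexBlock.Title);
titleWords.addAll(Arrays.asList("marginalia", "search"));

client.putWords(Context.internal(),
                new EdgeId<>(domainId),
                new EdgeId<>(urlId),
                new EdgePageWordSet(titleWords),
                0 /* writer index */)
      .blockingSubscribe();
```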
@@ -15,6 +15,7 @@ import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
@@ -36,6 +37,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {

byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);

new Thread(this::journalWriterThread, "Journal Writer").start();

writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
}
@@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}
}

private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);

@Override
@SneakyThrows
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
writeQueue.put(new WriteJob(header, entryData));
}

byteBuffer.clear();
@SneakyThrows
public void journalWriterThread() {

byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
while (true) {
var job = writeQueue.take();

entryData.write(byteBuffer);
writeEntry(job.header, job.entryData);
}
}
private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {

byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
try {
byteBuffer.clear();

while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());

writePositionMarker();
entryData.write(byteBuffer);

byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();

while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);

writePositionMarker();
} catch (IOException e) {
throw new RuntimeException(e);
}
}

@Override
@@ -94,13 +117,11 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}

private void writePositionMarker() throws IOException {
var lock = channel.lock(0, 16, false);
pos = channel.size();
raf.seek(0);
raf.writeLong(pos);
raf.writeLong(dictionaryWriter.size());
raf.seek(pos);
lock.release();
}

public synchronized void close() throws IOException {
@@ -5,16 +5,16 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;

@AllArgsConstructor @Getter
@ToString
public class EdgePutWordsRequest {
public final EdgeId<EdgeDomain> domainId;
public final EdgeId<EdgeUrl> urlId;
public final double quality;
public EdgeId<EdgeDomain> domainId;
public EdgeId<EdgeUrl> urlId;
public double quality;

public final EdgePageWordSet wordSet;
public EdgePageWordSet wordSet;
private int index = 0;
}
@@ -5,14 +5,18 @@ public enum IndexBlock {
Title(1, 1),
Link(2, 1.25),
Top(3, 2),
Middle(4, 3),
Low(5, 4),
Words(6, 6),
Middle(4, 2.5),
Low(5, 3.0),
Words_1(6, 3.0),
Meta(7, 7),
PositionWords(8, 4.5),
Words_2(8, 3.5),
NamesWords(9, 5),
Artifacts(10, 10),
Topic(11, 0.5);
Topic(11, 0.5),
Words_4(12, 4.0),
Words_8(13, 4.5),
Words_16Plus(14, 7.0),
;

public final int id;
public final double sortOrder;
@@ -29,8 +29,12 @@ public class SearchIndexReader implements AutoCloseable {
IndexBlock.Top,
IndexBlock.Middle,
IndexBlock.Low,
IndexBlock.Words,
IndexBlock.NamesWords,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
IndexBlock.Words_16Plus,
};

@Inject
@@ -44,24 +48,29 @@ public class SearchIndexReader implements AutoCloseable {
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var namesIndex = indices.get(IndexBlock.NamesWords);
var positionIndex = indices.get(IndexBlock.PositionWords);
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
var wordsIndex = indices.get(IndexBlock.Words);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Topic);

var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
var words4 = indices.get(IndexBlock.Words_4);
var words8 = indices.get(IndexBlock.Words_8);
var words16 = indices.get(IndexBlock.Words_16Plus);
var artifacts = indices.get(IndexBlock.Artifacts);

queryBuilders = new EnumMap<>(IndexBlock.class);
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1));
queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1));
queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1));
queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));

underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
}

@SafeVarargs
@@ -157,7 +166,7 @@ public class SearchIndexReader implements AutoCloseable {
return block;
}
}
return IndexBlock.Words;
return IndexBlock.Words_1;
}

public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {
@@ -27,7 +27,8 @@ public class IndexQueryBuilder {

public Query build(IndexSearchBudget budget,
LongPredicate filter,
int wordId) {
int wordId)
{
return new QueryForIndices(budget, filter, wordId);
}

@@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.stackoverflow;

import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
@@ -46,8 +46,8 @@ public class StackOverflowPostProcessor {
var keywords = documentKeywordExtractor.extractKeywords(dld);

keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("special:wikipedia");
keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words_1).addJust("special:wikipedia");
keywords.get(IndexBlock.Meta).addJust("special:wikipedia");
keywords.get(IndexBlock.Meta).addJust("js:true");

@@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;

import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@@ -42,8 +42,8 @@ public class WikipediaProcessor {
var keywords = documentKeywordExtractor.extractKeywords(dld);

keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("special:stackoverflow");
keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow");
keywords.get(IndexBlock.Meta).addJust("special:stackoverflow");
keywords.get(IndexBlock.Meta).addJust("js:true");

@@ -1,13 +1,15 @@
package nu.marginalia.wmsa.edge.model.crawl;

import com.dslplatform.json.JsonObject;
import com.dslplatform.json.JsonWriter;
import lombok.Data;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;

import java.util.*;

@Data
public class EdgePageWordSet {
public final Map<IndexBlock, EdgePageWords> wordSets;
public class EdgePageWordSet implements JsonObject {
public Map<IndexBlock, EdgePageWords> wordSets;

public EdgePageWordSet(EdgePageWords... words) {
wordSets = new EnumMap<>(IndexBlock.class);
@@ -45,4 +47,18 @@ public class EdgePageWordSet {
});
return sj.toString();
}

@Override
public void serialize(JsonWriter writer, boolean minimal) {
writer.writeAscii("[");
boolean first = false;
for (var w : wordSets.values()) {
if (!first) first = true;
else writer.writeAscii(", ");

w.serialize(writer, minimal);
}
writer.writeAscii("]}");

}
}
@@ -1,4 +1,7 @@
package nu.marginalia.wmsa.edge.model.crawl;
import com.dslplatform.json.JsonObject;
import com.dslplatform.json.JsonWriter;
import com.dslplatform.json.NumberConverter;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
@@ -8,7 +11,7 @@ import java.util.Collection;
import java.util.List;

@ToString @Getter
public class EdgePageWords {
public class EdgePageWords implements JsonObject {
public final IndexBlock block;
public final List<String> words = new ArrayList<>();

@@ -31,4 +34,19 @@ public class EdgePageWords {
return words.size();
}
public void addJust(String word) { words.add(word); }

@Override
public void serialize(JsonWriter writer, boolean minimal) {
writer.writeAscii("{\"b\":");
NumberConverter.serialize(block.ordinal(), writer);
writer.writeAscii(", \"w\": [");
boolean first = false;
for (var word : words) {
if (!first) first = true;
else { writer.writeAscii(","); }

writer.writeString(word);
}
writer.writeAscii("]}");
}
}
@@ -10,25 +10,31 @@ import java.util.stream.Collectors;

public enum EdgeSearchProfile {
DEFAULT("default",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
0, 1),
MODERN("modern",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
2),
CORPO("corpo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5, 6, 7),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
4, 5, 7),
YOLO("yolo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
4, 5),
ACADEMIA("academia",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
3),
FOOD("food",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
2, 0),
;

@@ -62,7 +62,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
DecoratedSearchResultSet resultSet;
Path screenshotPath = null;
if (null != domain) {
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain);
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);

screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
}
@@ -30,7 +30,7 @@ public class SearchResultValuator {
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);

if (scores.length == 0) {
return IndexBlock.Words.sortOrder;
return IndexBlock.Words_1.sortOrder;
}

final double[] weights = getTermWeights(scores);
@@ -51,7 +51,7 @@ public class FeaturesLoaderTool {
throw new RuntimeException(ex);
}

client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0)
.blockingSubscribe();
});

@@ -141,7 +141,7 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
EC_PAGE_DATA.FEATURES AS FEATURES,

EC_DOMAIN.IP AS IP,
EC_DOMAIN.STATE AS STATE,
EC_URL.STATE AS STATE,
EC_DOMAIN.RANK AS RANK,
EC_DOMAIN.STATE AS DOMAIN_STATE
FROM EC_URL
@@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import org.junit.jupiter.api.Test;

import java.io.IOException;

class DomPrunerTest {
@Test
public void test() throws IOException {

}
}
@@ -1,10 +1,8 @@
package nu.marginalia.wmsa.edge.crawling;

import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.KeywordExtractor;
@@ -12,11 +10,9 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.ranking.BuggyReversePageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@@ -103,6 +99,11 @@ class SentenceExtractorTest {
});
reader.join();
}

@Test
public void testPattern() {
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
}
@Test
void extractSentences() throws IOException {
var data = Path.of("/home/vlofgren/Code/tmp-data/");
@@ -11,12 +11,15 @@ import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
@@ -141,7 +144,7 @@ public class EdgeIndexClientTest {
void putWords(int didx, int idx, double quality, String... words) {
EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
epw.addAll(Arrays.asList(words));
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), quality,
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
new EdgePageWordSet(epw), 0).blockingSubscribe();
}

@@ -61,7 +61,7 @@ class SearchIndexJournalWriterTest {
void put() throws IOException {
writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link),
new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 }));
writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words),
writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1),
new SearchIndexJournalEntry(new long[] { 5, 6, 7 }));
writer.forceWrite();

protocol/build.gradle (new file, 27 lines)
@@ -0,0 +1,27 @@
plugins {
id "com.google.protobuf" version "0.8.19"
id "java"
}
repositories {
gradlePluginPortal()
}
protobuf {
protoc {
artifact = 'com.google.protobuf:protoc:3.0.0'
}
}

sourceSets {
main {
java {
srcDirs 'build/generated/source/proto/main/grpc'
srcDirs 'build/generated/source/proto/main/java'
}
}
}

dependencies {
protobuf files ("def/")

implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.0.0'
}
protocol/def/index.proto (new file, 21 lines)
@@ -0,0 +1,21 @@
syntax = "proto3";

option java_package = "nu.wmsa.wmsa.edge.index.proto";
option java_outer_classname = "IndexProto";
option java_multiple_files = true;

message IndexPutKeywordsReq {
int32 domain = 1;
int32 url = 2;
int32 index = 3;
repeated WordSet wordSet = 4;

message WordSet {
int32 index = 1;
repeated string words = 2;
}
}

message IndexSearchQueryRsp {

}
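A minimal sketch of how the generated message is assembled on the client side, mirroring the builder code in EdgeIndexClient above; it assumes the classes generated from this proto are on the classpath, and the domain/url/index values are placeholders:

```java
import java.util.List;

import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;

public class IndexPutKeywordsReqSketch {
    public static void main(String[] args) {
        var wordSet = IndexPutKeywordsReq.WordSet.newBuilder()
                .setIndex(6)                      // IndexBlock ordinal of the word block (placeholder value)
                .addAllWords(List.of("marginalia", "search"))
                .build();

        var req = IndexPutKeywordsReq.newBuilder()
                .setDomain(1)                     // placeholder EdgeId<EdgeDomain> value
                .setUrl(2)                        // placeholder EdgeId<EdgeUrl> value
                .setIndex(0)                      // which index writer to target
                .addWordSet(wordSet)
                .build();

        // The server walks the word sets much like EdgeIndexService.putWords does.
        System.out.println(req.getWordSetCount() + " word set(s), block " + req.getWordSet(0).getIndex());
    }
}
```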
@@ -1,4 +1,5 @@
rootProject.name = 'wmsa'

include 'marginalia_nu'
include 'third_party'
include 'third_party'
include 'protocol'