October Release (#118)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/118

parent 9a7d052c43
commit df49ccbe59
@@ -175,7 +175,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage"));
     }
@@ -249,7 +249,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=browse:wikipedia.local");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
     }
@@ -259,7 +259,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=define:adiabatic");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
     }
@@ -269,7 +269,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=3%2B3");
        System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("eval"));
     }
marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java (new file, 313 lines)
@@ -0,0 +1,313 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import nu.marginalia.util.AndCardIntSet;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
public class BitSetTest {
|
||||
@org.openjdk.jmh.annotations.State(Scope.Benchmark)
|
||||
public static class State {
|
||||
List<RoaringBitmap> roar = new ArrayList<>();
|
||||
List<AndCardIntSet> acbs = new ArrayList<>();
|
||||
|
||||
List<RoaringBitmap> roarLow = new ArrayList<>();
|
||||
List<RoaringBitmap> roarHigh = new ArrayList<>();
|
||||
|
||||
List<AndCardIntSet> acbsLow = new ArrayList<>();
|
||||
List<AndCardIntSet> acbsHigh = new ArrayList<>();
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() {
|
||||
var rand = new Random();
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int card = 1 + rand.nextInt(10);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbsLow.add(cbs);
|
||||
roarLow.add(rb);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
int card = 1 + rand.nextInt(10000, 20000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
}
|
||||
acbsHigh.add(AndCardIntSet.of(rb));
|
||||
roarHigh.add(rb);
|
||||
}
|
||||
|
||||
|
||||
|
||||
for (int i = 0; i < 100000; i++) {
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
int card = 1 + rand.nextInt(10);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
int card = 1 + rand.nextInt(100);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int card = 1 + rand.nextInt(1000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int card = 1 + rand.nextInt(10000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int card = 1 + rand.nextInt(100000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
Collections.shuffle(acbs);
|
||||
Collections.shuffle(roar);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 5, warmups = 5)
|
||||
// public Object roaringCard(State state) {
|
||||
// long val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size(); i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
// val += RoaringBitmap.andCardinality(state.roar.get(i), state.roar.get(j));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 2, warmups = 2)
|
||||
// public Object roaringCardNorm(State state) {
|
||||
// long val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size()/1000; i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
//
|
||||
// var a = state.roar.get(i);
|
||||
// var b = state.roar.get(j);
|
||||
// val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 5, warmups = 5)
|
||||
// public Object cbsCard(State state) {
|
||||
// long val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size(); i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
// val += AndCardIntSet.andCardinality(state.acbs.get(i), state.acbs.get(j));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
//
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// public Object cbsCardNorm(State state) {
|
||||
// double val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size()/1000; i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
// var a = state.acbs.get(i);
|
||||
// var b = state.acbs.get(j);
|
||||
// val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.cardinality()*b.cardinality()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object cbsLowLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.acbsLow.size(); i++) {
|
||||
for (int j = 0; j < state.acbsLow.size(); j++) {
|
||||
var a = state.acbsLow.get(i);
|
||||
var b = state.acbsLow.get(j);
|
||||
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object cbsHighHigh(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.acbsHigh.size(); i++) {
|
||||
for (int j = 0; j < state.acbsHigh.size(); j++) {
|
||||
var a = state.acbsHigh.get(i);
|
||||
var b = state.acbsHigh.get(j);
|
||||
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object cbsHighLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.acbsHigh.size(); i++) {
|
||||
for (int j = 0; j < state.acbsLow.size(); j++) {
|
||||
var a = state.acbsHigh.get(i);
|
||||
var b = state.acbsLow.get(j);
|
||||
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object roarLowLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.roarLow.size(); i++) {
|
||||
for (int j = 0; j < state.roarLow.size(); j++) {
|
||||
var a = state.roarLow.get(i);
|
||||
var b = state.roarLow.get(j);
|
||||
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object roarHighLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.roarHigh.size(); i++) {
|
||||
for (int j = 0; j < state.roarLow.size(); j++) {
|
||||
var a = state.roarHigh.get(i);
|
||||
var b = state.roarLow.get(j);
|
||||
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object roarHighHigh(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.roarHigh.size(); i++) {
|
||||
for (int j = 0; j < state.roarHigh.size(); j++) {
|
||||
var a = state.roarHigh.get(i);
|
||||
var b = state.roarHigh.get(j);
|
||||
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
}
|
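The class above is a JMH harness comparing RoaringBitmap and AndCardIntSet intersection costs across low- and high-cardinality sets. A hedged sketch of launching it through the standard JMH runner API follows; the launcher class and option values are assumptions, not part of the commit or the repository's build setup.

    import org.openjdk.jmh.runner.Runner;
    import org.openjdk.jmh.runner.options.OptionsBuilder;

    public class BitSetBenchmarkMain {                    // hypothetical launcher, not in the commit
        public static void main(String[] args) throws Exception {
            new Runner(new OptionsBuilder()
                    .include("nu.marginalia.BitSetTest")  // regex matching the benchmark class above
                    .forks(1)                             // fewer forks than the annotations request, for a quick local run
                    .build())
                    .run();
        }
    }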
@ -1,85 +0,0 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class ByteBufferBlockReadVsIndividualRead {
|
||||
|
||||
@State(Scope.Benchmark)
|
||||
public static class ByteBufferState {
|
||||
private MultimapFileLong mmf;
|
||||
private Path file;
|
||||
private static final int size = 800*1024*1024;
|
||||
@Setup(Level.Iteration)
|
||||
@SneakyThrows
|
||||
public void setUp() {
|
||||
file = Files.createTempFile("jmh", ".dat");
|
||||
mmf = MultimapFileLong.forOutput(file, size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
mmf.put(i, i);
|
||||
}
|
||||
}
|
||||
|
||||
@TearDown(Level.Iteration)
|
||||
@SneakyThrows
|
||||
public void tearDown() {
|
||||
mmf.close();
|
||||
Files.delete(file);
|
||||
}
|
||||
|
||||
LongStream basicStream() {
|
||||
return IntStream.range(0, size).mapToLong(mmf::get);
|
||||
}
|
||||
|
||||
LongStream blockStream(int blockSize) {
|
||||
long urlOffset = 0;
|
||||
long endOffset = size;
|
||||
|
||||
long[] arry = new long[blockSize];
|
||||
|
||||
return LongStream
|
||||
.iterate(urlOffset, i -> i< endOffset, i->i+blockSize)
|
||||
.flatMap(pos -> {
|
||||
int sz = (int)(Math.min(pos+blockSize, endOffset) - pos);
|
||||
mmf.read(arry, sz, pos);
|
||||
return Arrays.stream(arry, 0, sz);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// @Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// @Warmup(iterations = 1)
|
||||
public long testBasic(ByteBufferState state) {
|
||||
return state.basicStream().sum();
|
||||
}
|
||||
|
||||
|
||||
@Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
@Warmup(iterations = 0)
|
||||
public long testBlock128(ByteBufferState state) {
|
||||
return state.blockStream(128).sum();
|
||||
}
|
||||
@Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
@Warmup(iterations = 0)
|
||||
public long testBlock1024(ByteBufferState state) {
|
||||
return state.blockStream(1024).sum();
|
||||
}
|
||||
@Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
@Warmup(iterations = 0)
|
||||
public long testBlock8192(ByteBufferState state) {
|
||||
return state.blockStream(8192).sum();
|
||||
}
|
||||
}
|
@@ -0,0 +1,205 @@
package nu.marginalia.util;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.set.hash.TIntHashSet;
import org.roaringbitmap.RoaringBitmap;


public class AndCardIntSet {
    final TIntArrayList backingList;
    long hash;

    public AndCardIntSet() {
        backingList = new TIntArrayList(16);
        backingList.sort();
    }

    public static AndCardIntSet of(int... list) {
        var set = new TIntHashSet(list);
        TIntArrayList lst = new TIntArrayList(set);
        lst.sort();

        return new AndCardIntSet(lst);
    }

    public static AndCardIntSet of(RoaringBitmap bmap) {

        TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
        lst.addAll(bmap.toArray());

        return new AndCardIntSet(lst);
    }


    private AndCardIntSet(TIntArrayList list) {
        backingList = list;
        hash = 0;

        if (list.size() < 128) {
            for (int v : list.toArray()) {
                int bit = hasher.hashInt(v).asInt() % 64;
                hash |= (1L << bit);
            }
        }
        else {
            hash = ~0L;
        }

    }

    private static final HashFunction hasher = Hashing.murmur3_128(0);

    public boolean add(int val) {
        if (!contains(val)) {
            return false;
        }

        if (backingList.size() < 128) {
            int bit = hasher.hashInt(val).asInt() % 64;
            hash |= (1L << bit);
        }
        else {
            hash = ~0L;
        }
        backingList.add(val);
        backingList.sort();
        return true;
    }

    public boolean contains(int val) {
        return backingList.binarySearch(val) >= 0;
    }

    public int getCardinality() {
        return backingList.size();
    }

    public static int andCardinality(AndCardIntSet a, AndCardIntSet b) {

        if (!testHash(a,b)) {
            return 0;
        }

        if (a.getCardinality() + b.getCardinality() < 10) {
            return andLinearSmall(a, b);
        }

        return andLinear(a,b);
    }

    private static int andLinearSmall(AndCardIntSet a, AndCardIntSet b) {
        int sum = 0;
        for (int i = 0; i < a.getCardinality(); i++) {
            for (int j = 0; j < b.getCardinality(); j++) {
                if (a.backingList.getQuick(i) == b.backingList.getQuick(j))
                    sum++;
            }
        }
        return sum;
    }

    private static int andLinear(AndCardIntSet a, AndCardIntSet b) {

        int i = 0, j = 0;
        int card = 0;

        do {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);

            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                i++;
                j++;
                card++;
            }
        } while (i < a.getCardinality() && j < b.getCardinality());

        return card;

    }

    private static boolean testHash(AndCardIntSet a, AndCardIntSet b) {
        return (a.hash & b.hash) != 0;
    }

    public boolean cardinalityExceeds(int val) {
        return getCardinality() >= val;
    }

    public static AndCardIntSet and(AndCardIntSet a, AndCardIntSet b) {
        int i = 0;
        int j = 0;

        TIntArrayList andVals = new TIntArrayList(1 + (int)Math.sqrt(a.getCardinality()));

        while (i < a.getCardinality() && j < b.getCardinality()) {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                andVals.add(a.backingList.getQuick(i));
                i++;
                j++;
            }
        }

        return new AndCardIntSet(andVals);
    }

    public static double weightedProduct(float[] weights, AndCardIntSet a, AndCardIntSet b) {
        int i = 0;
        int j = 0;

        double sum = 0;

        if (a.getCardinality() + b.getCardinality() < 10) {
            return weightedProductSmall(weights, a, b);
        }

        do {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                sum += weights[a.backingList.getQuick(i)];
                i++;
                j++;
            }
        } while (i < a.getCardinality() && j < b.getCardinality());

        return sum;
    }


    private static double weightedProductSmall(float[] weights, AndCardIntSet a, AndCardIntSet b) {
        double sum = 0;

        for (int i = 0; i < a.getCardinality(); i++) {
            for (int j = 0; j < b.getCardinality(); j++) {
                int av = a.backingList.getQuick(i);
                int bv = b.backingList.getQuick(j);
                if (av == bv)
                    sum+=weights[av];
            }
        }

        return sum;
    }

    public double mulAndSum(float[] weights) {
        double sum = 0;
        for (int i = 0; i < backingList.size(); i++) {
            sum += weights[backingList.getQuick(i)];
        }
        return sum;
    }

    public int[] toArray() {
        return backingList.toArray();
    }

    public TIntArrayList values() {
        return backingList;
    }
}
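For illustration, a minimal sketch of how the class above can be used; only methods that appear in the diff are exercised, and the element values and weights are invented.

    import nu.marginalia.util.AndCardIntSet;
    import org.roaringbitmap.RoaringBitmap;

    class AndCardIntSetExample {                        // hypothetical, not part of the commit
        public static void main(String[] args) {
            AndCardIntSet a = AndCardIntSet.of(RoaringBitmap.bitmapOf(1, 5, 9, 42));
            AndCardIntSet b = AndCardIntSet.of(5, 9, 100);

            // Intersection cardinality; the 64-bit hash filter short-circuits disjoint sets.
            System.out.println(AndCardIntSet.andCardinality(a, b)); // 2 (both contain 5 and 9)

            // Weighted overlap, indexing the weights array by element value.
            float[] weights = new float[101];
            weights[5] = 0.5f;
            weights[9] = 0.25f;
            System.out.println(AndCardIntSet.weightedProduct(weights, a, b)); // 0.75
        }
    }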
@@ -0,0 +1,52 @@
package nu.marginalia.util;

public class BrailleBlockPunchCards {

    public static String printBits(int val, int bits) {
        StringBuilder builder = new StringBuilder();

        for (int b = 0; b < bits; b+=8, val>>>=8) {
            builder.append((char)('\u2800'+bin2brail(val)));
        }

        return builder.toString();
    }

    /* The braille block in unicode U2800 is neat because it contains
     * 8 "bits", but for historical reasons, they're addressed in a bit
     * of an awkward way. Braille used to be a 2x6 grid, but it was extended
     * to 2x8.
     *
     * It's addressed as follows
     *
     *   0 3
     *   1 4
     *   2 5
     *   6 7   <-- extended braille
     *
     * We want to use it as a dot matrix to represent bits. To do that we need
     * to do this transformation:
     *
     *   0 1 2 3 4 5 6 7   native order bits
     *   | | |  \ _\__\/   |
     *   | | | /  \  \ \   |
     *   0 1 2 6 3 4 5 7   braille order bits
     *
     *   01 02 04 08 10 20 40 80
     *   01+02+04       +80      : &0x87
     *   <<       10+20+40       : &0x70, <<1
     *         08 >> >> >>       : &0x08, >>3
     *
     * Or in other words we do
     *   (v & 0x87)
     *   | ((v & 0x70) >> 1)
     *   | ((v & 0x08) << 3)
     *
     * Thanks for coming to my TED talk.
     */

    private static char bin2brail(int v) {
        return (char)((v & 0x87) | ((v & 0x70) >> 1) | ((v & 0x08) << 3));
    }
}
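As a worked example of the transformation described in the comment (the input value is made up): printBits emits one braille cell per 8 bits, least-significant byte first, so 0xFF00 renders as a blank cell followed by a fully dotted one.

    // Hypothetical call; 0xFF00 = low byte 0x00, high byte 0xFF.
    String dots = BrailleBlockPunchCards.printBits(0xFF00, 16);
    System.out.println(dots); // "⠀⣿" — U+2800 (empty cell) then U+28FF (all eight dots set)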
@@ -1,5 +1,7 @@
 package nu.marginalia.util;
 
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -14,13 +16,13 @@ public class ListChunker {
      *
      * @see List#subList
      */
-    public static <T> List<List<T>> chopList(List<T> data, int size) {
+    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
         if (data.isEmpty())
             return Collections.emptyList();
         else if (data.size() < size)
             return List.of(data);
 
-        final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
+        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
 
         for (int i = 0; i < data.size(); i+=size) {
             ret.add(data.subList(i, Math.min(data.size(), i+size)));
@@ -0,0 +1,33 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;

/*
 * End-of-page mark that's used as a sentinel to verify that
 * the BTreeWriter's caller actually writes as much as they say
 * they want to. (Failing to do so will corrupt the tree)
 *
 */
public class BTreeDogEar {

    private MultimapFileLongSlice sentinelSlice;

    public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
        if (header.numEntries() > 3) {
            sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
            sentinelSlice.put(0, 4L);
            sentinelSlice.put(1, 5L);
            sentinelSlice.put(2, 1L);
        }
    }

    public boolean verify() {
        if (sentinelSlice == null)
            return true;

        return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2);
    }

}
@@ -0,0 +1,146 @@
package nu.marginalia.util.btree;

import java.util.Arrays;

public class BTreeQueryBuffer {
    public final long[] data;
    public int end;

    private int read = 0;
    private int write = 0;

    public BTreeQueryBuffer(int size) {
        this.data = new long[size];
        this.end = size;
    }

    public BTreeQueryBuffer(long [] data, int size) {
        this.data = data;
        this.end = size;
    }

    private BTreeQueryBuffer(long [] data) {
        this.data = data;
        this.end = data.length;
    }

    public BTreeQueryBuffer[] split(int... splitPoints) {
        BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];

        ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
        for (int i = 1; i < splitPoints.length; i++) {
            ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
        }
        ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));

        return ret;
    }

    public void gather(BTreeQueryBuffer... buffers) {
        int start = 0;

        for (var buffer : buffers) {
            System.arraycopy(buffer.data, 0, data, start, buffer.end);
            start += buffer.end;
        }

        this.read = 0;
        this.write = 0;
        this.end = start;
    }

    public long[] copyData() {
        return Arrays.copyOf(data, end);
    }

    public void retainAll() {
        read = write = end;
    }

    public boolean isEmpty() {
        return end == 0;
    }

    public int size() {
        return end;
    }

    public long currentValue() {
        return data[read];
    }

    public boolean rejectAndAdvance() {
        return ++read < end;
    }

    public boolean retainAndAdvance() {
        if (read != write) {
            long tmp = data[write];
            data[write] = data[read];
            data[read] = tmp;
        }

        write++;

        return ++read < end;
    }

    public boolean hasMore() {
        return read < end;
    }

    public void finalizeFiltering() {
        end = write;
        read = 0;
        write = 0;
    }

    public void startFilterForRange(int pos, int end) {
        read = write = pos;
        this.end = end;
    }

    public void reset() {
        end = data.length;
        read = 0;
        write = 0;
    }

    public void zero() {
        end = 0;
        read = 0;
        write = 0;
        Arrays.fill(data, 0);
    }

    public void uniq() {
        if (end <= 1) return;

        long prev = currentValue();
        retainAndAdvance();

        while (hasMore()) {

            long val = currentValue();

            if (prev == val) {
                rejectAndAdvance();
            } else {
                retainAndAdvance();
                prev = val;
            }

        }

        finalizeFiltering();
    }

    public String toString() {
        return getClass().getSimpleName() + "[" +
            "read = " + read +
            ",write = " + write +
            ",end = " + end +
            ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
    }

}
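A minimal sketch of the retain/reject cursor protocol above, with invented values: retained entries are swapped toward the front of the array and finalizeFiltering() shrinks the buffer to just those.

    import nu.marginalia.util.btree.BTreeQueryBuffer;
    import java.util.Arrays;

    class BufferFilterExample {                          // hypothetical, not part of the commit
        public static void main(String[] args) {
            BTreeQueryBuffer buffer = new BTreeQueryBuffer(new long[] {1, 3, 3, 7, 9}, 5);

            buffer.uniq();                               // collapse the duplicate 3

            while (buffer.hasMore()) {
                if (buffer.currentValue() > 2) {
                    buffer.retainAndAdvance();           // keep values above 2, compacted to the front
                } else {
                    buffer.rejectAndAdvance();
                }
            }
            buffer.finalizeFiltering();                  // shrink the buffer to the retained entries

            System.out.println(Arrays.toString(buffer.copyData())); // [3, 7, 9]
        }
    }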
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
@ -14,70 +16,275 @@ public class BTreeReader {
|
||||
|
||||
private final MultimapSearcher indexSearcher;
|
||||
private final MultimapSearcher dataSearcher;
|
||||
private final BTreeHeader header;
|
||||
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
|
||||
this.file = file;
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader(long fileOffset) {
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
|
||||
this.file = file;
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
this.header = createHeader(file, offset);
|
||||
}
|
||||
|
||||
public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
|
||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
public int numEntries() {
|
||||
return header.numEntries();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void retainEntries(BTreeQueryBuffer buffer) {
|
||||
if (header.layers() == 0) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
pointer.retainData(buffer);
|
||||
}
|
||||
retainSingle(buffer);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void rejectEntries(BTreeQueryBuffer buffer) {
|
||||
if (header.layers() == 0) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
pointer.rejectData(buffer);
|
||||
}
|
||||
rejectSingle(buffer);
|
||||
}
|
||||
|
||||
private void retainSingle(BTreeQueryBuffer buffer) {
|
||||
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
for (; buffer.hasMore(); pointer.resetToRoot()) {
|
||||
|
||||
long val = buffer.currentValue() & ctx.equalityMask();
|
||||
|
||||
if (!pointer.walkToData(val)) {
|
||||
buffer.rejectAndAdvance();
|
||||
continue;
|
||||
}
|
||||
|
||||
pointer.retainData(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
private void rejectSingle(BTreeQueryBuffer buffer) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
for (; buffer.hasMore(); pointer.resetToRoot()) {
|
||||
|
||||
long val = buffer.currentValue() & ctx.equalityMask();
|
||||
|
||||
if (pointer.walkToData(val) && pointer.containsData(val)) {
|
||||
buffer.rejectAndAdvance();
|
||||
}
|
||||
else {
|
||||
buffer.retainAndAdvance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @return file offset of entry matching keyRaw, negative if absent
|
||||
*/
|
||||
public long findEntry(BTreeHeader header, final long keyRaw) {
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
public long findEntry(final long keyRaw) {
|
||||
final long key = keyRaw & ctx.equalityMask();
|
||||
final long dataAddress = header.dataOffsetLongs();
|
||||
|
||||
final long searchStart;
|
||||
final long numEntries;
|
||||
BTreePointer ip = new BTreePointer(header);
|
||||
|
||||
if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
|
||||
searchStart = dataAddress;
|
||||
numEntries = header.numEntries();
|
||||
}
|
||||
else {
|
||||
long dataLayerOffset = searchIndex(header, key);
|
||||
if (dataLayerOffset < 0) {
|
||||
return dataLayerOffset;
|
||||
while (!ip.isDataLayer())
|
||||
ip.walkToChild(key);
|
||||
|
||||
return ip.findData(key);
|
||||
}
|
||||
|
||||
searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
|
||||
numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
|
||||
public void readData(long[] data, int n, long pos) {
|
||||
file.read(data, n, header.dataOffsetLongs() + pos);
|
||||
}
|
||||
|
||||
public long[] queryData(long[] urls, int offset) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
long[] ret = new long[urls.length];
|
||||
|
||||
for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
|
||||
if (pointer.walkToData(urls[i])) {
|
||||
long dataAddress = pointer.findData(urls[i]);
|
||||
if (dataAddress >= 0) {
|
||||
ret[i] = file.get(dataAddress + offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Find the range of values so that prefixStart <= n < prefixNext */
|
||||
public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
|
||||
long lowerBoundStart = lowerBound(prefixStart);
|
||||
long lowerBoundEnd = lowerBound(prefixNext);
|
||||
|
||||
return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
|
||||
}
|
||||
|
||||
private long lowerBound(long key) {
|
||||
key &= ctx.equalityMask();
|
||||
|
||||
BTreePointer ip = new BTreePointer(header);
|
||||
|
||||
while (!ip.isDataLayer())
|
||||
ip.walkToChild(key);
|
||||
|
||||
return ip.findDataLower(key);
|
||||
}
|
||||
|
||||
private class BTreePointer {
|
||||
private final long[] layerOffsets;
|
||||
|
||||
private int layer;
|
||||
private long offset;
|
||||
private long boundary;
|
||||
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + "[" +
|
||||
"layer = " + layer + " ," +
|
||||
"offset = " + offset + "]";
|
||||
}
|
||||
|
||||
public BTreePointer(BTreeHeader header) {
|
||||
layer = header.layers() - 1;
|
||||
offset = 0;
|
||||
layerOffsets = header.getRelativeLayerOffsets(ctx);
|
||||
boundary = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
public void resetToRoot() {
|
||||
this.layer = header.layers() - 1;
|
||||
this.offset = 0;
|
||||
this.boundary = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
public int layer() {
|
||||
return layer;
|
||||
}
|
||||
|
||||
public boolean walkToChild(long key) {
|
||||
final long indexAddress = header.indexOffsetLongs();
|
||||
|
||||
final long indexLayerBlockOffset = layerOffsets[layer] + offset;
|
||||
|
||||
final long searchStart = indexAddress + indexLayerBlockOffset;
|
||||
final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
|
||||
|
||||
if (nextLayerOffset < 0)
|
||||
return false;
|
||||
|
||||
layer --;
|
||||
boundary = file.get(searchStart + offset);
|
||||
offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean walkToData(long key) {
|
||||
while (!isDataLayer()) {
|
||||
if (!walkToChild(key)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isDataLayer() {
|
||||
return layer < 0;
|
||||
}
|
||||
|
||||
public boolean containsData(long key) {
|
||||
return findData(key) >= 0;
|
||||
}
|
||||
|
||||
public long findData(long key) {
|
||||
if (layer > 0) {
|
||||
throw new IllegalStateException("Looking for data in an index layer");
|
||||
}
|
||||
|
||||
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
|
||||
|
||||
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
private long searchIndex(BTreeHeader header, long key) {
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
final long indexAddress = header.indexOffsetLongs();
|
||||
|
||||
long layerOffset = 0;
|
||||
|
||||
for (int i = header.layers() - 1; i >= 0; --i) {
|
||||
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
|
||||
|
||||
final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
|
||||
if (nextLayerOffset < 0)
|
||||
return nextLayerOffset;
|
||||
|
||||
layerOffset = blockSize * (nextLayerOffset + layerOffset);
|
||||
public long findDataLower(long key) {
|
||||
if (layer > 0) {
|
||||
throw new IllegalStateException("Looking for data in an index layer");
|
||||
}
|
||||
|
||||
return layerOffset;
|
||||
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
|
||||
|
||||
return dataSearcher.binarySearchLower(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
private long relativePositionInIndex(long key, long start, long n) {
|
||||
return indexSearcher.binarySearchUpper(key, start, n) - start;
|
||||
public void retainData(BTreeQueryBuffer buffer) {
|
||||
|
||||
long dataOffset = findData(buffer.currentValue());
|
||||
if (dataOffset >= 0) {
|
||||
buffer.retainAndAdvance();
|
||||
|
||||
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
|
||||
int numEntries =
|
||||
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
|
||||
|
||||
if (buffer.currentValue() <= boundary) {
|
||||
file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
|
||||
}
|
||||
}
|
||||
else {
|
||||
buffer.rejectAndAdvance();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void rejectData(BTreeQueryBuffer buffer) {
|
||||
|
||||
long dataOffset = findData(buffer.currentValue());
|
||||
if (dataOffset >= 0) {
|
||||
buffer.rejectAndAdvance();
|
||||
|
||||
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
|
||||
int numEntries =
|
||||
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
|
||||
|
||||
if (buffer.currentValue() <= boundary) {
|
||||
file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
|
||||
}
|
||||
}
|
||||
else {
|
||||
buffer.retainAndAdvance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@@ -3,6 +3,8 @@ package nu.marginalia.util.btree;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
@@ -10,6 +12,7 @@ import java.io.IOException;
 public class BTreeWriter {
     private final BTreeContext ctx;
     private final MultimapFileLongSlice map;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
 
     public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
         this.map = map;
@@ -39,7 +42,16 @@ public class BTreeWriter {
 
         header.write(map, offset);
 
-        writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
+        var slice = map.atOffset(header.dataOffsetLongs());
+
+        BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);
+
+        writeIndexCallback.write(slice);
+
+        if (!dogEar.verify()) {
+            logger.error("Dog ear was not overwritten: {}", header);
+        }
+
         if (header.layers() < 1) { // The data is too small to benefit from indexing
             return ctx.calculateSize(numEntries);
@ -1,136 +0,0 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapSearcher;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class CachingBTreeReader {
|
||||
|
||||
private final MultimapFileLong file;
|
||||
public final BTreeContext ctx;
|
||||
|
||||
private final MultimapSearcher dataSearcher;
|
||||
|
||||
public CachingBTreeReader(MultimapFileLong file, BTreeContext ctx) {
|
||||
this.file = file;
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader(long fileOffset) {
|
||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||
}
|
||||
|
||||
public BTreeCachedIndex prepareCache(BTreeHeader header) {
|
||||
return new BTreeCachedIndex(header);
|
||||
}
|
||||
/**
|
||||
*
|
||||
* @return file offset of entry matching keyRaw, negative if absent
|
||||
*/
|
||||
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
|
||||
BTreeHeader header = cache.header;
|
||||
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
final long key = keyRaw & ctx.equalityMask();
|
||||
final long dataAddress = header.dataOffsetLongs();
|
||||
|
||||
final long searchStart;
|
||||
final long numEntries;
|
||||
|
||||
if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
|
||||
searchStart = dataAddress;
|
||||
numEntries = header.numEntries();
|
||||
}
|
||||
else {
|
||||
cache.load();
|
||||
|
||||
long dataLayerOffset = searchIndex(header, cache, key);
|
||||
if (dataLayerOffset < 0) {
|
||||
return dataLayerOffset;
|
||||
}
|
||||
|
||||
searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
|
||||
numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
|
||||
}
|
||||
|
||||
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
long layerOffset = 0;
|
||||
|
||||
for (int i = header.layers() - 1; i >= 0; --i) {
|
||||
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
|
||||
|
||||
final long nextLayerOffset = cache.relativePositionInIndex(key, (int) indexLayerBlockOffset, blockSize);
|
||||
if (nextLayerOffset < 0)
|
||||
return nextLayerOffset;
|
||||
|
||||
layerOffset = blockSize * (nextLayerOffset + layerOffset);
|
||||
}
|
||||
|
||||
return layerOffset;
|
||||
}
|
||||
|
||||
/** A cache for the BTree index data that will drastically reduce the number of disk reads
|
||||
* for repeated queries against the same tree. The memory consumption is typically very low
|
||||
* and the disk access pattern for reading the entire index relatively cheap.
|
||||
*/
|
||||
public class BTreeCachedIndex {
|
||||
long[] indexData;
|
||||
final BTreeHeader header;
|
||||
|
||||
final int indexedDataSize;
|
||||
|
||||
public BTreeCachedIndex(BTreeHeader header) {
|
||||
this.header = header;
|
||||
indexedDataSize = header.numEntries();
|
||||
}
|
||||
|
||||
public void load() {
|
||||
if (indexData != null)
|
||||
return;
|
||||
|
||||
int size = (int)(header.dataOffsetLongs() - header.indexOffsetLongs());
|
||||
indexData = new long[size];
|
||||
file.read(indexData, header.indexOffsetLongs());
|
||||
}
|
||||
|
||||
long relativePositionInIndex(long key, int fromIndex, int n) {
|
||||
int low = 0;
|
||||
int high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = (low + high) >>> 1;
|
||||
long midVal = indexData[fromIndex + mid];
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
public long sizeBytes() {
|
||||
return isLoaded() ? 8L*indexData.length : 0;
|
||||
}
|
||||
|
||||
public int getIndexedDataSize() {
|
||||
return indexedDataSize;
|
||||
}
|
||||
|
||||
public boolean isLoaded() {
|
||||
return indexData != null;
|
||||
}
|
||||
}
|
||||
}
|
@@ -19,7 +19,7 @@ public record BTreeContext(int MAX_LAYERS,
     }
 
     public int numIndexLayers(int numEntries) {
-        if (numEntries <= BLOCK_SIZE_WORDS*2) {
+        if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) {
             return 0;
         }
         for (int i = 1; i < MAX_LAYERS; i++) {
@@ -26,7 +26,6 @@ public class DictionaryData
 
         if (rb == -1) {
             int end = activeBank.getEnd();
-            logger.debug("Switching bank @ {}", end);
             var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
             rb = newBank.add(key);
 
@@ -16,7 +16,7 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
  * Spiritually influenced by GNU Trove's hash maps
  * LGPL 2.1
  */
-public class DictionaryHashMap {
+public class DictionaryHashMap implements DictionaryMap {
     private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
     private static final Gauge probe_count_metrics
             = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
@@ -81,6 +81,7 @@ public class DictionaryHashMap {
         }
     }
 
+    @Override
     public int size() {
         return sz.get();
     }
@@ -97,6 +98,7 @@ public class DictionaryHashMap {
         buffers[buffer].put(bufferIdx, val);
     }
 
+    @Override
     public int put(long key) {
 
         long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
@@ -143,6 +145,7 @@ public class DictionaryHashMap {
         return di;
     }
 
+    @Override
    public int get(long key) {
        final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
        final long cell = hash % hashTableSize;
@@ -0,0 +1,9 @@
package nu.marginalia.util.dict;

public interface DictionaryMap {
    int size();

    int put(long key);

    int get(long key);
}
@@ -72,7 +72,7 @@ public enum UnicodeRanges {
         int count = 0;
         int max = sensitive ? 15 : 100;
 
-        for (int i = 0; i < text.length(); i++) {
+        for (int i = 0; i < Math.min(2000, text.length()); i++) {
             char c = text.charAt(i);
             if (c >= min && c <= max) {
                 if (count++ > max) {
@@ -88,6 +88,9 @@ public class WordPatterns {
     }
 
     public static boolean hasWordQualities(String s) {
+        if (s.isBlank())
+            return false;
+
         int start = 0;
         int end = s.length();
         if (s.charAt(0) == '#') start++;
@@ -95,13 +98,14 @@ public class WordPatterns {
 
         for (int i = start; i < end; i++) {
             char c = s.charAt(i);
-            if (!("_@.'+-".indexOf(c) >= 0)
+            if (("_@.'+-".indexOf(c) < 0)
                     && !(c >= 'a' && c <= 'z')
                     && !(c >= 'A' && c <= 'Z')
                     && !(c >= '0' && c <= '9')
                     && !(c >= '\u00C0' && c <= '\u00D6')
                     && !(c >= '\u00D8' && c <= '\u00f6')
-                    && !(c >= '\u00f8' && c <= '\u00ff')) {
+                    && !(c >= '\u00f8' && c <= '\u00ff'))
+            {
                 return false;
             }
         }
@@ -119,10 +123,14 @@ public class WordPatterns {
         if (!filter(s)) {
             return true;
         }
-        if (topWords.contains(s.toLowerCase())) {
+        if (isTopWord(s)) {
             return true;
         }
         return false;
     }
 
+    public static boolean isTopWord(String s) {
+        return topWords.contains(s.toLowerCase());
+    }
+
 }
@ -2,8 +2,10 @@ package nu.marginalia.util.language.processing;
|
||||
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
@ -20,14 +22,9 @@ public class DocumentKeywordExtractor {
|
||||
private final NameCounter nameCounter;
|
||||
private final SubjectCounter subjectCounter;
|
||||
|
||||
private final TermFrequencyDict dict;
|
||||
private final double docCount;
|
||||
|
||||
@Inject
|
||||
public DocumentKeywordExtractor(TermFrequencyDict dict) {
|
||||
this.dict = dict;
|
||||
docCount = dict.docCount();
|
||||
|
||||
keywordExtractor = new KeywordExtractor();
|
||||
|
||||
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
|
||||
@ -36,69 +33,105 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
|
||||
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
|
||||
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
|
||||
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
|
||||
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
|
||||
tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||
|
||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
||||
|
||||
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple);
|
||||
|
||||
return new EdgePageWordSet(
|
||||
createWords(IndexBlock.Subjects, subjects),
|
||||
createWords(IndexBlock.Title, titleWords),
|
||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
||||
createWords(IndexBlock.Tfidf_Top, topKeywords),
|
||||
createWords(IndexBlock.Tfidf_Middle, midKeywords),
|
||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
||||
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
|
||||
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
|
||||
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
|
||||
getWordPositions(keywordMetadata, documentLanguageData);
|
||||
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
|
||||
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
|
||||
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
|
||||
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||
|
||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
||||
|
||||
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
var wordSet = new EdgePageWordSet(
|
||||
createWords(IndexBlock.Subjects, subjects),
|
||||
createWords(IndexBlock.Title, titleWords),
|
||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
||||
createWords(IndexBlock.Tfidf_Top, topKeywords),
|
||||
createWords(IndexBlock.Tfidf_Middle, midKeywords),
|
||||
createWords(IndexBlock.Tfidf_Lower, lowKeywords),
|
||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
||||
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||
createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf),
|
||||
createWords(keywordMetadata, IndexBlock.Subjects, subjects),
|
||||
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
|
||||
getSimpleWords(wordSet, documentLanguageData,
|
||||
getSimpleWords(keywordMetadata, wordSet, documentLanguageData,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
|
||||
|
||||
return wordSet;
|
||||
}
|
||||
|
||||
private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
||||
|
||||
public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||
Map<String, Integer> ret = keywordMetadata.positionMask();
|
||||
|
||||
int posCtr = 0;
|
||||
for (var sent : dld.titleSentences) {
|
||||
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||
|
||||
for (var word : sent) {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getNames(sent)) {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
}
|
||||
posCtr+=4;
|
||||
for (var sent : dld.sentences) {
|
||||
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||
|
||||
for (var word : sent) {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getNames(sent)) {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
posCtr++;
|
||||
}
|
||||
}
|
||||
|
||||
private int bitwiseOr(int a, int b) {
|
||||
return a | b;
|
||||
}
|
||||
|
||||
|
||||
private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
||||
|
||||
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
|
||||
|
||||
int start = 0;
|
||||
int lengthGoal = 32;
|
||||
|
||||
for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
|
||||
for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) {
|
||||
IndexBlock block = blocks[blockIdx];
|
||||
Set<String> words = new HashSet<>(lengthGoal+100);
|
||||
Set<EdgePageWords.Entry> words = new HashSet<>(lengthGoal+100);
|
||||
|
||||
int pos;
|
||||
int length = 0;
|
||||
@ -110,55 +143,26 @@ public class DocumentKeywordExtractor {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||
words.add(w);
|
||||
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed())));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (var names : keywordExtractor.getNames(sent)) {
|
||||
var rep = new WordRep(sent, names);
|
||||
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||
|
||||
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed)));
|
||||
}
|
||||
}
|
||||
wordSet.append(block, words);
|
||||
start = pos;
|
||||
lengthGoal+=32;
|
||||
}
|
||||
|
||||
if (start < documentLanguageData.sentences.length) {
|
||||
|
||||
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
|
||||
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
|
||||
var sent = documentLanguageData.sentences[pos];
|
||||
for (var word : sent) {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
|
||||
counts.merge(w, 1, Integer::sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> lastSet;
|
||||
if (counts.size() < 1024) {
|
||||
lastSet = counts.keySet();
|
||||
}
|
||||
else {
|
||||
lastSet = counts.entrySet().stream()
|
||||
.sorted(Comparator.comparing(e -> {
|
||||
double N = docCount; // Number of documents in term freq dictionary
|
||||
|
||||
// Caveat: This is actually the *negated* term score, because the second logarithm has
|
||||
// its parameter inverted (log(a^b) = b log(a); here b = -1)
|
||||
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
|
||||
}))
|
||||
.map(Map.Entry::getKey)
|
||||
.limit(1024)
|
||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
||||
}
|
||||
|
||||
wordSet.append(blocks[blocks.length - 1], lastSet);
|
||||
}
|
||||
}
|
||||
|
||||
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
||||
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
||||
private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
||||
Set<String> reps = new HashSet<>();
|
||||
|
||||
for (var sent : documentLanguageData.sentences) {
|
||||
@ -183,7 +187,7 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
}
|
||||
}
|
||||
return reps;
|
||||
return new ArrayList<>(reps);
|
||||
}
|
||||
|
||||
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
|
||||
@ -193,7 +197,21 @@ public class DocumentKeywordExtractor {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
|
||||
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
|
||||
public EdgePageWords createWords(KeywordMetadata metadata,
|
||||
IndexBlock block,
|
||||
Collection<WordRep> words) {
|
||||
|
||||
Set<EdgePageWords.Entry> entries = new HashSet<>(words.size());
|
||||
for (var word : words) {
|
||||
|
||||
String flatWord = AsciiFlattener.flattenUnicode(word.word);
|
||||
if (!WordPatterns.hasWordQualities(flatWord)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed)));
|
||||
}
|
||||
|
||||
return new EdgePageWords(block, entries);
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,19 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
|
||||
public class KeywordCounter {
|
||||
private final KeywordExtractor keywordExtractor;
|
||||
@ -19,72 +23,78 @@ public class KeywordCounter {
|
||||
public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
|
||||
this.dict = dict;
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
this.docCount = (double) dict.docCount();
|
||||
this.docCount = dict.docCount();
|
||||
}
|
||||
|
||||
public WordHistogram countHisto(DocumentLanguageData dld) {
|
||||
HashMap<String, Integer> counts = new HashMap<>(15000);
|
||||
public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||
TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f);
|
||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
|
||||
|
||||
|
||||
for (var sent : dld.sentences) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
for (var span : keywords) {
|
||||
if (span.size() == 1 &&
|
||||
WordPatterns.isStopWord(sent.words[span.start]))
|
||||
|
||||
if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String stemmed = sent.constructStemmedWordFromSpan(span);
|
||||
var rep = new WordRep(sent, span);
|
||||
|
||||
counts.merge(stemmed, 1, Integer::sum);
|
||||
instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
|
||||
counts.adjustOrPutValue(rep.stemmed, 1, 1);
|
||||
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
|
||||
if (instanceSet.size() < 250) {
|
||||
instanceSet.add(rep);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
|
||||
HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
|
||||
List<WordRep> tfIdfHigh = new ArrayList<>();
|
||||
|
||||
Set<WordRep> h5 = new HashSet<>(2500);
|
||||
Set<WordRep> h10 = new HashSet<>(500);
|
||||
Set<WordRep> h15 = new HashSet<>(500);
|
||||
int maxVal = maxValue(counts);
|
||||
|
||||
int doubleWordCount = 0;
|
||||
counts.forEachEntry((key, cnt) -> {
|
||||
int value = getTermValue(key, cnt, maxVal);
|
||||
|
||||
for (var entry : counts.entrySet()) {
|
||||
double value = getTermValue(entry, maxC);
|
||||
tfIdf.put(key, new WordFrequencyData(cnt, value));
|
||||
|
||||
double avgCnt = entry.getValue();
|
||||
String wordStemmed = entry.getKey();
|
||||
|
||||
Set<WordRep> histogram;
|
||||
if (value < -3 && avgCnt>1) histogram = h15;
|
||||
else if (value < -1.75 && avgCnt>1) histogram = h10;
|
||||
else if (value < -1 &&
|
||||
(!wordStemmed.contains("_") || doubleWordCount++ < 50))
|
||||
histogram = h5;
|
||||
else continue;
|
||||
|
||||
histogram.addAll(instances.get(wordStemmed));
|
||||
}
|
||||
return new WordHistogram(h5, h10, h15);
|
||||
if (cnt > 1 && value > 100) {
|
||||
tfIdfHigh.addAll(instances.get(key));
|
||||
}
|
||||
|
||||
private static final Pattern separator = Pattern.compile("_");
|
||||
return true;
|
||||
});
|
||||
|
||||
public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
|
||||
String key = e.getKey();
|
||||
if (key.contains("_")) {
|
||||
String[] parts = separator.split(e.getKey());
|
||||
return tfIdfHigh;
|
||||
}
|
||||
|
||||
private int maxValue(TObjectIntHashMap<?> map) {
|
||||
int maxC = 0;
|
||||
for (int c : map.values()) {
|
||||
maxC = max(c, maxC);
|
||||
}
|
||||
return maxC;
|
||||
}
|
||||
|
||||
public int getTermValue(String key, int count, double maxValue) {
|
||||
if (key.indexOf('_') >= 0) {
|
||||
String[] parts = StringUtils.split(key, '_');
|
||||
double totalValue = 0.;
|
||||
for (String part : parts) {
|
||||
totalValue += value(part, e.getValue(), maxValue);
|
||||
totalValue += value(part, count, maxValue);
|
||||
}
|
||||
return totalValue / parts.length;
|
||||
return normalizeValue(totalValue / parts.length);
|
||||
}
|
||||
else {
|
||||
return value(key, e.getValue(), maxValue);
|
||||
return normalizeValue(value(key, count, maxValue));
|
||||
}
|
||||
}
|
||||
|
||||
int normalizeValue(double v) {
|
||||
return (int)(-v*75);
|
||||
}
|
||||
|
||||
double value(String key, double value, double maxValue) {
|
||||
double freq = dict.getTermFreqStemmed(key);
|
||||
if (freq < 1) {
|
||||
@ -93,5 +103,5 @@ public class KeywordCounter {
|
||||
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
|
||||
}
|
||||
|
||||
public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
|
||||
public record WordFrequencyData(int count, int tfIdfNormalized) { }
|
||||
}
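
To make the scoring above easier to follow, here is a small, self-contained sketch of the same shape of tf-idf weighting. The DOC_COUNT constant, the docFreq map and the class and method names are illustrative stand-ins rather than the project's API; the real counts come from TermFrequencyDict, while the negated-score convention and the *75 normalization mirror the code visible in the diffs above.

import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

class TfIdfSketch {
    // Hypothetical corpus size; the real value comes from TermFrequencyDict.docCount().
    static final double DOC_COUNT = 1_000_000;

    /** Negated tf-idf, matching the sign convention in the diffs above:
     *  the most relevant terms get the most negative scores, so a plain
     *  ascending sort puts them first. */
    static double negatedTermScore(int countInDocument, long documentsWithTerm) {
        return (1 + Math.log(countInDocument))
                * Math.log((1. + documentsWithTerm) / DOC_COUNT);
    }

    /** Mirror of KeywordCounter.normalizeValue(): flip the sign and scale to an int. */
    static int normalize(double negatedScore) {
        return (int) (-negatedScore * 75);
    }

    /** Keep the 1024 best terms, like the tail-block selection in DocumentKeywordExtractor. */
    static Set<String> selectTop(Map<String, Integer> counts, Map<String, Long> docFreq) {
        return counts.entrySet().stream()
                .sorted(Comparator.comparingDouble((Map.Entry<String, Integer> e) ->
                        negatedTermScore(e.getValue(), docFreq.getOrDefault(e.getKey(), 0L))))
                .map(Map.Entry::getKey)
                .limit(1024)
                .collect(Collectors.toCollection(LinkedHashSet::new));
    }

    public static void main(String[] args) {
        var counts = Map.of("marginalia", 3, "the", 40, "crawler", 5);
        var docFreq = Map.of("marginalia", 120L, "the", 999_000L, "crawler", 8_000L);
        System.out.println(selectTop(counts, docFreq));       // rare terms come first
        System.out.println(normalize(negatedTermScore(3, 120L)));
    }
}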
|
||||
|
@ -1,64 +0,0 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class LongNameCounter {
|
||||
private final KeywordExtractor keywordExtractor;
|
||||
private final TermFrequencyDict dict;
|
||||
private final double docCount;
|
||||
public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
|
||||
this.dict = dict;
|
||||
docCount = (double) dict.docCount();
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
}
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
HashMap<String, Double> counts = new HashMap<>(1000);
|
||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
|
||||
|
||||
for (int i = 0; i < dld.sentences.length; i++) {
|
||||
DocumentSentence sent = dld.sentences[i];
|
||||
var keywords = keywordExtractor.getNamesStrict(sent);
|
||||
for (var span : keywords) {
|
||||
var stemmed = sent.constructStemmedWordFromSpan(span);
|
||||
counts.merge(stemmed, 1., Double::sum);
|
||||
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
|
||||
}
|
||||
}
|
||||
|
||||
return counts.entrySet().stream().filter(e -> termSize(e.getKey()) > 1)
|
||||
.sorted(Comparator.comparing(this::getTermValue))
|
||||
.limit(Math.min(50, counts.size()/3))
|
||||
.map(Map.Entry::getKey)
|
||||
.flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
int termSize(String word) {
|
||||
return 1 + (int) word.chars().filter(c -> c == '_').count();
|
||||
}
|
||||
|
||||
|
||||
final Pattern separator = Pattern.compile("_");
|
||||
|
||||
public double getTermValue(Map.Entry<String, Double> e) {
|
||||
String[] parts = separator.split(e.getKey());
|
||||
double totalValue = 0.;
|
||||
for (String part : parts) {
|
||||
totalValue += value(part, e.getValue());
|
||||
}
|
||||
return totalValue / Math.sqrt(parts.length);
|
||||
}
|
||||
|
||||
double value(String key, double value) {
|
||||
return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -37,7 +37,8 @@ public class NameCounter {
|
||||
.sorted(Comparator.comparing(e -> -e.getValue()))
|
||||
.limit(150)
|
||||
.map(Map.Entry::getKey)
|
||||
.flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
|
||||
.flatMap(w -> instances.get(w).stream())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -125,11 +126,45 @@ public class SentenceExtractor {
|
||||
return counts;
|
||||
}
|
||||
|
||||
private static final Pattern dotPattern = Pattern.compile("\\.+$");
|
||||
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
||||
private static final Pattern spacesPattern = Pattern.compile("\\s+");
|
||||
|
||||
private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
||||
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
||||
|
||||
private boolean isBadChar(char c) {
|
||||
if (c >= 'a' && c <= 'z') return false;
|
||||
if (c >= 'A' && c <= 'Z') return false;
|
||||
if (c >= '0' && c <= '9') return false;
|
||||
if ("_#@.".indexOf(c) >= 0) return false;
|
||||
if (c >= '\u00C0' && c <= '\u00D6') return false;
|
||||
if (c >= '\u00D8' && c <= '\u00F6') return false;
|
||||
if (c >= '\u00F8' && c <= '\u00FF') return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
private String sanitizeString(String s) {
|
||||
char[] newChars = new char[s.length()];
|
||||
int pi = 0;
|
||||
|
||||
for (int i = 0; i < newChars.length; i++) {
|
||||
char c = s.charAt(i);
|
||||
if (!isBadChar(c)) {
|
||||
newChars[pi++] = c;
|
||||
}
|
||||
else {
|
||||
newChars[pi++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
s = new String(newChars, 0, pi);
|
||||
|
||||
if (s.startsWith(".")) {
|
||||
s = s.substring(1);
|
||||
if (s.isBlank())
|
||||
return "";
|
||||
}
|
||||
return s;
|
||||
|
||||
}
|
||||
|
||||
public DocumentSentence extractSentence(String text) {
|
||||
var wordsAndSeps = splitSegment(text);
|
||||
@ -139,7 +174,7 @@ public class SentenceExtractor {
|
||||
var lc = toLc(wordsAndSeps.words);
|
||||
|
||||
return new DocumentSentence(
|
||||
badCharPattern.matcher(text).replaceAll(" "), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||
);
|
||||
}
|
||||
|
||||
@ -161,7 +196,7 @@ public class SentenceExtractor {
|
||||
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
sentences = textNormalizedSpaces.split("[.]");
|
||||
sentences = StringUtils.split(textNormalizedSpaces, '.');
|
||||
}
|
||||
|
||||
if (sentences.length > 250) {
|
||||
@ -196,8 +231,8 @@ public class SentenceExtractor {
|
||||
separators[i] = Arrays.copyOf(separators[i], 250);
|
||||
}
|
||||
for (int j = 0; j < tokens[i].length; j++) {
|
||||
if (tokens[i][j].endsWith(".")) {
|
||||
tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
|
||||
while (tokens[i][j].endsWith(".")) {
|
||||
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -216,7 +251,7 @@ public class SentenceExtractor {
|
||||
|
||||
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
ret[i] = new DocumentSentence(badCharPattern.matcher(sentences[i]).replaceAll(" "), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -5,9 +5,7 @@ import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.util.language.processing.model.WordSpan;
|
||||
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SubjectCounter {
|
||||
@ -27,7 +25,9 @@ public class SubjectCounter {
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
|
||||
Map<WordRep, Integer> counts = new HashMap<>();
|
||||
Map<String, Integer> counts = new HashMap<>();
|
||||
Map<String, Set<WordRep>> instances = new HashMap<>();
|
||||
|
||||
for (var sentence : dld.sentences) {
|
||||
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
|
||||
if (kw.end + 2 >= sentence.length()) {
|
||||
@ -41,7 +41,13 @@ public class SubjectCounter {
|
||||
String nextNextTag = sentence.posTags[kw.end+1];
|
||||
|
||||
if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
|
||||
counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
|
||||
var span = new WordSpan(kw.start, kw.end);
|
||||
var rep = new WordRep(sentence, span);
|
||||
|
||||
String stemmed = rep.stemmed;
|
||||
|
||||
counts.merge(stemmed, -1, Integer::sum);
|
||||
instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -49,8 +55,8 @@ public class SubjectCounter {
|
||||
int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
|
||||
|
||||
return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
|
||||
.filter(e -> e.getValue()<-2 && e.getValue()<best*0.75)
|
||||
.map(Map.Entry::getKey)
|
||||
.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
|
||||
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,58 @@
|
||||
package nu.marginalia.util.language.processing.model;
|
||||
|
||||
import nu.marginalia.util.language.processing.KeywordCounter;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
public record KeywordMetadata(HashSet<String> titleKeywords,
|
||||
HashSet<String> subjectKeywords,
|
||||
HashSet<String> namesKeywords,
|
||||
HashMap<String, KeywordCounter.WordFrequencyData> wordsTfIdf,
|
||||
HashMap<String, Integer> positionMask,
|
||||
EnumSet<EdgePageWordFlags> flagsTemplate,
|
||||
int quality
|
||||
)
|
||||
{
|
||||
|
||||
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
||||
|
||||
public KeywordMetadata(double quality, EnumSet<EdgePageWordFlags> flags) {
|
||||
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
|
||||
new HashMap<>(15_000),
|
||||
new HashMap<>(10_000),
|
||||
flags,
|
||||
(int)(-quality));
|
||||
}
|
||||
|
||||
public KeywordMetadata(double quality) {
|
||||
this(quality, EnumSet.noneOf(EdgePageWordFlags.class));
|
||||
}
|
||||
|
||||
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
|
||||
|
||||
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
|
||||
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
|
||||
|
||||
if (subjectKeywords.contains(stemmed))
|
||||
flags.add(EdgePageWordFlags.Subjects);
|
||||
|
||||
if (namesKeywords.contains(stemmed))
|
||||
flags.add(EdgePageWordFlags.NamesWords);
|
||||
|
||||
if (titleKeywords.contains(stemmed))
|
||||
flags.add(EdgePageWordFlags.Title);
|
||||
|
||||
int positions = positionMask.getOrDefault(stemmed, 0);
|
||||
|
||||
return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode();
|
||||
}
|
||||
|
||||
public int quality() {
|
||||
return -quality;
|
||||
}
|
||||
|
||||
}
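
The encode() call above packs a word's statistics and flags into a single long so the index only needs one stored value per word. The actual field widths live in EdgePageWordMetadata and are not part of this diff; the sketch below only illustrates the general bit-packing idea with made-up widths and flag names.

import java.util.EnumSet;

class WordMetadataPackingSketch {
    enum Flag { TITLE, SUBJECT, NAMES }

    /** Illustrative layout only: low bits for flags, then 8 bits count,
     *  16 bits position mask, 8 bits tf-idf. Not the real EdgePageWordMetadata layout. */
    static long pack(int tfIdf, int positions, int count, EnumSet<Flag> flags) {
        long packed = 0;
        for (Flag f : flags) {
            packed |= 1L << f.ordinal();
        }
        packed |= (long) (count & 0xFF) << 8;
        packed |= (long) (positions & 0xFFFF) << 16;
        packed |= (long) (tfIdf & 0xFF) << 32;
        return packed;
    }

    static boolean hasFlag(long packed, Flag f) {
        return (packed & (1L << f.ordinal())) != 0;
    }

    public static void main(String[] args) {
        long m = pack(42, 0b1011, 3, EnumSet.of(Flag.TITLE));
        System.out.println(Long.toBinaryString(m) + " title=" + hasFlag(m, Flag.TITLE));
    }
}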
|
@ -1,21 +1,22 @@
|
||||
package nu.marginalia.util.language.processing.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
@AllArgsConstructor @EqualsAndHashCode @Getter
|
||||
@AllArgsConstructor @Getter
|
||||
public class WordRep implements Comparable<WordRep> {
|
||||
|
||||
public WordRep(DocumentSentence sent, WordSpan span) {
|
||||
word = sent.constructWordFromSpan(span);
|
||||
stemmed = sent.constructStemmedWordFromSpan(span);
|
||||
length = span.end - span.start;
|
||||
|
||||
hashCode = Objects.hash(word);
|
||||
}
|
||||
|
||||
public final int length;
|
||||
public final String word;
|
||||
public final String stemmed;
|
||||
@ -34,4 +35,12 @@ public class WordRep implements Comparable<WordRep> {
|
||||
public int hashCode() {
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) return true;
|
||||
if (other instanceof WordRep wr) {
|
||||
return Objects.equals(wr.word, word);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.util.multimap;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.btree.BTreeQueryBuffer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -100,8 +101,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
public MultimapSearcherBase createSearcher() {
|
||||
return new MultimapSearcherBase(this);
|
||||
}
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit);
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@ -340,6 +341,49 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
buffer.put(bufferOffset, vals, vals.position() + i, l);
|
||||
i+=l;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void swapn(int n, long idx1, long idx2) {
|
||||
for (int i = 0; i < n; i++)
|
||||
swap(idx1+i, idx2+i);
|
||||
}
|
||||
|
||||
private void swap(long idx1, long idx2) {
|
||||
LongBuffer buff1 = buffers.get((int)(idx1) / bufferSize);
|
||||
final int o1 = (int) (idx1) % bufferSize;
|
||||
|
||||
LongBuffer buff2 = buffers.get((int)(idx2) / bufferSize);
|
||||
final int o2 = (int) (idx2) % bufferSize;
|
||||
|
||||
long tmp = buff1.get(o1);
|
||||
buff1.put(o1, buff2.get(o2));
|
||||
buff2.put(o2, tmp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
if (n == 0) return;
|
||||
@ -410,6 +454,387 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||
if (fromIndex + n*step >= mappedSize)
|
||||
grow(fromIndex + n*step);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+step*n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid*step;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid*step;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high*step);
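// A negative return value encodes the offset of the last element below the key (taken from 'high' at loop exit), so callers can tell a miss from a hit while still recovering the nearby position.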
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
private boolean isSameBuffer(long a, long b) {
|
||||
return a / bufferSize == b/bufferSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long quickSortPartition(int wordSize, long low, long high) {
|
||||
if (high >= mappedSize)
|
||||
grow(high + wordSize - 1);
|
||||
|
||||
if (isSameBuffer(low, high + wordSize - 1)) {
|
||||
// Specialization that circumvents the need for expensive calls to
|
||||
// MultimapFileLong.get() in the most common scenario
|
||||
|
||||
return quickSortPartitionSameBuffer(wordSize, low, high);
|
||||
}
|
||||
else {
|
||||
return quickSortPartitionDifferentBuffers(wordSize, low, high);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSort(int wordSize, long start, int n) {
|
||||
if (start + n + wordSize - 1 >= mappedSize)
|
||||
grow(start + n + wordSize - 1);
|
||||
|
||||
if (n == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isSameBuffer(start, start + (long)n*wordSize-1L)) {
|
||||
final var buffer = buffers.get((int) (start / bufferSize));
|
||||
int off = (int) (start % bufferSize);
|
||||
|
||||
for (int i = 1; i < n; i++) {
|
||||
for (int j = i; j > 0; j--) {
|
||||
int a = off + wordSize*(j-1);
|
||||
int b = off + wordSize*j;
|
||||
|
||||
if (buffer.get(a) > buffer.get(b)) {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(a+w);
|
||||
buffer.put(a+w, buffer.get(b+w));
|
||||
buffer.put(b+w, tmp);
|
||||
}
|
||||
}
|
||||
else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else for (int i = 1; i < n; i++) {
|
||||
for (int j = i; j > 0; j--) {
|
||||
long a = start + (long)wordSize*(j-1);
|
||||
long b = start + (long)wordSize*j;
|
||||
|
||||
if (get(a) > get(b)) {
|
||||
swap(a, b);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) {
|
||||
|
||||
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
|
||||
long pivot = get(pivotPoint);
|
||||
|
||||
long i = low - wordSize;
|
||||
long j = high + wordSize;
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=wordSize;
|
||||
} while (get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=wordSize;
|
||||
}
|
||||
while (get(j) > pivot);
|
||||
|
||||
if (i >= j) return j;
|
||||
else swapn(wordSize, i, j);
|
||||
}
|
||||
}
|
||||
|
||||
private long quickSortPartitionSameBuffer(int wordSize, long low, long high) {
|
||||
|
||||
final var buffer = buffers.get((int) (low / bufferSize));
|
||||
|
||||
int pivotPoint = (int) ((low + high) / (2L*wordSize)) * wordSize % bufferSize;
|
||||
long pivot = buffer.get(pivotPoint);
|
||||
|
||||
int j = (int) (high) % bufferSize + wordSize;
|
||||
int i = (int) (low) % bufferSize - wordSize;
|
||||
|
||||
long j0 = high + wordSize - j;
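// i and j above are buffer-local offsets; j0 records the global offset of this buffer's first element, so the partition point can be handed back below as j0 + j, a file-global position.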
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=wordSize;
|
||||
} while (buffer.get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=wordSize;
|
||||
}
|
||||
while (buffer.get(j) > pivot);
|
||||
|
||||
if (i >= j) return j0 + j;
|
||||
else {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(i+w);
|
||||
buffer.put(i+w, buffer.get(j+w));
|
||||
buffer.put(j+w, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||
|
||||
final long end = searchStart + stepSize * numEntries;
|
||||
if (end < mappedSize) {
|
||||
grow(end);
|
||||
}
|
||||
|
||||
long bv = buffer.currentValue() & mask;
|
||||
long av = get(searchStart) & mask;
|
||||
long pos = searchStart;
|
||||
|
||||
int bi = (int)(searchStart / bufferSize);
|
||||
int bo = (int)(searchStart % bufferSize);
|
||||
|
||||
LongBuffer data = buffers.get(bi);
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += stepSize;
|
||||
if (pos < end) {
|
||||
bo += stepSize;
|
||||
if (bo >= bufferSize) {
|
||||
data = buffers.get(++bi);
|
||||
bo = 0;
|
||||
}
|
||||
av = data.get(bo) & mask;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||
|
||||
final long end = searchStart + stepSize * numEntries;
|
||||
if (end < mappedSize) {
|
||||
grow(end);
|
||||
}
|
||||
|
||||
long bv = buffer.currentValue() & mask;
|
||||
long av = get(searchStart) & mask;
|
||||
long pos = searchStart;
|
||||
|
||||
int bi = (int)(searchStart / bufferSize);
|
||||
int bo = (int)(searchStart % bufferSize);
|
||||
|
||||
LongBuffer data = buffers.get(bi);
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += stepSize;
|
||||
if (pos < end) {
|
||||
bo += stepSize;
|
||||
if (bo >= bufferSize) {
|
||||
data = buffers.get(++bi);
|
||||
bo = 0;
|
||||
}
|
||||
av = data.get(bo) & mask;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
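
The retain() and reject() methods above walk the caller's sorted query buffer and a sorted run of index entries in lock step, keeping or dropping query values depending on whether they occur in the run. Stripped of the buffer-window bookkeeping, the control flow is the classic two-pointer merge sketched below; RetainSketch and its plain arrays are illustrative only, not the BTreeQueryBuffer API.

import java.util.ArrayList;
import java.util.List;

class RetainSketch {
    /** Keep only the query values that also occur in the sorted data run. */
    static List<Long> retain(long[] sortedQuery, long[] sortedData) {
        List<Long> kept = new ArrayList<>();
        int qi = 0, di = 0;
        while (qi < sortedQuery.length && di < sortedData.length) {
            long qv = sortedQuery[qi];
            long dv = sortedData[di];
            if (qv < dv) {
                qi++;              // query value not present in the data: drop it
            } else if (qv == dv) {
                kept.add(qv);      // match: retain and advance the query side
                qi++;
            } else {
                di++;              // step the data cursor forward
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        System.out.println(retain(new long[] {1, 3, 5, 9}, new long[] {2, 3, 4, 5, 8})); // [3, 5]
    }
}

reject() is the same scan with the retain/drop decisions swapped.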
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
@ -424,6 +849,4 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -61,6 +61,17 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
||||
map.write(vals, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, int n, long idx) {
|
||||
map.write(vals, n,idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void swapn(int n, long idx1, long idx2) {
|
||||
map.swapn(n, idx1+off, idx2+off);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
|
||||
throws IOException {
|
||||
@ -75,4 +86,35 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
||||
|
||||
return new MultimapFileLongOffsetSlice(map, this.off + off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||
throw new UnsupportedOperationException();
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public long quickSortPartition(int wordSize, long low, long highInclusive) {
|
||||
return map.quickSortPartition(wordSize, low+off, highInclusive+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSort(int wordSize, long start, int n) {
|
||||
map.insertionSort(wordSize, start+off, n);
|
||||
}
|
||||
}
|
||||
|
@ -25,9 +25,23 @@ public interface MultimapFileLongSlice {
|
||||
|
||||
void write(LongBuffer vals, long idx);
|
||||
|
||||
void write(LongBuffer vals, int n, long idx);
|
||||
|
||||
void swapn(int n, long idx1, long idx2);
|
||||
|
||||
void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
|
||||
|
||||
default MultimapFileLongSlice atOffset(long off) {
|
||||
return new MultimapFileLongOffsetSlice(this, off);
|
||||
}
|
||||
long binarySearchInternal(long key, long fromIndex, int step, long n, long mask);
|
||||
long binarySearchInternal(long key, long fromIndex, long n, long mask);
|
||||
|
||||
long binarySearchInternal(long key, long fromIndex, long n);
|
||||
|
||||
long binarySearchUpperInternal(long key, long fromIndex, long n);
|
||||
|
||||
long quickSortPartition(int wordSize, long low, long highInclusive);
|
||||
|
||||
void insertionSort(int wordSize, long start, int n);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
public interface MultimapSearcher {
|
||||
long binarySearchUpper(long key, long fromIndex, long n);
|
||||
long binarySearchLower(long key, long fromIndex, long n);
|
||||
long binarySearch(long key, long fromIndex, long n);
|
||||
|
||||
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
|
||||
@ -25,8 +25,8 @@ class SimpleMultimapSearcher implements MultimapSearcher {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, n);
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return base.binarySearchLower(key, fromIndex, n);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -46,8 +46,8 @@ class MaskedMultimapSearcher implements MultimapSearcher {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, n, mask);
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return base.binarySearchLower(key, fromIndex, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -69,8 +69,8 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, step, n, mask);
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return base.binarySearchLower(key, fromIndex, step, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -29,26 +29,12 @@ public class MultimapSearcherBase {
|
||||
return false;
|
||||
}
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return fromIndex + low;
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return mmf.binarySearchUpperInternal(key, fromIndex, n);
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
|
||||
public long binarySearchLower(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
@ -67,7 +53,7 @@ public class MultimapSearcherBase {
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
|
||||
public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
@ -82,62 +68,19 @@ public class MultimapSearcherBase {
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return fromIndex + low;
|
||||
return fromIndex + low*step;
|
||||
}
|
||||
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return -1;
|
||||
return mmf.binarySearchInternal(key, fromIndex, n);
|
||||
}
|
||||
|
||||
|
||||
public long binarySearch(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
return mmf.binarySearchInternal(key, fromIndex, n, mask);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return -1;
|
||||
return mmf.binarySearchInternal(key, fromIndex, step, n, mask);
|
||||
}
|
||||
}
|
||||
|
@ -1,56 +1,85 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
|
||||
|
||||
public class MultimapSorter {
|
||||
private final Path tmpFileDir;
|
||||
private final int internalSortLimit;
|
||||
private final MultimapFileLongSlice multimapFileLong;
|
||||
private final long[] buffer;
|
||||
private final LongBuffer buffer;
|
||||
private final int internalSortLimit;
|
||||
private final int wordSize;
|
||||
|
||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
|
||||
private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class);
|
||||
|
||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) {
|
||||
this.multimapFileLong = multimapFileLong;
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.internalSortLimit = internalSortLimit;
|
||||
buffer = new long[internalSortLimit];
|
||||
this.wordSize = wordSize;
|
||||
buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer();
|
||||
}
|
||||
|
||||
public void sort(long start, int length) throws IOException {
|
||||
if (length <= internalSortLimit) {
|
||||
multimapFileLong.read(buffer, length, start);
|
||||
Arrays.sort(buffer, 0, length);
|
||||
multimapFileLong.write(buffer, length, start);
|
||||
public void sortRange(long start, long end) throws IOException {
|
||||
if (end - start < internalSortLimit) {
|
||||
quickSortLH(start, end - wordSize);
|
||||
}
|
||||
else {
|
||||
externalSort(start, length);
|
||||
mergeSort(start, (int) (end - start));
|
||||
}
|
||||
|
||||
for (long lp = start + wordSize; lp < end; lp += wordSize) {
|
||||
if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) {
|
||||
|
||||
logger.error("Sort contract breached [{}:{} ({}), ws={}, <isl={}, bc={}]",
|
||||
start, end,
|
||||
end - start,
|
||||
wordSize, end - start < internalSortLimit,
|
||||
buffer.capacity());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void mergeSort(long start, int lengthLongs) throws IOException {
|
||||
if (lengthLongs == 1)
|
||||
return;
|
||||
|
||||
private void externalSort(long start, int length) throws IOException {
|
||||
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+length), ".dat");
|
||||
|
||||
if (lengthLongs < buffer.capacity()) {
|
||||
mergeSort(start, lengthLongs, buffer);
|
||||
}
|
||||
else {
|
||||
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+lengthLongs), ".dat");
|
||||
try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
|
||||
var workBuffer =
|
||||
channel.map(FileChannel.MapMode.READ_WRITE, 0, length * WORD_SIZE)
|
||||
channel.map(FileChannel.MapMode.READ_WRITE, 0, wordSize * lengthLongs * WORD_SIZE)
|
||||
.asLongBuffer();
|
||||
|
||||
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(internalSortLimit));
|
||||
mergeSort(start, lengthLongs, workBuffer);
|
||||
}
|
||||
finally {
|
||||
tmpFile.toFile().delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
private void mergeSort(long start, int length, LongBuffer workBuffer) throws IOException {
|
||||
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(buffer.capacity()));
|
||||
|
||||
// Do in-memory sorting up until internalSortLimit first
|
||||
for (int i = 0; i < length; i += width) {
|
||||
sort(start + i, Math.min(width, length-i));
|
||||
quickSort(start + i, Math.min(width, length-i));
|
||||
}
|
||||
|
||||
// Then merge sort on disk for the rest
|
||||
// Then finish with merge sort
|
||||
for (; width < length; width*=2) {
|
||||
|
||||
for (int i = 0; i < length; i += 2*width) {
|
||||
@ -58,30 +87,61 @@ public class MultimapSorter {
|
||||
}
|
||||
|
||||
workBuffer.clear();
|
||||
multimapFileLong.write(workBuffer, start);
|
||||
multimapFileLong.write(workBuffer, length, start);
|
||||
}
|
||||
|
||||
}
|
||||
finally {
|
||||
tmpFile.toFile().delete();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
|
||||
int i = left;
|
||||
int j = right;
|
||||
long idxL = left;
|
||||
long idxR = right;
|
||||
|
||||
for (int k = left; k < end; k++) {
|
||||
final long bufferI = multimapFileLong.get(offset+i);
|
||||
final long bufferJ = multimapFileLong.get(offset+j);
|
||||
for (int putPos = left; putPos < end; putPos+= wordSize) {
|
||||
final long bufferL = multimapFileLong.get(offset+idxL);
|
||||
final long bufferR = multimapFileLong.get(offset+idxR);
|
||||
|
||||
if (i < right && (j >= end || bufferI < bufferJ)) {
|
||||
workBuffer.put(k, bufferI);
|
||||
i++;
|
||||
if (idxL < right && (idxR >= end || bufferL < bufferR)) {
|
||||
workBuffer.put(putPos, bufferL);
|
||||
for (int s = 1; s < wordSize; s++) {
|
||||
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s));
|
||||
}
|
||||
idxL+= wordSize;
|
||||
}
|
||||
else {
|
||||
workBuffer.put(k, bufferJ);
|
||||
j++;
|
||||
workBuffer.put(putPos, bufferR);
|
||||
for (int s = 1; s < wordSize; s++) {
|
||||
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s));
|
||||
}
|
||||
idxR+= wordSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void insertionSort(long start, int n) {
|
||||
multimapFileLong.insertionSort(wordSize, start, n);
|
||||
}
|
||||
|
||||
private void swap(long a, long b) {
|
||||
multimapFileLong.swapn(wordSize, a, b);
|
||||
}
|
||||
|
||||
public void quickSort(long start, long length) {
|
||||
quickSortLH(start, start + length - wordSize);
|
||||
|
||||
}
|
||||
public void quickSortLH(long low, long highInclusive) {
|
||||
|
||||
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
||||
|
||||
if (highInclusive - low < 32) {
|
||||
multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize));
|
||||
}
|
||||
else {
|
||||
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
||||
|
||||
quickSortLH(low, p);
|
||||
quickSortLH(p + wordSize, highInclusive);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -11,27 +11,16 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public class UpdateDomainRanksTool2 {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
|
||||
|
||||
public Set<String> originDomains = new HashSet<>();
|
||||
public Set<Integer> originDomainIds = new HashSet<>();
|
||||
public final long domainIdMax = -1;
|
||||
public int domainCount;
|
||||
private volatile static int rankMax;
|
||||
|
||||
public int maxId() {
|
||||
return (int) domainIdMax;
|
||||
}
|
||||
public int domainCount() {
|
||||
return domainCount;
|
||||
}
|
||||
|
||||
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
volatile static boolean running = true;
|
||||
|
||||
@ -44,23 +33,14 @@ public class UpdateDomainRanksTool2 {
|
||||
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
|
||||
|
||||
logger.info("Ranking");
|
||||
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
|
||||
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
||||
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||
|
||||
var rankVector = rpr.pageRankVector();
|
||||
var norm = rankVector.norm();
|
||||
rankMax = rpr.size();
|
||||
uploader.start();
|
||||
|
||||
|
||||
rankMax = rpr.size();
|
||||
|
||||
|
||||
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
|
@ -0,0 +1,298 @@
|
||||
package nu.marginalia.util.tool;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.AndCardIntSet;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static nu.marginalia.util.AndCardIntSet.*;
|
||||
|
||||
public class EdgeDomainLinkConsineSimilarityMain {
|
||||
ArrayList<Integer> idsList = new ArrayList<>(100_000);
|
||||
ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000);
|
||||
TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000);
|
||||
TIntIntHashMap aliasMap = new TIntIntHashMap(100_000, 0.75f, -1, -1);
|
||||
TIntHashSet indexed = new TIntHashSet(100_000);
|
||||
|
||||
float weights[];
|
||||
|
||||
private HikariDataSource dataSource;
|
||||
|
||||
public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000);
|
||||
try (
|
||||
var conn = dataSource.getConnection();
|
||||
var aliasStmt = conn.prepareStatement("SELECT ID, DOMAIN_ALIAS FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NOT NULL");
|
||||
var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0");
|
||||
var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||
ResultSet rsp;
|
||||
|
||||
aliasStmt.setFetchSize(10_000);
|
||||
rsp = aliasStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
aliasMap.put(rsp.getInt(1), rsp.getInt(2));
|
||||
}
|
||||
|
||||
indexedStmt.setFetchSize(10_000);
|
||||
rsp = indexedStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
indexed.add(rsp.getInt(1));
|
||||
}
|
||||
|
||||
|
||||
linksStmt.setFetchSize(10_000);
|
||||
rsp = linksStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int source = deAlias(rsp.getInt(1));
|
||||
int dest = deAlias(rsp.getInt(2));
|
||||
|
||||
tmpMap.computeIfAbsent(dest, this::createBitmapWithSelf).add(source);
|
||||
}
|
||||
}
|
||||
|
||||
tmpMap.entrySet().stream()
|
||||
.filter(e -> isEligible(e.getValue()))
|
||||
.forEach(e -> {
|
||||
var val = of(e.getValue());
|
||||
idsList.add(e.getKey());
|
||||
itemsList.add(val);
|
||||
dToSMap.put(e.getKey(), val);
|
||||
});
|
||||
weights = new float[1 + idsList.stream().mapToInt(i -> i).max().orElse(0)];
|
||||
for (int i = 0; i < idsList.size(); i++) {
|
||||
weights[idsList.get(i)] = getWeight(idsList.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isEligible(RoaringBitmap value) {
|
||||
int cardinality = value.getCardinality();
|
||||
|
||||
return cardinality < 10000;
|
||||
}
|
||||
|
||||
private int deAlias(int id) {
|
||||
int val = aliasMap.get(id);
|
||||
if (val < 0)
|
||||
return id;
|
||||
return val;
|
||||
}
|
||||
|
||||
LinkedBlockingDeque<DomainSimilarities> similaritiesLinkedBlockingDeque = new LinkedBlockingDeque<>(10);
|
||||
volatile boolean running;
|
||||
|
||||
@SneakyThrows
|
||||
public void tryDomains(String... domainName) {
|
||||
var dataStoreDao = new EdgeDataStoreDaoImpl(dataSource);
|
||||
|
||||
System.out.println(Arrays.toString(domainName));
|
||||
|
||||
int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new)
|
||||
.map(dataStoreDao::getDomainId)
|
||||
.mapToInt(EdgeId::id)
|
||||
.map(this::deAlias)
|
||||
.toArray();
|
||||
|
||||
for (int domainId : domainIds) {
|
||||
findAdjacentDtoS(domainId, similarities -> {
|
||||
for (var similarity : similarities.similarities()) {
|
||||
if (indexed.contains(similarity.domainId)) System.out.print("*");
|
||||
System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private String prettyPercent(double val) {
|
||||
return String.format("%2.2f%%", 100. * val);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadAll() {
|
||||
running = true;
|
||||
var thread = new Thread(this::insertThreadRun);
|
||||
thread.start();
|
||||
idsList.parallelStream()
|
||||
.filter(id -> !aliasMap.containsKey(id))
|
||||
.forEach(id -> findAdjacent(id, this::addToQueue));
|
||||
running = false;
|
||||
thread.join();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
void addToQueue(DomainSimilarities similarities) {
|
||||
similaritiesLinkedBlockingDeque.putLast(similarities);
|
||||
}
|
||||
|
||||
public void insertThreadRun() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement(
|
||||
"""
|
||||
INSERT INTO EC_DOMAIN_NEIGHBORS_2
|
||||
(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
|
||||
VALUES (?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS))
|
||||
""")
|
||||
) {
|
||||
while (running || !similaritiesLinkedBlockingDeque.isEmpty()) {
|
||||
var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS);
|
||||
if (item == null) continue;
|
||||
|
||||
for (var similarity : item.similarities) {
|
||||
stmt.setInt(1, item.domainId);
|
||||
stmt.setInt(2, similarity.domainId);
|
||||
stmt.setDouble(3, similarity.value);
|
||||
stmt.addBatch();
|
||||
}
|
||||
stmt.executeBatch();
|
||||
}
|
||||
} catch (SQLException | InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public RoaringBitmap createBitmapWithSelf(int val) {
|
||||
var bm = new RoaringBitmap();
|
||||
bm.add(val);
|
||||
return bm;
|
||||
}
|
||||
|
||||
public void findAdjacent(int domainId, Consumer<DomainSimilarities> andThen) {
|
||||
findAdjacentDtoS(domainId, andThen);
|
||||
}
|
||||
|
||||
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
double andCardinality = andCardinality(a, b);
andCardinality /= Math.sqrt(a.getCardinality());
andCardinality /= Math.sqrt(b.getCardinality());
return andCardinality;
}

double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
}
|
||||
|
||||
float getWeight(int i) {
|
||||
var vector = dToSMap.get(i);
|
||||
|
||||
if (vector == null) return 1.0f;
|
||||
return 1.0f / (float) Math.log(2+vector.getCardinality());
|
||||
}
|
||||
|
||||
record DomainSimilarities(int domainId, List<DomainSimilarity> similarities) {};
|
||||
record DomainSimilarity(int domainId, double value) {};
|
||||
|
||||
@SneakyThrows
|
||||
private void findAdjacentDtoS(int domainId, Consumer<DomainSimilarities> andThen) {
|
||||
var vector = dToSMap.get(domainId);
|
||||
if (vector == null || !vector.cardinalityExceeds(10)) {
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.println("DtoS " + domainId);
|
||||
|
||||
List<DomainSimilarity> similarities = new ArrayList<>(1000);
|
||||
|
||||
/** The minimum cardinality a vector can have so that
 *
 *   a (x) b
 *   -------  < k     is given by k^2
 *   |a||b|
 *
 */
int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality()));
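// In other words (with k = 0.1, the similarity cutoff used further down): the intersection
// a (x) b is at most |b|^2, so the similarity is bounded by |b|/|a|, and reaching the cutoff
// requires card(b) >= k^2 * card(a) = 0.01 * card(a); smaller candidate vectors are skipped.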
|
||||
|
||||
for (int i = 0; i < itemsList.size(); i++) {
|
||||
|
||||
int id = idsList.get(i);
|
||||
if (id == domainId)
|
||||
continue;
|
||||
|
||||
var otherVec = itemsList.get(i);
|
||||
if (otherVec.getCardinality() < cardMin)
|
||||
continue;
|
||||
|
||||
double similarity = cosineSimilarity(vector, otherVec);
|
||||
if (similarity > 0.1) {
|
||||
var recalculated = expensiveCosineSimilarity(vector, otherVec);
|
||||
if (recalculated > 0.1) {
|
||||
similarities.add(new DomainSimilarity(id, recalculated));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (similarities.size() > 128) {
|
||||
similarities.sort(Comparator.comparing(DomainSimilarity::value));
|
||||
similarities.subList(0, similarities.size() - 128).clear();
|
||||
}
|
||||
|
||||
|
||||
andThen.accept(new DomainSimilarities(domainId, similarities));
|
||||
}
|
||||
|
||||
|
||||
// @SneakyThrows
|
||||
// private void findAdjacentDtoS(Consumer<DomainSimilarities> andThen, int... domainIds) {
|
||||
// var vectors = Arrays.stream(domainIds).mapToObj(dToSMap::get)
|
||||
// .filter(Objects::nonNull)
|
||||
// .filter(vec -> vec.cardinalityExceeds(10))
|
||||
// .toArray(AndCardIntSet[]::new);
|
||||
// Set<Integer> domainIdsSet = new HashSet<>(Arrays.stream(domainIds).boxed().toList());
|
||||
//
|
||||
// if (vectors.length != domainIds.length)
|
||||
// return;
|
||||
//
|
||||
// List<DomainSimilarity> similarities = dToSMap.entrySet().parallelStream()
|
||||
// .filter(e -> !domainIdsSet.contains(e.getKey()) && indexed.contains(e.getKey()))
|
||||
// .flatMap(entry -> {
|
||||
//
|
||||
// double similarity = 0.;
|
||||
// for (var vector : vectors) {
|
||||
// similarity += cosineSimilarity(vector, entry.getValue());
|
||||
// }
|
||||
//
|
||||
// if (similarity > 0.1 * vectors.length) {
|
||||
// double recalculated = 0;
|
||||
// for (var vector : vectors) {
|
||||
// recalculated += expensiveCosineSimilarity(vector, entry.getValue());
|
||||
// }
|
||||
// if (recalculated > 0.1 * vectors.length) {
|
||||
// return Stream.of(new DomainSimilarity(entry.getKey(), recalculated));
|
||||
// }
|
||||
// }
|
||||
// return Stream.empty();
|
||||
// }).sorted(Comparator.comparing(DomainSimilarity::value))
|
||||
// .toList();
|
||||
//
|
||||
// andThen.accept(new DomainSimilarities(domainIds[0], similarities));
|
||||
// }
|
||||
|
||||
|
||||
    public static void main(String[] args) throws SQLException {
        DatabaseModule dm = new DatabaseModule();

        var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection());
        if (args.length == 0) {
            main.loadAll();
        }
        else {
            main.tryDomains(args);
        }
    }

}
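The similarity pass above is a prefilter-then-confirm scheme: a cheap unweighted cosine estimate gates the more expensive weighted recomputation. Below is a minimal stand-alone sketch of that flow using plain java.util sets; the class and method names are illustrative, and the weight function stands in for getWeight()'s 1/log(2 + cardinality) weighting, so this is not the project's AndCardIntSet API.

import java.util.Set;
import java.util.function.IntToDoubleFunction;

class SimilaritySketch {
    // Cheap, unweighted cosine similarity over integer sets.
    static double cosine(Set<Integer> a, Set<Integer> b) {
        long shared = a.stream().filter(b::contains).count();
        return shared / Math.sqrt((double) a.size() * b.size());
    }

    // Weighted variant in the spirit of expensiveCosineSimilarity(): rarer items weigh more.
    static double weightedCosine(Set<Integer> a, Set<Integer> b, IntToDoubleFunction w) {
        double shared = a.stream().filter(b::contains).mapToDouble(w::applyAsDouble).sum();
        double na = a.stream().mapToDouble(w::applyAsDouble).sum();
        double nb = b.stream().mapToDouble(w::applyAsDouble).sum();
        return shared / Math.sqrt(na * nb);
    }

    // Only pay for the weighted computation when the cheap estimate clears the bar,
    // mirroring the similarity > 0.1 / recalculated > 0.1 checks in findAdjacentDtoS().
    static boolean similar(Set<Integer> a, Set<Integer> b, IntToDoubleFunction w) {
        return cosine(a, b) > 0.1 && weightedCosine(a, b, w) > 0.1;
    }
}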
|
@ -2,8 +2,14 @@ package nu.marginalia.wmsa.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
public class ApiSearchResult {
|
||||
public String url;
|
||||
@ -11,10 +17,30 @@ public class ApiSearchResult {
|
||||
public String description;
|
||||
public double quality;
|
||||
|
||||
public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||
|
||||
public ApiSearchResult(EdgeUrlDetails url) {
|
||||
this.url = url.url.toString();
|
||||
this.title = url.getTitle();
|
||||
this.description = url.getDescription();
|
||||
this.quality = url.getTermScore();
|
||||
|
||||
if (url.resultItem != null) {
|
||||
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));
|
||||
|
||||
outer:
|
||||
for (var entries : bySet.values()) {
|
||||
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||
for (var entry : entries) {
|
||||
var metadata = entry.metadata();
|
||||
if (metadata.isEmpty())
|
||||
continue outer;
|
||||
|
||||
Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(),metadata.count(), flags));
|
||||
}
|
||||
details.add(lst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,16 @@
|
||||
package nu.marginalia.wmsa.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
public class ApiSearchResultQueryDetails {
|
||||
|
||||
String keyword;
|
||||
int tfIdf;
|
||||
int count;
|
||||
|
||||
Set<String> flagsUnstableAPI;
|
||||
}
|
@ -5,6 +5,7 @@ import nu.marginalia.wmsa.auth.AuthMain;
|
||||
import nu.marginalia.wmsa.configuration.command.*;
|
||||
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
|
||||
import nu.marginalia.wmsa.edge.dating.DatingMain;
|
||||
import nu.marginalia.wmsa.edge.explorer.ExplorerMain;
|
||||
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
|
||||
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
|
||||
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
|
||||
@ -37,6 +38,7 @@ public enum ServiceDescriptor {
|
||||
ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),
|
||||
|
||||
DATING("dating", 5070, DatingMain.class),
|
||||
EXPLORER("explorer", 5071, ExplorerMain.class),
|
||||
|
||||
TEST_1("test-1", 0, null),
|
||||
TEST_2("test-2", 0, null);
|
||||
@ -77,7 +79,8 @@ public enum ServiceDescriptor {
|
||||
|
||||
public static void main(String... args) {
|
||||
MainMapLookup.setMainArguments(args);
|
||||
Map<String, Command> functions = Stream.of(new ListCommand(),
|
||||
Map<String, Command> functions = Stream.of(
|
||||
new ListCommand(),
|
||||
new StartCommand(),
|
||||
new ConvertCommand(),
|
||||
new CrawlCommand(),
|
||||
|
@ -12,6 +12,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
@ -85,6 +86,12 @@ public class ScreenshotService {
|
||||
}
|
||||
|
||||
private Object serveSvgPlaceholder(Response response, int id) {
|
||||
|
||||
var domainName = edgeDataStoreDao.getDomain(new EdgeId<>(id)).map(Object::toString);
|
||||
if (domainName.isEmpty()) {
|
||||
Spark.halt(404);
|
||||
}
|
||||
|
||||
response.type("image/svg+xml");
|
||||
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
|
||||
"<svg\n" +
|
||||
@ -111,6 +118,6 @@ public class ScreenshotService {
|
||||
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
|
||||
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
|
||||
" </g>\n" +
|
||||
"</svg>\n", edgeDataStoreDao.getDomain(new EdgeId<>(id)));
|
||||
"</svg>\n", domainName.get());
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import com.github.luben.zstd.ZstdOutputStream;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
|
||||
public class ConversionLog implements AutoCloseable, Interpreter {
|
||||
|
||||
|
||||
|
||||
private final PrintWriter writer;
|
||||
|
||||
public ConversionLog(Path rootDir) throws IOException {
|
||||
String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC));
|
||||
Path logFile = rootDir.resolve(fileName);
|
||||
|
||||
writer = new PrintWriter(new ZstdOutputStream(
|
||||
new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE))));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadUrl(EdgeUrl[] url) {}
|
||||
|
||||
@Override
|
||||
public void loadDomain(EdgeDomain[] domain) {}
|
||||
|
||||
@Override
|
||||
public void loadRssFeed(EdgeUrl[] rssFeed) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainLink(DomainLink[] links) {}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
|
||||
|
||||
@Override
|
||||
public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
|
||||
writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainRedirect(DomainLink link) {}
|
||||
}
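As the ConverterMain and LoadInstructionWriter changes further down wire it up, the log is opened once per conversion run and every compiled instruction is applied to it; since all the other Interpreter methods above are no-ops, only documents that failed conversion produce output. A hedged usage sketch (the directory and instruction list are placeholders):

void logRun(Path processDir, List<Instruction> instructions) throws Exception {
    try (ConversionLog log = new ConversionLog(processDir)) {
        for (Instruction instruction : instructions) {
            instruction.apply(log);  // only loadProcessedDocumentWithError() writes a line
        }
    }
}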
|
@ -54,5 +54,4 @@ public class ConvertedDomainReader {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,9 +5,9 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.util.ParallelPipe;
|
||||
import nu.marginalia.wmsa.edge.converting.compiler.InstructionsCompiler;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
@ -47,11 +47,15 @@ public class ConverterMain {
|
||||
Gson gson
|
||||
) throws Exception {
|
||||
|
||||
instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);
|
||||
;
|
||||
|
||||
|
||||
|
||||
logger.info("Starting pipe");
|
||||
|
||||
try (WorkLog processLog = plan.createProcessWorkLog()) {
|
||||
try (WorkLog processLog = plan.createProcessWorkLog();
|
||||
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
||||
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
||||
|
||||
@Override
|
||||
|
@ -24,10 +24,13 @@ import java.util.List;
|
||||
|
||||
public class LoadInstructionWriter {
|
||||
|
||||
private ConversionLog log;
|
||||
private final Path outputDir;
|
||||
private final Gson gson;
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
|
||||
public LoadInstructionWriter(Path outputDir, Gson gson) {
|
||||
|
||||
public LoadInstructionWriter(ConversionLog log, Path outputDir, Gson gson) {
|
||||
this.log = log;
|
||||
this.outputDir = outputDir;
|
||||
this.gson = gson;
|
||||
|
||||
@ -35,6 +38,7 @@ public class LoadInstructionWriter {
|
||||
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
||||
}
|
||||
}
|
||||
|
||||
public String accept(String id, List<Instruction> instructionList) throws IOException {
|
||||
Path outputFile = getOutputFile(id);
|
||||
|
||||
@ -48,6 +52,8 @@ public class LoadInstructionWriter {
|
||||
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
|
||||
|
||||
for (var instr : instructionList) {
|
||||
instr.apply(log);
|
||||
|
||||
outputStream.append(instr.tag().name());
|
||||
outputStream.append(' ');
|
||||
gson.toJson(instr, outputStream);
|
||||
@ -66,6 +72,7 @@ public class LoadInstructionWriter {
|
||||
if (!Files.exists(destDir)) {
|
||||
Files.createDirectories(destDir);
|
||||
}
|
||||
|
||||
return destDir.resolve(id + ".pzstd");
|
||||
}
|
||||
|
||||
|
@ -70,7 +70,11 @@ public class ReindexTriggerMain {
|
||||
};
|
||||
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
|
||||
|
||||
if (!Boolean.getBoolean("no-preconvert")) {
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
|
||||
}
|
||||
|
||||
for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
|
||||
}
|
||||
|
@ -0,0 +1,58 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DocumentsCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
for (var doc : documents) {
|
||||
compileDocumentDetails(ret, doc);
|
||||
}
|
||||
|
||||
for (var doc : documents) {
|
||||
compileWords(ret, doc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var details = doc.details;
|
||||
|
||||
if (details != null) {
|
||||
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
|
||||
}
|
||||
else {
|
||||
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason));
|
||||
}
|
||||
}
|
||||
|
||||
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var words = doc.words;
|
||||
|
||||
if (words != null) {
|
||||
|
||||
var wordsArray = words.values().stream()
|
||||
.filter(this::filterNonTransients)
|
||||
.map(DocumentKeywords::new)
|
||||
.toArray(DocumentKeywords[]::new);
|
||||
|
||||
ret.add(new LoadKeywords(doc.url, wordsArray));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean filterNonTransients(EdgePageWords words) {
|
||||
return words.block.type != IndexBlockType.TRANSIENT;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadRssFeed;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class FeedsCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.feedLinks.stream())
|
||||
.distinct()
|
||||
.toArray(EdgeUrl[]::new);
|
||||
|
||||
ret.add(new LoadRssFeed(feeds));
|
||||
}
|
||||
}
|
@ -0,0 +1,57 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class InstructionsCompiler {
|
||||
private final UrlsCompiler urlsCompiler;
|
||||
private final DocumentsCompiler documentsCompiler;
|
||||
private final FeedsCompiler feedsCompiler;
|
||||
private final LinksCompiler linksCompiler;
|
||||
private final RedirectCompiler redirectCompiler;
|
||||
|
||||
@Inject
|
||||
public InstructionsCompiler(UrlsCompiler urlsCompiler,
|
||||
DocumentsCompiler documentsCompiler,
|
||||
FeedsCompiler feedsCompiler,
|
||||
LinksCompiler linksCompiler,
|
||||
RedirectCompiler redirectCompiler)
|
||||
{
|
||||
this.urlsCompiler = urlsCompiler;
|
||||
this.documentsCompiler = documentsCompiler;
|
||||
this.feedsCompiler = feedsCompiler;
|
||||
this.linksCompiler = linksCompiler;
|
||||
this.redirectCompiler = redirectCompiler;
|
||||
}
|
||||
|
||||
public List<Instruction> compile(ProcessedDomain domain) {
|
||||
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||
|
||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||
|
||||
if (domain.documents != null) {
|
||||
urlsCompiler.compile(ret, domain.documents);
|
||||
documentsCompiler.compile(ret, domain.documents);
|
||||
|
||||
feedsCompiler.compile(ret, domain.documents);
|
||||
|
||||
linksCompiler.compile(ret, domain.domain, domain.documents);
|
||||
}
|
||||
if (domain.redirect != null) {
|
||||
redirectCompiler.compile(ret, domain.domain, domain.redirect);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
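Seen together with ConverterMain and LoadInstructionWriter elsewhere in this change, the compiler's output is simply replayed against any Interpreter. A rough sketch of that hand-off (the id argument and the injected instances are placeholders, not values from this changeset):

String writeDomain(InstructionsCompiler compiler, LoadInstructionWriter writer,
                   ProcessedDomain domain) throws IOException {
    List<Instruction> instructions = compiler.compile(domain);
    return writer.accept(domain.domain.toString(), instructions);  // serializes the instructions and applies each to the conversion log
}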
|
@ -0,0 +1,26 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class LinksCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
|
||||
|
||||
DomainLink[] links = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.linksExternal.stream())
|
||||
.map(link -> link.domain)
|
||||
.distinct()
|
||||
.map(domain -> new DomainLink(from, domain))
|
||||
.toArray(DomainLink[]::new);
|
||||
|
||||
ret.add(new LoadDomainLink(links));
|
||||
}
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainRedirect;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class RedirectCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
|
||||
ret.add(new LoadDomain(to));
|
||||
ret.add(new LoadDomainLink(new DomainLink(from, to)));
|
||||
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
|
||||
}
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadUrl;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class UrlsCompiler {
|
||||
|
||||
private static final int MAX_INTERNAL_LINKS = 25;
|
||||
|
||||
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||
|
||||
for (var doc : documents) {
|
||||
seenUrls.add(doc.url);
|
||||
|
||||
if (doc.details != null) {
|
||||
|
||||
for (var url : doc.details.linksExternal) {
|
||||
if (seenDomains.add(url.domain)) {
|
||||
seenUrls.add(url);
|
||||
}
|
||||
}
|
||||
|
||||
if (doc.isOk()) {
|
||||
// Don't load more than a few from linksInternal; it grows too big for no reason
|
||||
var linksToAdd = new ArrayList<>(doc.details.linksInternal);
|
||||
if (linksToAdd.size() > MAX_INTERNAL_LINKS) {
|
||||
linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear();
|
||||
}
|
||||
seenUrls.addAll(linksToAdd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
|
||||
}
|
||||
|
||||
}
|
@ -1,17 +1,47 @@
|
||||
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public record DocumentKeywords(IndexBlock block, String... keywords) {
|
||||
public record DocumentKeywords(IndexBlock block,
|
||||
String[] keywords,
|
||||
long[] metadata) {
|
||||
|
||||
public DocumentKeywords(EdgePageWords words) {
|
||||
this(words.block, words.words.toArray(String[]::new));
|
||||
this(words.block,
|
||||
words.words.toArray(String[]::new),
|
||||
words.metadata.toArray());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName()+"["+block +", "+Arrays.toString(keywords)+"]";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(getClass().getSimpleName());
|
||||
sb.append('[').append(block).append(", ");
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
sb.append("\n\t ");
|
||||
if (metadata[i] != 0) {
|
||||
sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i]));
|
||||
}
|
||||
else {
|
||||
sb.append(keywords[i]);
|
||||
}
|
||||
}
|
||||
return sb.append("\n]").toString();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return keywords.length == 0;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return keywords.length;
|
||||
}
|
||||
|
||||
public DocumentKeywords subList(int start, int end) {
|
||||
return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
|
||||
}
|
||||
}
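DocumentKeywords now carries keywords and their packed metadata as two parallel arrays that are expected to be the same length; subList() slices both in lockstep. A small illustration with made-up values (IndexBlock.Title is just an example constant):

DocumentKeywords kw = new DocumentKeywords(IndexBlock.Title,
        new String[] { "marginalia", "search" },
        new long[]   { 0L, 0L });

assert kw.keywords().length == kw.metadata().length;

DocumentKeywords head = kw.subList(0, 1);  // keeps "marginalia" and its metadata entry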
|
||||
|
@ -8,7 +8,8 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||
|
||||
|
||||
public record LoadProcessedDocumentWithError(EdgeUrl url,
|
||||
EdgeUrlState state) implements Instruction
|
||||
EdgeUrlState state,
|
||||
String reason) implements Instruction
|
||||
{
|
||||
@Override
|
||||
public void apply(Interpreter interpreter) {
|
||||
|
@ -25,34 +25,13 @@ public class SqlLoadUrls {
|
||||
@Inject
|
||||
public SqlLoadUrls(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.createStatement()) {
|
||||
stmt.execute("DROP PROCEDURE IF EXISTS INSERT_URL");
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INSERT_URL (
|
||||
IN PROTO VARCHAR(255),
|
||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN PORT INT,
|
||||
IN PATH VARCHAR(255),
|
||||
IN PARAM VARCHAR(255),
|
||||
IN PATH_HASH BIGINT
|
||||
)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
END
|
||||
""");
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException("Failed to set up loader", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||
Set<EdgeDomain> affectedDomains = new HashSet<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
|
||||
var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
)
|
||||
{
|
||||
@ -67,7 +46,7 @@ public class SqlLoadUrls {
|
||||
affectedDomains.add(url.domain);
|
||||
|
||||
insertCall.setString(1, url.proto);
|
||||
insertCall.setString(2, url.domain.toString());
|
||||
insertCall.setInt(2, data.getDomainId(url.domain));
|
||||
if (url.port != null) {
|
||||
insertCall.setInt(3, url.port);
|
||||
}
|
||||
@ -79,7 +58,7 @@ public class SqlLoadUrls {
|
||||
insertCall.setLong(6, hashPath(url.path, url.param));
|
||||
insertCall.addBatch();
|
||||
|
||||
if (cnt++ == 250) {
|
||||
if (cnt++ == 1000) {
|
||||
var ret = insertCall.executeBatch();
|
||||
conn.commit();
|
||||
|
||||
|
@ -1,11 +1,18 @@
|
||||
package nu.marginalia.wmsa.edge.converting.model;
|
||||
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||
|
||||
public class DisqualifiedException extends Exception {
|
||||
public final DisqualificationReason reason;
|
||||
|
||||
public DisqualifiedException(DisqualificationReason reason) {
|
||||
this.reason = reason;
|
||||
}
|
||||
|
||||
public DisqualifiedException(CrawlerDocumentStatus crawlerStatus) {
|
||||
this.reason = DisqualificationReason.fromCrawlerStatus(crawlerStatus);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Throwable fillInStackTrace() {
|
||||
return this;
|
||||
@ -18,6 +25,22 @@ public class DisqualifiedException extends Exception {
|
||||
STATUS,
|
||||
QUALITY,
|
||||
ACCEPTABLE_ADS,
|
||||
FORBIDDEN
|
||||
FORBIDDEN,
|
||||
SHORT_CIRCUIT,
|
||||
|
||||
PROCESSING_EXCEPTION,
|
||||
|
||||
BAD_CONTENT_TYPE,
|
||||
BAD_CHARSET,
|
||||
REDIRECT,
|
||||
ROBOTS_TXT,
|
||||
ERROR,
|
||||
Timeout, // Don't you dare
|
||||
BAD_CANONICAL
|
||||
;
|
||||
|
||||
public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {
|
||||
return DisqualificationReason.valueOf(crawlerStatus.name());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,10 @@ public class ProcessedDocument {
|
||||
public EdgeUrlState state;
|
||||
public String stateReason;
|
||||
|
||||
public boolean isOk() {
|
||||
return EdgeUrlState.OK == state;
|
||||
}
|
||||
|
||||
public OptionalDouble quality() {
|
||||
if (details != null) {
|
||||
return OptionalDouble.of(details.quality);
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
@ -81,32 +82,12 @@ public class DocumentProcessor {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
|
||||
ProcessedDocument ret = new ProcessedDocument();
|
||||
|
||||
try {
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
if (ret.state == EdgeUrlState.OK) {
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (isAcceptedContentType(crawledDocument)) {
|
||||
var detailsWords = createDetails(crawledDomain, crawledDocument);
|
||||
|
||||
ret.details = detailsWords.details();
|
||||
ret.words = detailsWords.words();
|
||||
}
|
||||
else {
|
||||
throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
|
||||
}
|
||||
}
|
||||
else {
|
||||
throw new DisqualifiedException(DisqualificationReason.STATUS);
|
||||
}
|
||||
processDocument(crawledDocument, crawledDomain, ret);
|
||||
}
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
@ -115,6 +96,7 @@ public class DocumentProcessor {
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
ret.stateReason = DisqualificationReason.PROCESSING_EXCEPTION.toString();
|
||||
logger.info("Failed to convert " + crawledDocument.url, ex);
|
||||
ex.printStackTrace();
|
||||
}
|
||||
@ -122,6 +104,32 @@ public class DocumentProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
|
||||
|
||||
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
|
||||
if (crawlerStatus != CrawlerDocumentStatus.OK) {
|
||||
throw new DisqualifiedException(crawlerStatus);
|
||||
}
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (!isAcceptedContentType(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
|
||||
}
|
||||
|
||||
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
var detailsWithWordsLinks = createDetails(crawledDomain, crawledDocument);
|
||||
|
||||
ret.details = detailsWithWordsLinks.details();
|
||||
ret.words = detailsWithWordsLinks.words();
|
||||
}
|
||||
|
||||
|
||||
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
|
||||
throws URISyntaxException
|
||||
{
|
||||
@ -193,9 +201,11 @@ public class DocumentProcessor {
|
||||
ret.standard = getHtmlStandard(doc);
|
||||
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
||||
|
||||
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
|
||||
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
|
||||
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
|
||||
|
||||
KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality);
|
||||
|
||||
EdgePageWordSet words;
|
||||
if (shouldDoSimpleProcessing(url, ret)) {
|
||||
/* Some documents we'll index, but only superficially. This is a compromise
|
||||
@ -203,12 +213,12 @@ public class DocumentProcessor {
|
||||
queries. This also saves a lot of processing power.
|
||||
*/
|
||||
ret.features = Set.of(HtmlFeature.UNKNOWN);
|
||||
words = keywordExtractor.extractKeywordsMinimal(dld);
|
||||
words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata);
|
||||
ret.description = "";
|
||||
}
|
||||
else {
|
||||
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
|
||||
words = keywordExtractor.extractKeywords(dld);
|
||||
words = keywordExtractor.extractKeywords(dld, keywordMetadata);
|
||||
ret.description = getDescription(doc);
|
||||
}
|
||||
|
||||
@ -239,6 +249,10 @@ public class DocumentProcessor {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Annoying wordpress crap
|
||||
if (url.path.startsWith("/tag/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -262,7 +276,7 @@ public class DocumentProcessor {
|
||||
|
||||
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
|
||||
|
||||
words.append(IndexBlock.Meta, tagWords);
|
||||
words.appendWithNoMeta(IndexBlock.Meta, tagWords);
|
||||
}
|
||||
|
||||
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
||||
@ -296,14 +310,21 @@ public class DocumentProcessor {
|
||||
.ifPresent(lp::acceptFeed);
|
||||
}
|
||||
|
||||
createLinkKeywords(words, lp);
|
||||
createFileLinkKeywords(words, lp, domain);
|
||||
}
|
||||
|
||||
private void createLinkKeywords(EdgePageWordSet words, LinkProcessor lp) {
|
||||
final Set<String> linkTerms = new HashSet<>();
|
||||
|
||||
for (var fd : lp.getForeignDomains()) {
|
||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||
}
|
||||
words.append(IndexBlock.Meta, linkTerms);
|
||||
words.appendWithNoMeta(IndexBlock.Meta, linkTerms);
|
||||
}
|
||||
|
||||
private void createFileLinkKeywords(EdgePageWordSet words, LinkProcessor lp, EdgeDomain domain) {
|
||||
Set<String> fileKeywords = new HashSet<>(100);
|
||||
for (var link : lp.getNonIndexableUrls()) {
|
||||
|
||||
@ -314,8 +335,8 @@ public class DocumentProcessor {
|
||||
synthesizeFilenameKeyword(fileKeywords, link);
|
||||
|
||||
}
|
||||
words.append(IndexBlock.Artifacts, fileKeywords);
|
||||
|
||||
words.appendWithNoMeta(IndexBlock.Artifacts, fileKeywords);
|
||||
}
|
||||
|
||||
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
||||
@ -364,5 +385,7 @@ public class DocumentProcessor {
|
||||
return doc.text().length();
|
||||
}
|
||||
|
||||
private record DetailsWithWords(ProcessedDocumentDetails details, EdgePageWordSet words) {}
|
||||
private record DetailsWithWords(ProcessedDocumentDetails details,
|
||||
EdgePageWordSet words) {}
|
||||
|
||||
}
|
||||
|
@ -3,17 +3,22 @@ package nu.marginalia.wmsa.edge.converting.processor;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;
|
||||
|
||||
@ -47,6 +52,8 @@ public class DomainProcessor {
|
||||
|
||||
fixBadCanonicalTags(crawledDomain.doc);
|
||||
|
||||
InternalLinkGraph internalLinkGraph = new InternalLinkGraph();
|
||||
|
||||
DocumentDisqualifier disqualifier = new DocumentDisqualifier();
|
||||
for (var doc : crawledDomain.doc) {
|
||||
if (disqualifier.isQualified()) {
|
||||
@ -54,6 +61,9 @@ public class DomainProcessor {
|
||||
|
||||
if (processedDoc.url != null) {
|
||||
ret.documents.add(processedDoc);
|
||||
|
||||
internalLinkGraph.accept(processedDoc);
|
||||
|
||||
processedDoc.quality().ifPresent(disqualifier::offer);
|
||||
}
|
||||
else if ("LANGUAGE".equals(processedDoc.stateReason)) {
|
||||
@ -62,24 +72,16 @@ public class DomainProcessor {
|
||||
}
|
||||
else { // Short-circuit processing if quality is too low
|
||||
var stub = documentProcessor.makeDisqualifiedStub(doc);
|
||||
stub.stateReason = DisqualifiedException.DisqualificationReason.SHORT_CIRCUIT.toString();
|
||||
if (stub.url != null) {
|
||||
ret.documents.add(stub);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> commonSiteWords = new HashSet<>(10);
|
||||
flagCommonSiteWords(ret);
|
||||
flagAdjacentSiteWords(internalLinkGraph, ret);
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
|
||||
|
||||
if (!commonSiteWords.isEmpty()) {
|
||||
for (var doc : ret.documents) {
|
||||
if (doc.words != null) {
|
||||
doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
ret.documents = Collections.emptyList();
|
||||
@ -90,6 +92,70 @@ public class DomainProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void flagCommonSiteWords(ProcessedDomain processedDomain) {
|
||||
Set<String> commonSiteWords = new HashSet<>(10);
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Tfidf_High, IndexBlock.Subjects));
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Title));
|
||||
|
||||
if (commonSiteWords.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (var doc : processedDomain.documents) {
|
||||
if (doc.words != null) {
|
||||
for (var block : IndexBlock.values()) {
|
||||
if (block.type == IndexBlockType.PAGE_DATA) {
|
||||
doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.Site, commonSiteWords);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void flagAdjacentSiteWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) {
|
||||
var invertedGraph = internalLinkGraph.trimAndInvert();
|
||||
|
||||
Map<EdgeUrl, Set<String>> linkedKeywords = new HashMap<>(100);
|
||||
|
||||
invertedGraph.forEach((url, linkingUrls) -> {
|
||||
Map<String, Integer> keywords = new HashMap<>(100);
|
||||
|
||||
for (var linkingUrl : linkingUrls) {
|
||||
for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) {
|
||||
keywords.merge(keyword, 1, Integer::sum);
|
||||
}
|
||||
}
|
||||
|
||||
var words = keywords.entrySet().stream()
|
||||
.filter(e -> e.getValue() > 3)
|
||||
.map(Map.Entry::getKey)
|
||||
.filter(internalLinkGraph.getCandidateKeywords(url)::contains)
|
||||
.collect(Collectors.toSet());
|
||||
if (!words.isEmpty()) {
|
||||
linkedKeywords.put(url, words);
|
||||
}
|
||||
});
|
||||
|
||||
for (var doc : processedDomain.documents) {
|
||||
if (doc.words == null)
|
||||
continue;
|
||||
|
||||
final Set<String> keywords = linkedKeywords.get(doc.url);
|
||||
if (keywords == null)
|
||||
continue;
|
||||
|
||||
for (var block : IndexBlock.values()) {
|
||||
if (block.type == IndexBlockType.PAGE_DATA) {
|
||||
doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.SiteAdjacent, keywords);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void fixBadCanonicalTags(List<CrawledDocument> docs) {
|
||||
Map<String, Set<String>> seenCanonicals = new HashMap<>();
|
||||
Set<String> seenUrls = new HashSet<>();
|
||||
@ -162,7 +228,8 @@ public class DomainProcessor {
|
||||
}
|
||||
|
||||
boolean isQualified() {
|
||||
return count < 25 || goodCount*10 >= count;
|
||||
return true;
|
||||
// return count < 25 || goodCount*10 >= count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,116 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.*;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class InstructionsCompiler {
|
||||
|
||||
public List<Instruction> compile(ProcessedDomain domain) {
|
||||
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||
|
||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||
|
||||
if (domain.documents != null) {
|
||||
compileUrls(ret, domain.documents);
|
||||
compileDocuments(ret, domain.documents);
|
||||
compileFeeds(ret, domain.documents);
|
||||
|
||||
compileLinks(ret, domain.domain, domain.documents);
|
||||
}
|
||||
if (domain.redirect != null) {
|
||||
compileRedirect(ret, domain.domain, domain.redirect);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void compileRedirect(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
|
||||
ret.add(new LoadDomain(to));
|
||||
ret.add(new LoadDomainLink(new DomainLink(from, to)));
|
||||
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
|
||||
}
|
||||
|
||||
private void compileUrls(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||
|
||||
for (var doc : documents) {
|
||||
seenUrls.add(doc.url);
|
||||
|
||||
if (doc.details != null) {
|
||||
for (var url : doc.details.linksExternal) {
|
||||
seenDomains.add(url.domain);
|
||||
}
|
||||
seenUrls.addAll(doc.details.linksExternal);
|
||||
seenUrls.addAll(doc.details.linksInternal);
|
||||
}
|
||||
}
|
||||
|
||||
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
|
||||
}
|
||||
|
||||
private void compileLinks(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
|
||||
DomainLink[] links = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.linksExternal.stream())
|
||||
.map(link -> link.domain)
|
||||
.distinct()
|
||||
.map(domain -> new DomainLink(from, domain))
|
||||
.toArray(DomainLink[]::new);
|
||||
|
||||
ret.add(new LoadDomainLink(links));
|
||||
}
|
||||
|
||||
private void compileFeeds(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.feedLinks.stream())
|
||||
.distinct()
|
||||
.toArray(EdgeUrl[]::new);
|
||||
|
||||
ret.add(new LoadRssFeed(feeds));
|
||||
}
|
||||
|
||||
private void compileDocuments(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
for (var doc : documents) {
|
||||
compileDocumentDetails(ret, doc);
|
||||
}
|
||||
|
||||
for (var doc : documents) {
|
||||
compileWords(ret, doc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var details = doc.details;
|
||||
|
||||
if (details != null) {
|
||||
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
|
||||
}
|
||||
else {
|
||||
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state));
|
||||
}
|
||||
}
|
||||
|
||||
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var words = doc.words;
|
||||
if (words != null) {
|
||||
var wordsArray = words.values().stream()
|
||||
.map(DocumentKeywords::new)
|
||||
.toArray(DocumentKeywords[]::new);
|
||||
|
||||
ret.add(new LoadKeywords(doc.url, wordsArray));
|
||||
}
|
||||
}
|
||||
}
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
import crawlercommons.utils.Strings;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
@ -23,13 +24,12 @@ public class DocumentValuator {
|
||||
|
||||
);
|
||||
|
||||
public double getQuality(EdgeHtmlStandard htmlStandard, Document doc, DocumentLanguageData dld) throws DisqualifiedException {
|
||||
public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
|
||||
double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
|
||||
double scriptPenalty = getScriptPenalty(doc);
|
||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||
|
||||
|
||||
int textBodyLength = doc.text().length();
|
||||
int rawLength = doc.html().length();
|
||||
int textBodyLength = parsedDocument.text().length();
|
||||
int rawLength = crawledDocument.documentBody.length();
|
||||
|
||||
if (textBodyLength == 0) {
|
||||
throw new DisqualifiedException(LENGTH);
|
||||
|
@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@ -43,13 +40,15 @@ public class FeatureExtractor {
|
||||
private final RecipeDetector recipeDetector;
|
||||
private final TextileCraftDetector textileCraftDetector;
|
||||
private final WoodworkingDetector woodworkingDetector;
|
||||
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
||||
|
||||
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
this.recipeDetector = recipeDetector;
|
||||
this.textileCraftDetector = textileCraftDetector;
|
||||
this.woodworkingDetector = woodworkingDetector;
|
||||
this.googleAnwersSpamDetector = googleAnwersSpamDetector;
|
||||
}
|
||||
|
||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
|
||||
@ -57,6 +56,10 @@ public class FeatureExtractor {
|
||||
|
||||
final Elements scriptTags = doc.getElementsByTag("script");
|
||||
|
||||
if (googleAnwersSpamDetector.testP(doc) > 0.5) {
|
||||
features.add(HtmlFeature.GA_SPAM);
|
||||
}
|
||||
|
||||
for (var scriptTag : scriptTags) {
|
||||
if (isJavascriptTag(scriptTag)) {
|
||||
features.add(HtmlFeature.JS);
|
||||
|
@ -7,14 +7,14 @@ public enum HtmlFeature {
|
||||
JS("special:scripts"),
|
||||
AFFILIATE_LINK( "special:affiliate"),
|
||||
TRACKING("special:tracking"),
|
||||
|
||||
COOKIES("special:cookies"),
|
||||
|
||||
CATEGORY_FOOD("category:food"),
|
||||
|
||||
ADVERTISEMENT("special:ads"),
|
||||
|
||||
CATEGORY_CRAFTS("category:crafts"),
|
||||
|
||||
GA_SPAM("special:gaspam"),
|
||||
|
||||
UNKNOWN("special:uncategorized")
|
||||
;
|
||||
|
||||
|
@ -0,0 +1,54 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class InternalLinkGraph {
|
||||
private final Map<EdgeUrl, Set<EdgeUrl>> internalLinkGraph = new HashMap<>(1000);
|
||||
private final Set<EdgeUrl> goodUrls = new HashSet<>(1000);
|
||||
private final Map<EdgeUrl, Set<String>> topKeywordsByUrl = new HashMap<>(1000);
|
||||
private final Map<EdgeUrl, Set<String>> candidateKeywordsByUrl = new HashMap<>(1000);
|
||||
|
||||
public void accept(ProcessedDocument doc) {
|
||||
if (doc.details == null || doc.details.linksInternal == null)
|
||||
return;
|
||||
|
||||
goodUrls.add(doc.url);
|
||||
internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal));
|
||||
|
||||
Set<String> topKeywords = new HashSet<>(doc.words.get(IndexBlock.Tfidf_High).words);
|
||||
topKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
|
||||
topKeywordsByUrl.put(doc.url, topKeywords);
|
||||
|
||||
Set<String> candidateKeywords = new HashSet<>(topKeywords);
|
||||
candidateKeywords.addAll(doc.words.get(IndexBlock.Tfidf_High).words);
|
||||
candidateKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
|
||||
candidateKeywordsByUrl.put(doc.url, candidateKeywords);
|
||||
}
|
||||
|
||||
public Map<EdgeUrl, Set<EdgeUrl>> trimAndInvert() {
|
||||
internalLinkGraph.values().forEach(dest -> dest.retainAll(goodUrls));
|
||||
|
||||
Map<EdgeUrl, Set<EdgeUrl>> inverted = new HashMap<>(goodUrls.size());
|
||||
|
||||
internalLinkGraph.forEach((source, dests) -> {
|
||||
dests.forEach(dest -> inverted.computeIfAbsent(dest,
|
||||
d->new HashSet<>(25))
|
||||
.add(source));
|
||||
});
|
||||
|
||||
internalLinkGraph.clear();
|
||||
|
||||
return inverted;
|
||||
}
|
||||
|
||||
public Set<String> getKeywords(EdgeUrl url) {
|
||||
return topKeywordsByUrl.getOrDefault(url, Collections.emptySet());
|
||||
}
|
||||
public Set<String> getCandidateKeywords(EdgeUrl url) {
|
||||
return candidateKeywordsByUrl.getOrDefault(url, Collections.emptySet());
|
||||
}
|
||||
}
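trimAndInvert() first restricts the graph to pages that were actually processed, then flips every edge so each page can be looked up by the pages that link to it. The inversion step in isolation, as a generic sketch:

import java.util.*;

class GraphInvertSketch {
    // For every edge source -> dest in the input, record dest -> source in the output.
    static <T> Map<T, Set<T>> invert(Map<T, Set<T>> graph) {
        Map<T, Set<T>> inverted = new HashMap<>();
        graph.forEach((source, dests) ->
                dests.forEach(dest ->
                        inverted.computeIfAbsent(dest, d -> new HashSet<>()).add(source)));
        return inverted;
    }
}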
|
@ -5,7 +5,6 @@ import com.google.common.base.Strings;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.jetbrains.annotations.Contract;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.slf4j.Logger;
|
||||
@ -202,7 +201,6 @@ public class LinkParser {
|
||||
return binarySuffixList.stream().anyMatch(str::endsWith);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
||||
var baseTags = parsed.getElementsByTag("base");
|
||||
|
||||
|
@ -1,9 +1,13 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class QueryParams {
|
||||
|
||||
@ -15,10 +19,28 @@ public class QueryParams {
|
||||
return null;
|
||||
}
|
||||
|
||||
var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
|
||||
.filter(param -> QueryParams.isPermittedParam(path, param))
|
||||
.sorted()
|
||||
.collect(Collectors.joining("&"));
|
||||
String ret;
|
||||
if (queryParams.indexOf('&') >= 0) {
|
||||
|
||||
List<String> parts = new ArrayList<>();
|
||||
for (var part : StringUtils.split(queryParams, '&')) {
|
||||
if (QueryParams.isPermittedParam(path, part)) {
|
||||
parts.add(part);
|
||||
}
|
||||
}
|
||||
if (parts.size() > 1) {
|
||||
parts.sort(Comparator.naturalOrder());
|
||||
}
|
||||
StringJoiner retJoiner = new StringJoiner("&");
|
||||
parts.forEach(retJoiner::add);
|
||||
ret = retJoiner.toString();
|
||||
}
|
||||
else if (isPermittedParam(path, queryParams)) {
|
||||
ret = queryParams;
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (ret.isBlank())
|
||||
return null;
|
||||
|
@ -0,0 +1,36 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import org.jsoup.nodes.Document;

import java.util.List;

public class GoogleAnwersSpamDetector {

    private final List<String> prefixes = List.of("What", "Why", "How", "When", "Is");

    public double testP(Document doc) {
        if (trialTag(doc, "h1")) return 1;
        if (trialTag(doc, "h2")) return 1;
        if (trialTag(doc, "h3")) return 1;

        return 0;
    }

    private boolean trialTag(Document doc, String tagName) {
        int positive = 0;
        int total = 0;

        for (var elem : doc.getElementsByTag(tagName)) {
            String text = elem.text();
            for (var prefix : prefixes) {
                if (text.startsWith(prefix)) {
                    positive++;
                    break;
                }
            }
            total++;
        }

        return positive > 4 && positive / (double) total > 0.5;
    }
}
|
@ -29,7 +29,7 @@ public class CrawlJobExtractorMain {
|
||||
"""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE URL_PART=?
|
||||
WHERE DOMAIN_NAME=?
|
||||
""";
|
||||
|
||||
private static final String domainsSql =
|
||||
|
@ -11,6 +11,17 @@ import java.util.regex.Pattern;
|
||||
public class UrlBlocklist {
|
||||
private final List<Predicate<String>> patterns = new ArrayList<>();
|
||||
|
||||
    private record UrlPatternContains(String contains, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.contains(contains) && pattern.matcher(s).find();
        }
    }
    private record UrlPatternMinLength(int minLength, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.length() >= minLength && pattern.matcher(s).find();
        }
    }
|
||||
|
||||
// domains that have a lot of links but we know we don't want to crawl
|
||||
private final Set<String> badDomains = Set.of("t.co", "facebook.com",
|
||||
"instagram.com", "youtube.com",
|
||||
@ -18,18 +29,24 @@ public class UrlBlocklist {
|
||||
|
||||
public UrlBlocklist() {
|
||||
// Don't deep-crawl git repos
|
||||
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
|
||||
patterns.add(Pattern.compile("wp-content/upload").asPredicate());
|
||||
patterns.add(s -> s.contains(".git/"));
|
||||
|
||||
patterns.add(s -> s.contains("wp-content/upload"));
|
||||
patterns.add(s -> s.contains("-download-free"));
|
||||
|
||||
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
||||
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
|
||||
patterns.add(new UrlPatternMinLength(48, Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)")));
|
||||
|
||||
// link farms &c
|
||||
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
|
||||
patterns.add(Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$").asPredicate());
|
||||
patterns.add(Pattern.compile(".*-download-free$").asPredicate());
|
||||
patterns.add(new UrlPatternContains("/download", Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$")));
|
||||
patterns.add(new UrlPatternContains("/permalink", Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$")));
|
||||
patterns.add(new UrlPatternContains("/webrx", Pattern.compile("webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/lib", Pattern.compile("lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/pdf", Pattern.compile("pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/book", Pattern.compile("book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/720p", Pattern.compile("720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/node", Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$")));
|
||||
|
||||
}
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
|
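The blocklist rewrite above pairs each regex with a cheap guard (a substring or minimum-length check) so Pattern matching only runs on URLs that could plausibly match. A stand-alone sketch of the same guarded-predicate idea; the example pattern is simplified and not one of the production expressions:

import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;

class GuardedPatternSketch {
    // The regex is only evaluated when the cheap contains() test passes.
    record Guarded(String mustContain, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.contains(mustContain) && pattern.matcher(s).find();
        }
    }

    static final List<Predicate<String>> patterns = List.of(
            new Guarded("/download", Pattern.compile("/download(-[A-Za-z0-9]+){4,}\\.html?$")));

    static boolean isBlocked(String path) {
        return patterns.stream().anyMatch(p -> p.test(path));
    }
}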
@@ -31,6 +31,8 @@ public class CrawlerRetreiver {

    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500);
    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);

    private static final int MAX_ERRORS = 10;

    private final LinkedList<EdgeUrl> queue = new LinkedList<>();
    private final HttpFetcher fetcher;

@@ -50,6 +52,8 @@ public class CrawlerRetreiver {
    private static final IpBlockList ipBlocklist;
    private static final UrlBlocklist urlBlocklist = new UrlBlocklist();

    int errorCount = 0;

    static {
        try {
            ipBlocklist = new IpBlockList(new GeoIpBlocklist());
@@ -137,7 +141,7 @@ public class CrawlerRetreiver {

        int fetchedCount = 0;

        while (!queue.isEmpty() && visited.size() < depth) {
        while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) {
            var top = queue.removeFirst();

            if (!robotsRules.isAllowed(top.toString())) {
@@ -179,6 +183,10 @@ public class CrawlerRetreiver {
                EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
            }

            if ("ERROR".equals(d.crawlerStatus)) {
                errorCount++;
            }

        }

        long crawledTime = System.currentTimeMillis() - startTime;
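Taken together, the new field and the widened loop condition mean a crawl now aborts early once a site has produced MAX_ERRORS failed fetches, instead of grinding through the full depth budget. Roughly, with a stand-in fetch helper that is not the class's real method:

// Stand-in sketch of the new bail-out behaviour; not literal project code.
int errorCount = 0;
while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS) {
    var doc = fetchAndProcess(queue.removeFirst());   // hypothetical helper
    if ("ERROR".equals(doc.crawlerStatus)) {
        errorCount++;                                 // ten failures and the domain is abandoned
    }
}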
@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;

import java.util.List;
import java.util.Optional;

@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
@@ -23,7 +24,7 @@ public interface EdgeDataStoreDao {

    List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);

    EdgeDomain getDomain(EdgeId<EdgeDomain> id);
    Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id);

}
@@ -93,7 +93,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                    WORDS_TOTAL, FORMAT, FEATURES,
                    IP, DOMAIN_STATE,
                    DATA_HASH
                    FROM EC_URL_VIEW WHERE ID IN
                    FROM EC_URL_VIEW
                    WHERE TITLE IS NOT NULL
                    AND ID IN
                    """ + idString)) {
                stmt.setFetchSize(ids.size());

@@ -113,7 +115,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                            EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
                            Integer.MAX_VALUE, // rankingId
                            Double.MAX_VALUE, // termScore
                            1 // resultsFromSameDomain
                            1, // resultsFromSameDomain
                            "", // positions
                            null // result item
                    );
                    if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                            && Strings.isNullOrEmpty(val.description)
@@ -309,18 +313,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {

    @Override
    @SneakyThrows
    public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
    public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                stmt.setInt(1, id.id());
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    return new EdgeDomain(rsp.getString(1));
                    return Optional.of(new EdgeDomain(rsp.getString(1)));
                }
                throw new NoSuchElementException();
                return Optional.empty();
            }
        }
    }

}
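Returning Optional.empty() instead of throwing NoSuchElementException pushes the missing-domain case onto the caller. A hedged sketch of what a call site looks like after this change (the dao variable and the fallback string are illustrative, not taken from the commit):

// Illustrative call site only; "dao" and the fallback string are assumptions.
Optional<EdgeDomain> domain = dao.getDomain(id);
String name = domain.map(EdgeDomain::toString)
                    .orElse("(unknown domain)");   // previously this path threw NoSuchElementException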
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
import org.slf4j.Logger;
@@ -18,8 +17,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    private static final Counter wmsa_blacklist_intercept = Counter.build("wmsa_blacklist_intercept",
            "wmsa_blacklist_intercept").register();
    @Inject
    public EdgeDomainBlacklistImpl(HikariDataSource dataSource) {
        this.dataSource = dataSource;
@@ -65,7 +62,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
    @Override
    public boolean isBlacklisted(int domainId) {
        if (spamDomainSet.contains(domainId)) {
            wmsa_blacklist_intercept.inc();
            return true;
        }

@@ -0,0 +1,34 @@
package nu.marginalia.wmsa.edge.explorer;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;
import spark.Spark;

public class ExplorerMain extends MainClass {
    final ExplorerService service;

    @Inject
    public ExplorerMain(ExplorerService service) {
        this.service = service;
    }

    public static void main(String... args) {
        init(ServiceDescriptor.EXPLORER, args);

        Spark.staticFileLocation("/static/explore/");

        Injector injector = Guice.createInjector(
                new ConfigurationModule(),
                new DatabaseModule()
        );

        injector.getInstance(ExplorerMain.class);
        injector.getInstance(Initialization.class).setReady();
    }
}
@ -0,0 +1,253 @@
|
||||
package nu.marginalia.wmsa.edge.explorer;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
import nu.marginalia.wmsa.resource_store.StaticResources;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
public class ExplorerService extends Service {
|
||||
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final HikariDataSource dataSource;
|
||||
private final StaticResources staticResources;
|
||||
|
||||
record SearchResult(
|
||||
String domain,
|
||||
String url,
|
||||
double relatedness,
|
||||
boolean hasMore,
|
||||
boolean active,
|
||||
boolean indexed) implements Comparable<SearchResult> {
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull SearchResult o) {
|
||||
return (int)(o.relatedness - relatedness);
|
||||
}
|
||||
}
|
||||
|
||||
record SearchResults(String query, String message, String aliasDomain, List<SearchResult> resultList) { }
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public ExplorerService(@Named("service-host") String ip,
|
||||
@Named("service-port") Integer port,
|
||||
Initialization initialization,
|
||||
MetricsServer metricsServer,
|
||||
RendererFactory rendererFactory,
|
||||
HikariDataSource dataSource,
|
||||
StaticResources staticResources
|
||||
) {
|
||||
|
||||
super(ip, port, initialization, metricsServer);
|
||||
|
||||
renderer = rendererFactory.renderer("explorer/explorer");
|
||||
this.dataSource = dataSource;
|
||||
this.staticResources = staticResources;
|
||||
Spark.get("/public/", this::serveIndex, this::render);
|
||||
Spark.get("/public/search", this::search, this::render);
|
||||
Spark.get("/public/:resource", this::serveStatic);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Object serveStatic(Request request, Response response) {
|
||||
String resource = request.params("resource");
|
||||
staticResources.serveStatic("explore", resource, request, response);
|
||||
return "";
|
||||
}
|
||||
|
||||
public String render(Object results) {
|
||||
return renderer.render(results);
|
||||
}
|
||||
|
||||
private SearchResults serveIndex(Request request, Response response) {
|
||||
|
||||
return new SearchResults("", "", null, Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
private SearchResults search(Request request, Response response) throws SQLException {
|
||||
String query = request.queryParams("domain");
|
||||
|
||||
query = trimUrlJunk(query);
|
||||
|
||||
DomainIdInformation domainId = getDomainId(query);
|
||||
if (!domainId.isPresent()) {
|
||||
return new SearchResults(query,
|
||||
"Could not find such a domain (maybe try adding/removing www?)",
|
||||
null, Collections.emptyList());
|
||||
}
|
||||
|
||||
var relatedDomains = getRelatedDomains(domainId);
|
||||
|
||||
if (relatedDomains.isEmpty()) {
|
||||
String message = """
|
||||
I've got nothing. This may either be due to the website being far out in the periphery of Marginalia's
|
||||
search engine index, or it may be due to the website being too big.
|
||||
A few hundred of the biggest websites are excluded for performance reasons. They are usually
|
||||
not very interesting to look at either as everyone links to them and there's no real pattern to discern.
|
||||
""";
|
||||
|
||||
return new SearchResults(query, message, domainId.alias, relatedDomains);
|
||||
}
|
||||
|
||||
return new SearchResults(query, "", domainId.alias, relatedDomains);
|
||||
}
|
||||
|
||||
private List<SearchResult> getRelatedDomains(DomainIdInformation domainIdInformation) throws SQLException {
|
||||
List<SearchResult> ret = new ArrayList<>();
|
||||
Set<String> seen = new HashSet<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT
|
||||
NV.NEIGHBOR_NAME,
|
||||
NV.RELATEDNESS,
|
||||
(LV.DOMAIN_ID IS NOT NULL),
|
||||
(STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
|
||||
INDEXED > 0
|
||||
FROM EC_NEIGHBORS_VIEW NV
|
||||
LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID)
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
|
||||
WHERE NV.DOMAIN_ID=?
|
||||
GROUP BY NV.NEIGHBOR_ID
|
||||
ORDER BY NV.RELATEDNESS DESC
|
||||
""");
|
||||
var stmtRev = conn.prepareStatement("""
|
||||
SELECT
|
||||
NV.DOMAIN_NAME,
|
||||
NV.RELATEDNESS,
|
||||
(LV.NEIGHBOR_ID IS NOT NULL),
|
||||
(STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
|
||||
INDEXED > 0
|
||||
FROM EC_NEIGHBORS_VIEW NV
|
||||
LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID)
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID
|
||||
WHERE NV.NEIGHBOR_ID=?
|
||||
GROUP BY NV.DOMAIN_ID
|
||||
ORDER BY NV.RELATEDNESS DESC
|
||||
"""
|
||||
);
|
||||
|
||||
) {
|
||||
|
||||
stmt.setInt(1, domainIdInformation.domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
||||
String domainName = rsp.getString(1);
|
||||
double relatedness = rsp.getDouble(2);
|
||||
boolean hasMore = rsp.getBoolean(3);
|
||||
boolean active = rsp.getBoolean(4);
|
||||
boolean indexed = rsp.getBoolean(5);
|
||||
|
||||
seen.add(domainName);
|
||||
|
||||
String url = "http://" + domainName + "/";
|
||||
|
||||
|
||||
if (domainName.length() < 48 && domainName.contains(".")) {
|
||||
ret.add(new SearchResult(
|
||||
domainName,
|
||||
url,
|
||||
relatedness,
|
||||
hasMore,
|
||||
active,
|
||||
indexed
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
stmtRev.setInt(1, domainIdInformation.domainId);
|
||||
rsp = stmtRev.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
||||
String domainName = rsp.getString(1);
|
||||
double relatedness = rsp.getDouble(2);
|
||||
boolean hasMore = rsp.getBoolean(3);
|
||||
boolean active = rsp.getBoolean(4);
|
||||
boolean indexed = rsp.getBoolean(5);
|
||||
|
||||
String url = "http://" + domainName + "/";
|
||||
|
||||
if (!seen.add(domainName))
|
||||
continue;
|
||||
|
||||
if (domainName.length() < 48 && domainName.contains(".")) {
|
||||
ret.add(new SearchResult(
|
||||
domainName,
|
||||
url,
|
||||
relatedness,
|
||||
hasMore,
|
||||
active,
|
||||
indexed
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Comparator<SearchResult> comp = SearchResult::compareTo;
|
||||
comp = comp.thenComparing(SearchResult::domain);
|
||||
ret.sort(comp);
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
private DomainIdInformation getDomainId(String query) throws SQLException {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME
|
||||
FROM EC_DOMAIN DOMAIN
|
||||
LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID
|
||||
WHERE DOMAIN.DOMAIN_NAME=?
|
||||
""")) {
|
||||
stmt.setString(1, query);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new DomainIdInformation(
|
||||
rsp.getInt(1),
|
||||
rsp.getBoolean(2),
|
||||
rsp.getString(3)
|
||||
);
|
||||
}
|
||||
}
|
||||
return new DomainIdInformation(-1, false, null);
|
||||
}
|
||||
|
||||
private String trimUrlJunk(String query) {
|
||||
if (query.startsWith("http://")) {
|
||||
query = query.substring(7);
|
||||
}
|
||||
if (query.startsWith("https://")) {
|
||||
query = query.substring(8);
|
||||
}
|
||||
|
||||
int lastSlash = query.indexOf('/');
|
||||
if (lastSlash > 0) {
|
||||
query = query.substring(0, lastSlash);
|
||||
}
|
||||
|
||||
return query;
|
||||
}
|
||||
|
||||
record DomainIdInformation(int domainId, boolean indexed, String alias) {
|
||||
boolean isPresent() {
|
||||
return domainId >= 0;
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,20 +1,19 @@
package nu.marginalia.wmsa.edge.index;

import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryRankLimitingFilter;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -103,54 +102,65 @@ public class EdgeIndexBucket {
        return indexReader != null;
    }

    public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
    public IndexQuery getQuery(LongPredicate filter, IndexQueryParams params) {

        if (null == indexReader) {
            logger.warn("Index reader not neady {}", block);
            logger.warn("Index reader not neady {}", params.block());
            return new IndexQuery(Collections.emptyList());
        }

        final int[] orderedIncludes = searchTerms.includes
                .stream()
                .sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
                .distinct()
                .mapToInt(Integer::intValue)
                .toArray();
        final int[] orderedIncludes = params.searchTerms()
                .sortedDistinctIncludes((a, b) -> compareKeywords(params.block(), a, b));

        IndexQueryFactory.IndexQueryBuilder query;
        IndexQueryFactory.IndexQueryBuilder query = createQueryBuilder(orderedIncludes[0], params);

        query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
        if (query == null) {
            return new IndexQuery(Collections.emptyList());
        }

        query.filter(filter);
        query.addInclusionFilter(new QueryFilterStepFromPredicate(filter));
        if (params.rankLimit() != null) {
            query.addInclusionFilter(new QueryRankLimitingFilter(params.rankLimit()));
        }

        for (int i = 1; i < orderedIncludes.length; i++) {
            query = query.also(orderedIncludes[i]);
        }

        for (int term : searchTerms.excludes) {
        for (int term : params.searchTerms().excludes()) {
            query = query.not(term);
        }

        return query.build();
    }

    private IndexQueryFactory.IndexQueryBuilder createQueryBuilder(int firstKeyword, IndexQueryParams params) {

    public IndexQuery getDomainQuery(IndexQueryCachePool pool, int wordId, ResultDomainDeduplicator localFilter) {
        var query = indexReader.findDomain(pool, wordId);
        if (params.targetDomains() != null && !params.targetDomains().isEmpty()) {
            return indexReader.findWordForDomainList(params.block(), params.targetDomains(), firstKeyword);
        }
        return indexReader.findWord(params.block(), params.qualityLimit(), firstKeyword);

    }

    private int compareKeywords(IndexBlock block, int a, int b) {
        return Long.compare(
                indexReader.numHits(block, a),
                indexReader.numHits(block, b)
        );
    }


    public IndexQuery getDomainQuery(int wordId, ResultDomainDeduplicator localFilter) {
        var query = indexReader.findDomain(wordId);

        query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue));

        return query;
    }

    public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
        return indexReader.getBlockForResult(cachePool, termId, urlId);
    /** Replaces the values of ids with their associated metadata, or 0L if absent */
    public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
        return indexReader.getMetadata(block, termId, ids);
    }

    public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
        return indexReader.isTermInBucket(cachePool, block, termId, urlId);
    }

}
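Both before and after this change, the include terms are sorted by ascending hit count, so the query intersection starts from the rarest, most selective keyword and the broader terms only filter an already small candidate set. A self-contained sketch of that ordering step (the names here are illustrative, not the project's API):

// Illustrative only: order term ids so posting-list intersection starts from the smallest list.
int[] orderTermsBySelectivity(int[] termIds, java.util.function.IntToLongFunction numHits) {
    return java.util.Arrays.stream(termIds)
            .distinct()
            .boxed()
            .sorted(java.util.Comparator.comparingLong(numHits::applyAsLong))
            .mapToInt(Integer::intValue)
            .toArray();
}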
@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@ -18,9 +18,6 @@ public class EdgeIndexControl {
|
||||
}
|
||||
|
||||
public void regenerateIndex(int id) {
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
|
||||
for (IndexBlock block : IndexBlock.values()) {
|
||||
try {
|
||||
servicesFactory.convertIndex(id, block);
|
||||
|
@ -9,6 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
|
||||
@ -39,7 +40,9 @@ public class EdgeIndexService extends Service {
|
||||
|
||||
EdgeIndexOpsService opsService,
|
||||
EdgeIndexLexiconService lexiconService,
|
||||
EdgeIndexQueryService indexQueryService)
|
||||
EdgeIndexQueryService indexQueryService,
|
||||
EdgeIndexDomainQueryService domainQueryService
|
||||
)
|
||||
{
|
||||
super(ip, port, init, metricsServer);
|
||||
|
||||
@ -51,7 +54,7 @@ public class EdgeIndexService extends Service {
|
||||
Spark.post("/words/", lexiconService::putWords);
|
||||
|
||||
Spark.post("/search/", indexQueryService::search, gson::toJson);
|
||||
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
|
||||
Spark.post("/search-domain/", domainQueryService::searchDomain, gson::toJson);
|
||||
|
||||
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
|
||||
|
||||
|
@ -103,9 +103,9 @@ public class IndexServicesFactory {
|
||||
|
||||
public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
|
||||
var converter = new SearchIndexConverter(block, id, tmpFileDir,
|
||||
preconverterOutputFile.get(id, block.ordinal()),
|
||||
indexWriteWordsFile.get(id, block.id),
|
||||
indexWriteUrlsFile.get(id, block.id),
|
||||
preconverterOutputFile.get(id, block),
|
||||
indexWriteWordsFile.get(id, block),
|
||||
indexWriteUrlsFile.get(id, block),
|
||||
partitioner,
|
||||
domainBlacklist
|
||||
);
|
||||
@ -118,7 +118,7 @@ public class IndexServicesFactory {
|
||||
|
||||
for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
|
||||
for (IndexBlock block : IndexBlock.values()) {
|
||||
shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block.ordinal()));
|
||||
shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block));
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,7 +129,7 @@ public class IndexServicesFactory {
|
||||
);
|
||||
}
|
||||
|
||||
private File getPreconverterOutputFile(int index, int block) {
|
||||
private File getPreconverterOutputFile(int index, IndexBlock block) {
|
||||
return preconverterOutputFile.get(index, block);
|
||||
}
|
||||
|
||||
@ -141,7 +141,7 @@ public class IndexServicesFactory {
|
||||
indexMap.put(block, createSearchIndex(id, block));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Could not create index {}-{}", id, block);
|
||||
logger.error("Could not create index {}-{} ({})", id, block, ex.getMessage());
|
||||
}
|
||||
}
|
||||
return new SearchIndexReader(indexMap);
|
||||
@ -150,8 +150,8 @@ public class IndexServicesFactory {
|
||||
private SearchIndex createSearchIndex(int bucketId, IndexBlock block) {
|
||||
try {
|
||||
return new SearchIndex("IndexReader"+bucketId+":"+ block.name(),
|
||||
indexReadUrlsFile.get(bucketId, block.id),
|
||||
indexReadWordsFile.get(bucketId, block.id));
|
||||
indexReadUrlsFile.get(bucketId, block),
|
||||
indexReadWordsFile.get(bucketId, block));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@ -159,7 +159,8 @@ public class IndexServicesFactory {
|
||||
|
||||
public Callable<Boolean> switchFilesJob(int id) {
|
||||
return () -> {
|
||||
for (int block = 0; block < IndexBlock.values().length; block++) {
|
||||
|
||||
for (var block : IndexBlock.values()) {
|
||||
if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
|
||||
Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
|
||||
Files.move(
|
||||
@ -172,6 +173,7 @@ public class IndexServicesFactory {
|
||||
StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
}
|
||||
@ -205,8 +207,8 @@ class PartitionedDataFile {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
public File get(int id) {
|
||||
Path partitionDir = partition.resolve(Integer.toString(id));
|
||||
public File get(Object id) {
|
||||
Path partitionDir = partition.resolve(id.toString());
|
||||
if (!partitionDir.toFile().exists()) {
|
||||
partitionDir.toFile().mkdir();
|
||||
}
|
||||
@ -223,13 +225,13 @@ class DoublePartitionedDataFile {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
public File get(int id, int id2) {
|
||||
Path partitionDir = partition.resolve(Integer.toString(id));
|
||||
public File get(Object id, Object id2) {
|
||||
Path partitionDir = partition.resolve(id.toString());
|
||||
|
||||
if (!partitionDir.toFile().exists()) {
|
||||
partitionDir.toFile().mkdir();
|
||||
}
|
||||
partitionDir = partitionDir.resolve(Integer.toString(id2));
|
||||
partitionDir = partitionDir.resolve(id2.toString());
|
||||
if (!partitionDir.toFile().exists()) {
|
||||
partitionDir.toFile().mkdir();
|
||||
}
|
||||
|
@ -47,6 +47,9 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW
|
||||
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
|
||||
wordSetBuilder.setIndex(wordSet.block().ordinal());
|
||||
wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
|
||||
for (var meta : wordSet.metadata()) {
|
||||
wordSetBuilder.addMeta(meta);
|
||||
}
|
||||
keywordBuilder.addWordSet(wordSetBuilder.build());
|
||||
|
||||
var req = keywordBuilder.build();
|
||||
|
@@ -21,7 +21,6 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;

@Singleton
public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@@ -53,9 +52,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
            return;
        }

        for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {
        for (var chunk : ListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) {

            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
            var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());

            indexWriter.put(header, entry);
@@ -63,19 +62,22 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {

    }

    private long[] getOrInsertWordIds(List<String> words) {
        long[] ids = new long[words.size()];
        int putId = 0;
    private long[] getOrInsertWordIds(String[] words, long[] meta) {
        long[] ids = new long[words.length*2];
        int putIdx = 0;

        for (int i = 0; i < words.length; i++) {
            String word = words[i];

        for (String word : words) {
            long id = lexicon.getOrInsert(word);
            if (id != DictionaryHashMap.NO_VALUE) {
                ids[putId++] = id;
                ids[putIdx++] = id;
                ids[putIdx++] = meta[i];
            }
        }

        if (putId != words.size()) {
            ids = Arrays.copyOf(ids, putId);
        if (putIdx != words.length*2) {
            ids = Arrays.copyOf(ids, putIdx);
        }
        return ids;
    }
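The widened signature turns the journal entry into an interleaved array: even slots hold lexicon word ids, odd slots hold the packed metadata for the preceding id, which is why the buffer is sized words.length*2 and truncated to putIdx when the dictionary rejects a word. A small sketch of reading that layout back (not project code, just an illustration of the convention):

// Illustrative decoding of the interleaved [wordId, metadata] layout produced above.
static void printEntries(long[] ids) {
    for (int i = 0; i + 1 < ids.length; i += 2) {
        long wordId   = ids[i];      // lexicon id
        long metadata = ids[i + 1];  // packed EdgePageWordMetadata bits
        System.out.printf("word=%d meta=0x%x%n", wordId, metadata);
    }
}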
@ -20,12 +20,14 @@ import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
|
||||
|
||||
public class SearchIndexConverter {
|
||||
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
|
||||
public static final int ENTRY_URL_OFFSET = 0;
|
||||
public static final int ENTRY_METADATA_OFFSET = 1;
|
||||
public static final int ENTRY_SIZE = 2;
|
||||
|
||||
private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
|
||||
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, ENTRY_SIZE, ~0, 8);
|
||||
|
||||
private final long[] tmpWordsBuffer = SearchIndexJournalReader.createAdequateTempBuffer();
|
||||
|
||||
private final Path tmpFileDir;
|
||||
|
||||
@ -72,7 +74,7 @@ public class SearchIndexConverter {
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
|
||||
logger.info("Converting {} ({}) {} {}", block.ordinal(), block, inputFile, journalReader.fileHeader);
|
||||
|
||||
var lock = partitioner.getReadLock();
|
||||
try {
|
||||
@ -80,10 +82,10 @@ public class SearchIndexConverter {
|
||||
|
||||
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
|
||||
|
||||
logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
|
||||
logger.info("Creating word index table {} for block {}", outputFileWords, block.ordinal());
|
||||
WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);
|
||||
|
||||
logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
|
||||
logger.info("Creating word urls table {} for block {}", outputFileUrls, block.ordinal());
|
||||
createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);
|
||||
|
||||
Files.delete(tmpUrlsFile);
|
||||
@ -111,10 +113,10 @@ public class SearchIndexConverter {
|
||||
|
||||
final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
|
||||
|
||||
for (int i = 0; i < entryData.size(); i++) {
|
||||
int wordId = (int) entryData.get(i);
|
||||
for (var record : entryData) {
|
||||
int wordId = record.wordId();
|
||||
if (wordId < 0 || wordId >= topWord) {
|
||||
logger.warn("Bad wordId {}", wordId);
|
||||
logger.warn("Bad word {}", record);
|
||||
}
|
||||
wordsTableWriter.acceptWord(wordId);
|
||||
}
|
||||
@ -138,7 +140,7 @@ public class SearchIndexConverter {
|
||||
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
|
||||
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
|
||||
|
||||
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
|
||||
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, ENTRY_SIZE * numberOfWordsTotal, 10_000_000)) {
|
||||
int[] wordWriteOffset = new int[wordOffsetsTable.length()];
|
||||
|
||||
for (var entry : journalReader) {
|
||||
@ -146,21 +148,29 @@ public class SearchIndexConverter {
|
||||
|
||||
var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
|
||||
|
||||
for (int i = 0; i < entryData.size(); i++) {
|
||||
int wordId = (int) entryData.get(i);
|
||||
for (var record : entryData) {
|
||||
int wordId = record.wordId();
|
||||
long metadata = record.metadata();
|
||||
|
||||
if (wordId >= wordWriteOffset.length)
|
||||
if (wordId >= wordWriteOffset.length) {
|
||||
logger.warn("Overflowing wordId {}", wordId);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (wordId < 0) {
|
||||
logger.warn("Negative wordId {}", wordId);
|
||||
}
|
||||
|
||||
final long urlInternal = translateUrl(entry.docId());
|
||||
if (wordId > 0) {
|
||||
rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
|
||||
} else {
|
||||
rwf.put(wordWriteOffset[wordId]++, urlInternal);
|
||||
}
|
||||
|
||||
long offset;
|
||||
if (wordId > 0) offset = wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId];
|
||||
else offset = wordWriteOffset[wordId];
|
||||
|
||||
rwf.put(offset + ENTRY_URL_OFFSET, urlInternal);
|
||||
rwf.put(offset + ENTRY_METADATA_OFFSET, metadata);
|
||||
|
||||
wordWriteOffset[wordId] += ENTRY_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
@ -171,9 +181,9 @@ public class SearchIndexConverter {
|
||||
|
||||
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
|
||||
if (wordOffsetsTable.length() > 0) {
|
||||
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
|
||||
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit, ENTRY_SIZE);
|
||||
|
||||
wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
|
||||
wordOffsetsTable.forEachRange(urlTmpFileSorter::sortRange);
|
||||
|
||||
urlsTmpFileMap.force();
|
||||
} else {
|
||||
@ -187,7 +197,7 @@ public class SearchIndexConverter {
|
||||
wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
|
||||
// Note: The return value is accumulated into accumulatorIdx!
|
||||
|
||||
return writer.write(accumulatorIdx, length,
|
||||
return writer.write(accumulatorIdx, length/ENTRY_SIZE,
|
||||
slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
|
||||
});
|
||||
|
||||
|
@ -9,7 +9,6 @@ import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||
import org.slf4j.Logger;
|
||||
@ -87,8 +86,25 @@ public class SearchIndexDao {
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getStandardDomains() {
|
||||
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
TIntArrayList results = new TIntArrayList();
|
||||
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT ID FROM EC_DOMAIN
|
||||
WHERE INDEXED>0
|
||||
AND STATE='ACTIVE'
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY ID ASC
|
||||
""");
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
results.add(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
|
@ -110,11 +110,12 @@ public class SearchIndexPartitioner {
|
||||
return true;
|
||||
if (academiaRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
if (standardRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
if (specialDomainRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
|
||||
if (standardRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
|
||||
return DYNAMIC_BUCKET_LENGTH == bucketId;
|
||||
}
|
||||
|
||||
@ -148,15 +149,15 @@ public class SearchIndexPartitioner {
|
||||
if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
|
||||
return academiaRanking.translateId(id);
|
||||
}
|
||||
if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
|
||||
return standardRanking.translateId(id);
|
||||
}
|
||||
if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
|
||||
return specialDomainRanking.translateId(id);
|
||||
}
|
||||
if (retroRanking != null) {
|
||||
return retroRanking.translateId(id);
|
||||
|
||||
// standard gets passed traight through
|
||||
if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
|
||||
return id;
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@ public class SearchIndexPreconverter {
|
||||
var lock = partitioner.getReadLock();
|
||||
try {
|
||||
lock.lock();
|
||||
ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
|
||||
ByteBuffer buffer = ByteBuffer.allocateDirect(65536);
|
||||
for (var entry : indexJournalReader) {
|
||||
if (!partitioner.isGoodUrl(entry.urlId())
|
||||
|| spamDomains.contains(entry.domainId())) {
|
||||
@ -93,7 +93,7 @@ public class SearchIndexPreconverter {
|
||||
}
|
||||
|
||||
public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) {
|
||||
return shard.block == entry.header.block().id
|
||||
return shard.block == entry.header.block().ordinal()
|
||||
&& partitioner.filterUnsafe(entry.domainId(), shard.bucket);
|
||||
}
|
||||
|
||||
|
@ -23,10 +23,10 @@ public class WordIndexOffsetsTable {
|
||||
|
||||
for (int i = 1; i < table.length; i++) {
|
||||
long start = table[i-1];
|
||||
int length = (int) (table[i] - start);
|
||||
long end = table[i];
|
||||
|
||||
if (length != 0) {
|
||||
o.accept(start, length);
|
||||
if (start != end) {
|
||||
o.accept(start, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -58,7 +58,7 @@ public class WordIndexOffsetsTable {
|
||||
}
|
||||
|
||||
public interface OffsetTableEntryConsumer {
|
||||
void accept(long start, int length) throws IOException;
|
||||
void accept(long start, long end) throws IOException;
|
||||
}
|
||||
|
||||
public interface OffsetTableEntryFoldConsumer {
|
||||
|
@ -8,8 +8,10 @@ import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE;
|
||||
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
|
||||
|
||||
public class WordsTableWriter {
|
||||
@ -23,8 +25,10 @@ public class WordsTableWriter {
|
||||
}
|
||||
|
||||
public void acceptWord(int wordId) {
|
||||
for (int i = 0; i < ENTRY_SIZE; i++) {
|
||||
table.lengths().increment(wordId);
|
||||
}
|
||||
}
|
||||
|
||||
public WordIndexOffsetsTable getTable() {
|
||||
return table.offsets();
|
||||
@ -58,7 +62,7 @@ public class WordsTableWriter {
|
||||
mapSlice.put(idx++, (long)length<<32);
|
||||
mapSlice.put(idx++, 0);
|
||||
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length));
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
|
||||
}
|
||||
|
||||
for (int i = 1; i < offsetTable.length; i++) {
|
||||
@ -68,7 +72,7 @@ public class WordsTableWriter {
|
||||
mapSlice.put(idx++, (long)length << 32 | i);
|
||||
mapSlice.put(idx++, urlFileOffset);
|
||||
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length));
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,8 @@ import org.jetbrains.annotations.NotNull;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Iterator;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.ENTRY_SIZE;
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
|
||||
|
||||
public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
|
||||
@ -23,6 +25,10 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalRead
|
||||
private final MultimapFileLongSlice map;
|
||||
private final long committedSize;
|
||||
|
||||
public static long[] createAdequateTempBuffer() {
|
||||
return new long[MAX_LENGTH*ENTRY_SIZE];
|
||||
}
|
||||
|
||||
public SearchIndexJournalReader(MultimapFileLong map) {
|
||||
fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
|
||||
committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
|
||||
@ -92,7 +98,7 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalRead
|
||||
public IndexBlock block() {
|
||||
return header.block();
|
||||
}
|
||||
public int wordCount() { return header.entrySize(); }
|
||||
public int wordCount() { return header.entrySize() / ENTRY_SIZE; }
|
||||
|
||||
public SearchIndexJournalEntry readEntry() {
|
||||
long[] dest = new long[header.entrySize()];
|
||||
|
@ -26,7 +26,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
private RandomAccessFile raf;
|
||||
private FileChannel channel;
|
||||
|
||||
public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
|
||||
public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4;
|
||||
private final ByteBuffer byteBuffer;
|
||||
private long pos;
|
||||
|
||||
@ -83,7 +83,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
byteBuffer.clear();
|
||||
|
||||
byteBuffer.putInt(entryData.size());
|
||||
byteBuffer.putInt(header.block().id);
|
||||
byteBuffer.putInt(header.block().ordinal());
|
||||
byteBuffer.putLong(header.documentId());
|
||||
|
||||
entryData.write(byteBuffer);
|
||||
|
@@ -2,12 +2,14 @@ package nu.marginalia.wmsa.edge.index.journal.model;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Iterator;

public class SearchIndexJournalEntry {
public class SearchIndexJournalEntry implements Iterable<SearchIndexJournalEntry.Record> {
    private final int size;
    private final long[] underlyingArray;

    public static final int MAX_LENGTH = 1000;
    public static final int ENTRY_SIZE = 2;

    public SearchIndexJournalEntry(long[] underlyingArray) {
        this.size = underlyingArray.length;
@@ -46,4 +48,24 @@ public class SearchIndexJournalEntry {
        return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
    }

    public Iterator<Record> iterator() {
        return new EntryIterator();
    }

    private class EntryIterator implements Iterator<Record> {
        int pos = -ENTRY_SIZE;

        public boolean hasNext() {
            return pos + ENTRY_SIZE < size;
        }

        @Override
        public Record next() {
            pos+=ENTRY_SIZE;

            return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
        }
    }

    public record Record(int wordId, long metadata) {}
}
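Making the entry Iterable<Record> lets callers consume the interleaved array as (wordId, metadata) pairs rather than stepping over raw longs two at a time. A sketch of the resulting call pattern, with made-up values:

// Illustrative use of the new Record iterator; the values are invented.
long[] raw = { 42L, 0xA1L,     // wordId 42, its packed metadata
               77L, 0x03L };   // wordId 77, its packed metadata
var entry = new SearchIndexJournalEntry(raw);

for (var record : entry) {
    System.out.println(record.wordId() + " -> " + Long.toHexString(record.metadata()));
}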
@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
|
||||
import io.prometheus.client.Gauge;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -16,7 +17,7 @@ import java.util.concurrent.locks.ReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
public class KeywordLexicon implements AutoCloseable {
|
||||
private final DictionaryHashMap reverseIndex;
|
||||
private final DictionaryMap reverseIndex;
|
||||
|
||||
private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@ -30,7 +31,7 @@ public class KeywordLexicon implements AutoCloseable {
|
||||
private final KeywordLexiconJournal journal;
|
||||
|
||||
@SneakyThrows
|
||||
public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) {
|
||||
public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryMap reverseIndexHashMap) {
|
||||
|
||||
journal = keywordLexiconJournal;
|
||||
reverseIndex = reverseIndexHashMap;
|
||||
|
@ -1,16 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.index.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor
|
||||
public class EdgeIndexSearchTerms {
|
||||
public List<Integer> includes = new ArrayList<>();
|
||||
public List<Integer> excludes = new ArrayList<>();
|
||||
|
||||
public boolean isEmpty() {
|
||||
return includes.isEmpty();
|
||||
}
|
||||
}
|
@@ -0,0 +1,32 @@
package nu.marginalia.wmsa.edge.index.model;

import java.util.EnumSet;

public enum EdgePageWordFlags {
    Title,
    Subjects,
    NamesWords,
    Site,
    SiteAdjacent,
    Simple;

    public int asBit() {
        return 1 << ordinal();
    }

    public boolean isPresent(long value) {
        return (asBit() & value) > 0;
    }

    public static EnumSet<EdgePageWordFlags> decode(long encodedValue) {
        EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class);

        for (EdgePageWordFlags f : values()) {
            if ((encodedValue & f.asBit()) > 0) {
                ret.add(f);
            }
        }

        return ret;
    }
}
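Each flag occupies the bit at its ordinal position, so Title is bit 0, Subjects bit 1, and so on up to Simple at bit 5. A short worked example of packing and unpacking a flag set with this enum (assuming java.util.EnumSet is imported):

// Worked example: Title (bit 0) and Site (bit 3) set, i.e. 0b1001 = 9.
long encoded = EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Site.asBit();

assert EdgePageWordFlags.Title.isPresent(encoded);
assert !EdgePageWordFlags.Subjects.isPresent(encoded);
assert EdgePageWordFlags.decode(encoded).equals(EnumSet.of(EdgePageWordFlags.Title, EdgePageWordFlags.Site));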
@@ -0,0 +1,90 @@
package nu.marginalia.wmsa.edge.index.model;

import nu.marginalia.util.BrailleBlockPunchCards;

import java.util.EnumSet;

import static java.lang.Math.max;
import static java.lang.Math.min;

public record EdgePageWordMetadata(int tfIdf,
                                   int positions,
                                   int quality,
                                   int count,
                                   EnumSet<EdgePageWordFlags> flags) {

    // If flags are moved from the least significant end of
    // this struct, then EntrySourceFromBTree will break.

    public static final long COUNT_MASK = 0xFL;
    public static final int COUNT_SHIFT = 8;

    public static final long QUALITY_MASK = 0xFL;
    public static final int QUALITY_SHIFT = 12;

    public static final long TF_IDF_MASK = 0xFFFFL;
    public static final int TF_IDF_SHIFT = 16;

    public static final int POSITIONS_SHIFT = 32;

    public EdgePageWordMetadata(long value) {
        this(
                (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
                (int)(value >>> POSITIONS_SHIFT),
                (int)((value >>> QUALITY_SHIFT) & QUALITY_MASK),
                (int)((value >>> COUNT_SHIFT) & COUNT_MASK),
                EdgePageWordFlags.decode(value)
        );
    }

    public static int decodeQuality(long encoded) {
        return (int)((encoded >>> QUALITY_SHIFT) & QUALITY_MASK);
    }

    public static boolean hasFlags(long encoded, long metadataBitMask) {
        return (encoded & metadataBitMask) == encoded;
    }

    public String toString() {
        StringBuilder sb = new StringBuilder(getClass().getSimpleName());
        sb.append('[')
                .append("tfidf=").append(tfIdf).append(", ")
                .append("quality=").append(quality).append(", ")
                .append("count=").append(count).append(", ")
                .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
        sb.append(", flags=").append(flags).append(']');
        return sb.toString();
    }

    /* Encoded in a 64 bit long as
       0-8   flags
       8-12  count,
       12-16 quality,
       16-32 tf-idf [0, 65536]
       32-64 position mask
     */
    public long encode() {
        long ret = 0;

        for (var flag : flags) {
            ret |= flag.asBit();
        }

        ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
        ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
        ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT;
        ret |= ((long)(positions)) << POSITIONS_SHIFT;

        return ret;
    }

    public boolean isEmpty() {
        return count == 0 && positions == 0 && flags.isEmpty() && tfIdf == 0;
    }

    public static long emptyValue() {
        return 0L;
    }

}
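Following the layout comment (flags in bits 0-7, count in 8-11, quality in 12-15, tf-idf in 16-31, positions in 32-63), a concrete value can be traced through encode() and back; the numbers below are chosen only for readability:

// Worked example of the packed layout.
var meta = new EdgePageWordMetadata(
        100,                                   // tfIdf     -> bits 16-31 (0x0064)
        0b1011,                                // positions -> bits 32-63 (0xB)
        5,                                     // quality   -> bits 12-15
        3,                                     // count     -> bits 8-11
        EnumSet.of(EdgePageWordFlags.Title));  // flags     -> bits 0-7 (0x01)

long packed = meta.encode();                   // 0x0000000B_00645301
assert packed == 0x0000000B00645301L;
assert new EdgePageWordMetadata(packed).equals(meta);   // the value round-trips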
@ -1,20 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.index.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
@ToString
|
||||
public class EdgePutWordsRequest {
|
||||
public EdgeId<EdgeDomain> domainId;
|
||||
public EdgeId<EdgeUrl> urlId;
|
||||
public double quality;
|
||||
|
||||
public EdgePageWordSet wordSet;
|
||||
private int index = 0;
|
||||
}
|
@@ -1,47 +1,35 @@
package nu.marginalia.wmsa.edge.index.model;

public enum IndexBlock {
    TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
    Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),
    Title(IndexBlockType.PAGE_DATA),
    Meta(IndexBlockType.PAGE_DATA),

    Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),
    Words_1(IndexBlockType.PAGE_DATA),
    Words_2(IndexBlockType.PAGE_DATA),
    Words_4(IndexBlockType.PAGE_DATA),
    Words_8(IndexBlockType.PAGE_DATA),
    Words_16Plus(IndexBlockType.PAGE_DATA),

    Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
    NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),
    Link(IndexBlockType.QUALITY_SIGNAL),
    Site(IndexBlockType.QUALITY_SIGNAL),

    Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
    Meta(IndexBlockType.PAGE_DATA, 6, 7),
    Artifacts(IndexBlockType.PAGE_DATA),

    Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
    Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
    Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),

    Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
    Words_2(IndexBlockType.PAGE_DATA,11, 3.5),
    Words_4(IndexBlockType.PAGE_DATA,12, 4.0),
    Words_8(IndexBlockType.PAGE_DATA,13, 4.5),
    Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0),

    Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
    Tfidf_High(IndexBlockType.TRANSIENT),
    Subjects(IndexBlockType.TRANSIENT)
    ;

    public final IndexBlockType type;
    public final int id;
    public final double sortOrder;

    IndexBlock(IndexBlockType type, int id, double sortOrder) {
    IndexBlock(IndexBlockType type) {
        this.type = type;
        this.sortOrder = sortOrder;
        this.id = id;
    }

    // This is kind of a hot method, and Enum.values() allocates a new
    // array each call.
    private static final IndexBlock[] values = IndexBlock.values();
    public static IndexBlock byId(int id) {
        for (IndexBlock block : values()) {
            if (id == block.id) {
                return block;
            }
        }
        throw new IllegalArgumentException("Bad block id");
        return values[id];
    }
}

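With the explicit id and sortOrder fields gone, a block's id collapses to its ordinal, and byId becomes an O(1) lookup into the cached values array instead of a linear scan that could throw. In effect:

// Illustrative: byId(n) is now equivalent to a cached values()[n] lookup.
IndexBlock b = IndexBlock.byId(IndexBlock.Meta.ordinal());
assert b == IndexBlock.Meta;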
@@ -1,7 +1,10 @@
package nu.marginalia.wmsa.edge.index.model;

public enum IndexBlockType {
    /** This block is only used for joins */
    QUALITY_SIGNAL,
    TF_IDF,
    PAGE_DATA
    /** This block contains page keywords */
    PAGE_DATA,
    /** This block is only used for generation */
    TRANSIENT
}
@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.index.reader;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.btree.BTreeReader;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -17,7 +16,6 @@ import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wo
|
||||
public class IndexWordsTable implements AutoCloseable {
|
||||
protected final MultimapFileLong words;
|
||||
protected final BTreeReader reader;
|
||||
protected final BTreeHeader header;
|
||||
protected final int HEADER_OFFSET = 1;
|
||||
final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@ -26,8 +24,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
public IndexWordsTable(MultimapFileLong words) {
|
||||
this.words = words;
|
||||
|
||||
reader = new BTreeReader(words, wordsBTreeContext);
|
||||
header = reader.getHeader(HEADER_OFFSET);
|
||||
reader = new BTreeReader(words, wordsBTreeContext, HEADER_OFFSET);
|
||||
|
||||
madvise();
|
||||
}
|
||||
@ -49,7 +46,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
}
|
||||
|
||||
public long positionForWord(int wordId) {
|
||||
long offset = reader.findEntry(header, wordId);
|
||||
long offset = reader.findEntry(wordId);
|
||||
|
||||
if (offset < 0) {
|
||||
return -1L;
|
||||
@ -60,7 +57,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
|
||||
public int wordLength(int wordId) {
|
||||
|
||||
long offset = reader.findEntry(header, wordId);
|
||||
long offset = reader.findEntry(wordId);
|
||||
if (offset < 0) {
|
||||
return -1;
|
||||
}
|
||||
@ -72,7 +69,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
words.advice(NativeIO.Advice.Random);
|
||||
words.advice0(NativeIO.Advice.WillNeed);
|
||||
|
||||
var h = reader.getHeader(HEADER_OFFSET);
|
||||
var h = reader.getHeader();
|
||||
int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs());
|
||||
|
||||
words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
|
||||
@ -80,8 +77,8 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
}
|
||||
|
||||
public void forEachWordsOffset(LongConsumer offsetConsumer) {
|
||||
int n = header.numEntries();
|
||||
long offset = header.dataOffsetLongs();
|
||||
int n = reader.numEntries();
|
||||
long offset = reader.getHeader().dataOffsetLongs();
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
try {
|
||||
|
@ -5,21 +5,13 @@ import com.google.inject.name.Named;
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import nu.marginalia.util.btree.BTreeReader;
|
||||
import nu.marginalia.util.btree.CachingBTreeReader;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class SearchIndex implements AutoCloseable {

@ -27,8 +19,6 @@ public class SearchIndex implements AutoCloseable {
private final IndexWordsTable words;
public final String name;
private final RandomAccessFile wordsFile;
private final BTreeReader bTreeReader;
private final CachingBTreeReader cachingBTreeReader;

private final Logger logger;

@ -49,16 +39,13 @@ public class SearchIndex implements AutoCloseable {
urls = MultimapFileLong.forReading(inUrls.toPath());
words = IndexWordsTable.ofFile(wordsFile);

bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
cachingBTreeReader = new CachingBTreeReader(urls, SearchIndexConverter.urlsBTreeContext);

Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader));
Schedulers.io().scheduleDirect(() -> madvise(urls));
}

private void madvise(MultimapFileLong urls, BTreeReader reader) {
private void madvise(MultimapFileLong urls) {

words.forEachWordsOffset(offset -> {
var h = reader.getHeader(offset);
var h = BTreeReader.createHeader(urls, offset);
long length = h.dataOffsetLongs() - h.indexOffsetLongs();

urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512);
@ -70,174 +57,16 @@ public class SearchIndex implements AutoCloseable {
}


public long numUrls(IndexQueryCachePool pool, int wordId) {
public long numUrls(int wordId) {
int length = words.wordLength(wordId);
if (length < 0) return 0;
if (length > 0) return length;

return rangeForWord(pool, wordId).numEntries();
}

public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
IndexBTreeRange range = pool.getRange(words, wordId);

if (range == null) {
range = new IndexBTreeRange(words.positionForWord(wordId));
pool.cacheRange(words, wordId, range);
}

return range;
}

public IndexBTreeRange rangeForWord(int wordId) {
return new IndexBTreeRange(words.positionForWord(wordId));
}

public class IndexBTreeRange {
public final long dataOffset;
private BTreeHeader header;
public IndexBTreeRange(long dataOffset) {
this.dataOffset = dataOffset;
}

public LongStream stream(int bufferSize) {
if (dataOffset < 0) {
return LongStream.empty();
}
if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}

long urlOffset = header.dataOffsetLongs();
long endOffset = header.dataOffsetLongs() + header.numEntries();
int stepSize = Math.min(bufferSize, header.numEntries());

long[] buffer = new long[stepSize];

return LongStream
.iterate(urlOffset, i -> i< endOffset, i->i+stepSize)
.flatMap(pos -> {
int sz = (int)(Math.min(pos+stepSize, endOffset) - pos);
urls.read(buffer, sz, pos);
return Arrays.stream(buffer, 0, sz);
});
}

public EntrySource asEntrySource() {
return new AsEntrySource();
}

public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
return new AsExcludeQueryFilterStep(pool);
}


public LongStream stream() {
return stream(1024);
}

public boolean isPresent() {
return dataOffset >= 0;
}

public long numEntries() {
if (header != null) {
return header.numEntries();
}
else if (dataOffset < 0) return 0L;
else {
header = bTreeReader.getHeader(dataOffset);
return header.numEntries();
}
}

public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
if (dataOffset < 0) return false;

return cachingBTreeReader.findEntry(cache, url) >= 0;
}

public boolean hasUrl(IndexQueryCachePool pool, long url) {
if (dataOffset < 0)
return false;

CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);

return cachingBTreeReader.findEntry(cache, url) >= 0;
}

public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
if (dataOffset < 0)
return null;

if (header == null) {
header = cachingBTreeReader.getHeader(dataOffset);
}

return cachingBTreeReader.prepareCache(header);
}

class AsEntrySource implements EntrySource {
long pos;
final long endOffset;

public SearchIndex getIndex() {
return SearchIndex.this;
};

public AsEntrySource() {
if (dataOffset <= 0) {
pos = -1;
endOffset = -1;
return;
}

if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}

pos = header.dataOffsetLongs();
endOffset = header.dataOffsetLongs() + header.numEntries();
}


@Override
public int read(long[] buffer, int n) {
if (pos >= endOffset) {
return 0;
}

int rb = Math.min(n, (int)(endOffset - pos));
urls.read(buffer, rb, pos);
pos += rb;
return rb;
}
}

class AsExcludeQueryFilterStep implements QueryFilterStepIf {
private final CachingBTreeReader.BTreeCachedIndex cache;

public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
}

public SearchIndex getIndex() {
return SearchIndex.this;
};
public double cost() {
return cache.getIndexedDataSize();
}

@Override
public boolean test(long value) {
return !hasUrl(cache, value);
}

public String describe() {
return "Exclude["+name+"]";
}
return rangeForWord(wordId).numEntries();
}

public SearchIndexURLRange rangeForWord(int wordId) {
return new SearchIndexURLRange(urls, words.positionForWord(wordId));
}

@Override
@ -5,7 +5,6 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -22,31 +21,14 @@ public class SearchIndexReader implements AutoCloseable {
private final IndexDomainQueryFactory domainQueryFactory;
private final Logger logger = LoggerFactory.getLogger(getClass());

private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
IndexBlock.Title,
IndexBlock.Tfidf_Top,
IndexBlock.Tfidf_Middle,
IndexBlock.Tfidf_Lower,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
IndexBlock.Words_16Plus,
};

@Inject
public SearchIndexReader(
EnumMap<IndexBlock, SearchIndex> indices) {
this.indices = indices;

var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
var midIndex = indices.get(IndexBlock.Tfidf_Middle);
var topIndex = indices.get(IndexBlock.Tfidf_Top);
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var siteIndex = indices.get(IndexBlock.Site);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Subjects);

var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
@ -57,7 +39,7 @@ public class SearchIndexReader implements AutoCloseable {

queryBuilders = new EnumMap<>(IndexBlock.class);

List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, words1, words2, words4, words8, words16);

queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices));
@ -66,7 +48,7 @@ public class SearchIndexReader implements AutoCloseable {
queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices));

domainQueryFactory = new IndexDomainQueryFactory(siteIndex, listOfNonNulls(topicIndex));
domainQueryFactory = new IndexDomainQueryFactory(indices.get(IndexBlock.Words_1));
}

@SafeVarargs
@ -75,17 +57,31 @@ public class SearchIndexReader implements AutoCloseable {
}


public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
public IndexQueryFactory.IndexQueryBuilder findWord(IndexBlock block, Integer quality, int wordId) {
var builder = queryBuilders.get(block);

if (builder == null)
return null;

return builder.buildQuery(cachePool, wordId);
if (quality == null) {
return builder.buildQuery(wordId);
}
else {
return builder.buildQuery(quality, wordId);
}
}

public IndexQuery findDomain(IndexQueryCachePool cachePool, int wordId) {
return domainQueryFactory.buildQuery(cachePool, wordId);
public IndexQueryFactory.IndexQueryBuilder findWordForDomainList(IndexBlock block, List<Integer> domains, int wordId) {
var builder = queryBuilders.get(block);

if (builder == null)
return null;

return builder.buildQuery(domains, wordId);
}

public IndexQuery findDomain(int wordId) {
return domainQueryFactory.buildQuery(wordId);
}

@Override
@ -96,7 +92,7 @@ public class SearchIndexReader implements AutoCloseable {
}

@SneakyThrows
public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
public long numHits(IndexBlock block, int word) {
IndexQueryFactory builder = queryBuilders.get(block);

if (builder == null)
@ -104,31 +100,18 @@ public class SearchIndexReader implements AutoCloseable {

long hits = 0;
for (var index : builder.getIndicies()) {
hits += index.numUrls(pool, word);
hits += index.numUrls(word);
}
return hits;
}

public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
for (var block : indicesBySearchOrder) {
var index = indices.get(block);

if (null == index) {
continue;
}

if (cachePool.isUrlPresent(index, searchTerm, urlId))
return block;

}

return IndexBlock.Words_16Plus;
}

public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
final var index = indices.get(block);
if (null == index) return false;
if (null == index) {
return new long[ids.length];
}

return cachePool.isUrlPresent(index, searchTerm, urlId);
return indices.get(block).rangeForWord(termId).getMetadata(ids);
}
}
@ -0,0 +1,100 @@
package nu.marginalia.wmsa.edge.index.reader;

import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.btree.BTreeReader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromBTree;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromMapRange;

import javax.annotation.Nullable;

import static nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags.*;

public class SearchIndexURLRange {
public final long dataOffset;
private final MultimapFileLong urlsFile;

@Nullable
private final BTreeReader reader;

public SearchIndexURLRange(MultimapFileLong urlsFile, long dataOffset) {
this.dataOffset = dataOffset;
this.urlsFile = urlsFile;

if (dataOffset >= 0) {
this.reader = new BTreeReader(urlsFile, SearchIndexConverter.urlsBTreeContext, dataOffset);
} else {
this.reader = null;
}
}

public EntrySource asPrefixSource(long prefix, long prefixNext) {
if (reader == null)
return new EmptyEntrySource();

LongLongImmutablePair startAndEnd = reader.getRangeForPrefix(prefix, prefixNext);

if (startAndEnd.firstLong() == startAndEnd.secondLong()) {
return new EmptyEntrySource();
}

return new EntrySourceFromMapRange(urlsFile, startAndEnd.firstLong(), startAndEnd.secondLong());
}

public EntrySource asEntrySource() {
return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, null);
}
public EntrySource asQualityLimitingEntrySource(int limit) {
return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, limit);
}
public EntrySource asDomainEntrySource() {
return new EntrySourceFromBTree(reader, Subjects.asBit() | Site.asBit() | Title.asBit(), null);
}

public boolean isPresent() {
return dataOffset >= 0;
}

public long numEntries() {
if (reader == null)
return 0L;

return reader.numEntries();
}

public void retainUrls(BTreeQueryBuffer buffer) {
if (reader != null)
reader.retainEntries(buffer);
}

public void rejectUrls(BTreeQueryBuffer buffer) {
if (reader != null)
reader.rejectEntries(buffer);
}

public boolean hasUrl(long url) {
if (reader == null)
return false;

return reader.findEntry(url) >= 0;
}


public long[] getMetadata(long[] urls) {
if (reader == null) {
return new long[urls.length];
}

return reader.queryData(urls, 1);
}

@Override
public String toString() {
return String.format("BTreeRange(@" + dataOffset + ", size = " + numEntries() + ")");
}

}
@ -0,0 +1,111 @@
package nu.marginalia.wmsa.edge.index.svc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Histogram;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.util.OptionalInt;

import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static spark.Spark.halt;

@Singleton
public class EdgeIndexDomainQueryService {

private final Logger logger = LoggerFactory.getLogger(getClass());

private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

private final Gson gson = GsonFactory.get();

private final SearchIndexes indexes;

@Inject
public EdgeIndexDomainQueryService(SearchIndexes indexes) {
this.indexes = indexes;
}

public Object searchDomain(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}

String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

try {
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
}

public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {

final OptionalInt wordId = lookUpWord(specsSet.keyword);
final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();

final IndexSearchBudget budget = new IndexSearchBudget(50);

if (wordId.isEmpty()) {
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

BTreeQueryBuffer buffer = new BTreeQueryBuffer(512);

for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {

final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);
var query = indexes.getBucket(bucket).getDomainQuery(wordId.getAsInt(), localFilter);

while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
query.getMoreResults(buffer);

for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) {
long result = buffer.data[i];
if (localFilter.test(result)) {
urlIds.add((int) (result & 0xFFFF_FFFFL));
}
}
}
}

return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);
}

}
@ -5,6 +5,7 @@ import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
@ -21,7 +22,6 @@ import spark.Request;
import spark.Response;

import java.util.Arrays;
import java.util.List;

@Singleton
public class EdgeIndexLexiconService {
@ -35,6 +35,11 @@ public class EdgeIndexLexiconService {
this.keywordLexicon = servicesFactory.getKeywordLexicon();
}

public EdgeIndexLexiconService(SearchIndexes indexes, KeywordLexicon lexicon) {
this.indexes = indexes;
this.keywordLexicon = lexicon;
}

public Object getWordId(Request request, Response response) {
final String word = request.splat()[0];

@ -73,31 +78,37 @@ public class EdgeIndexLexiconService {
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);

SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
IndexBlock block = IndexBlock.values()[words.getIndex()];

for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
var wordArray = words.getWordsList().toArray(String[]::new);
var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray();

var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
DocumentKeywords documentKeywords = new DocumentKeywords(block, wordArray, metaArray);
for (var chunk : ListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

indexWriter.put(header, entry);
}
}

private long[] getOrInsertWordIds(List<String> words) {
long[] ids = new long[words.size()];
private long[] getOrInsertWordIds(String[] words, long[] meta) {
long[] ids = new long[words.length*2];
int putIdx = 0;

for (String word : words) {
for (int i = 0; i < words.length; i++) {
String word = words[i];

long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
ids[putIdx++] = meta[i];
}
}

if (putIdx != words.size()) {
if (putIdx != words.length*2) {
ids = Arrays.copyOf(ids, putIdx);
}
return ids;
@ -7,22 +7,23 @@ import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongAVLTreeSet;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,7 +37,6 @@ import java.util.function.LongPredicate;
import java.util.stream.Collectors;

import static java.util.Comparator.comparing;
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static spark.Spark.halt;

@Singleton
@ -50,7 +50,6 @@ public class EdgeIndexQueryService {

private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register();
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();
private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

private final Gson gson = GsonFactory.get();

@ -61,30 +60,6 @@ public class EdgeIndexQueryService {
this.indexes = indexes;
}

public Object searchDomain(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}

String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

try {
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
}

public Object search(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
@ -94,6 +69,7 @@ public class EdgeIndexQueryService {
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);


try {
return wmsa_edge_index_query_time.time(() -> query(specsSet));
}
@ -117,51 +93,20 @@ public class EdgeIndexQueryService {

wmsa_edge_index_query_cost.set(searchQuery.getDataCost());

if (!searchQuery.hasTimeLeft()) {
wmsa_edge_index_query_timeouts.inc();
}

return new EdgeSearchResultSet(results);
}

public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {

final OptionalInt wordId = lookUpWord(specsSet.keyword);

final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();

final IndexQueryCachePool pool = new IndexQueryCachePool();
final IndexSearchBudget budget = new IndexSearchBudget(50);

if (wordId.isEmpty()) {

return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {

final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);

var query = indexes.getBucket(bucket).getDomainQuery(pool, wordId.getAsInt(), localFilter);
long[] buffer = new long[512];

while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
int cnt = query.getMoreResults(buffer, budget);
for (int i = 0; i < cnt && urlIds.size() < specsSet.maxResults; i++) {
long result = buffer[i];
if (localFilter.test(result)) {
urlIds.add((int) (result & 0xFFFF_FFFFL));
}
}
}
}

return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

private class SearchQuery {
private final int fetchSize;
private final TIntHashSet seenResults;
private final EdgeSearchSpecification specsSet;
private final IndexSearchBudget budget;
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();

private final Integer qualityLimit;
private final Integer rankLimit;
private long dataCost = 0;

public SearchQuery(EdgeSearchSpecification specsSet) {
@ -169,6 +114,8 @@ public class EdgeIndexQueryService {
this.budget = new IndexSearchBudget(specsSet.timeoutMs);
this.fetchSize = specsSet.fetchSize;
this.seenResults = new TIntHashSet(fetchSize, 0.5f);
this.qualityLimit = specsSet.quality;
this.rankLimit = specsSet.rank;
}

private List<EdgeSearchResultItem> execute() {
@ -178,22 +125,18 @@ public class EdgeIndexQueryService {
results.addAll(performSearch(sq));
}


final SearchTermEvaluator evaluator = new SearchTermEvaluator(specsSet, results);
for (var result : results) {
addResultScores(result);
evaluator.addResultScores(result);
}

if (!budget.hasTimeLeft()) {
wmsa_edge_index_query_timeouts.inc();
return createResultList(results);
}

private List<EdgeSearchResultItem> createResultList(Set<EdgeSearchResultItem> results) {

var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);

if (WmsaHome.isDebug()) {
cachePool.printSummary(logger);
}
cachePool.clear();

List<EdgeSearchResultItem> resultList = results.stream()
.sorted(
comparing(EdgeSearchResultItem::getScore)
@ -204,6 +147,9 @@ public class EdgeIndexQueryService {
.collect(Collectors.toList());

if (resultList.size() > specsSet.getLimitTotal()) {
// This can't be made a stream limit() operation because we need domainCountFilter
// to run over the entire list to provide accurate statistics

resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
}

@ -219,16 +165,20 @@ public class EdgeIndexQueryService {
{

final List<EdgeSearchResultItem> results = new ArrayList<>(fetchSize);
final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
final SearchTerms searchTerms = getSearchTerms(sq);

if (searchTerms.isEmpty())
if (searchTerms.isEmpty()) {
return Collections.emptyList();
}

final BTreeQueryBuffer buffer = new BTreeQueryBuffer(fetchSize);

for (int indexBucket : specsSet.buckets) {
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);

if (!budget.hasTimeLeft()) {
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}",
indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
continue;

}
@ -237,20 +187,22 @@ public class EdgeIndexQueryService {
break;
}

IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
long[] buf = new long[fetchSize];
IndexQueryParams queryParams = new IndexQueryParams(sq.block, searchTerms, qualityLimit, rankLimit, specsSet.domains);

IndexQuery query = getQuery(indexBucket, localFilter::filterRawValue, queryParams);

while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
int cnt = query.getMoreResults(buf, budget);
buffer.reset();
query.getMoreResults(buffer);

for (int i = 0; i < cnt && results.size() < fetchSize; i++) {
final long id = buf[i];
for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) {
final long id = buffer.data[i];

if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
continue;
}

results.add(new EdgeSearchResultItem(indexBucket, id));
results.add(new EdgeSearchResultItem(indexBucket, sq.block, id));
}
}

@ -261,40 +213,127 @@ public class EdgeIndexQueryService {
return results;
}

private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
private IndexQuery getQuery(int bucket, LongPredicate filter, IndexQueryParams params) {

if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return new IndexQuery(Collections.emptyList());
}

return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
return indexes.getBucket(bucket).getQuery(filter, params);
}

private void addResultScores(EdgeSearchResultItem searchResult) {
public boolean hasTimeLeft() {
return budget.hasTimeLeft();
}

private record IndexAndBucket(IndexBlock block, int bucket) {}

public long getDataCost() {
return dataCost;
}

record ResultTerm (int bucket, int termId, long combinedUrlId) {}
}

public class SearchTermEvaluator {
private static final EdgePageWordMetadata blankMetadata = new EdgePageWordMetadata(EdgePageWordMetadata.emptyValue());

private final Map<SearchQuery.ResultTerm, EdgePageWordMetadata> termData = new HashMap<>(16);

private final List<List<String>> searchTermVariants;

public SearchTermEvaluator(EdgeSearchSpecification specsSet, Set<EdgeSearchResultItem> results) {
this.searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();

final int[] termIdsAll = getIncludeTermIds(specsSet);

Map<SearchQuery.IndexAndBucket, LongAVLTreeSet> resultIdsByBucket = new HashMap<>(7);

for (int termId : termIdsAll) {

for (var result: results) {
resultIdsByBucket
.computeIfAbsent(new SearchQuery.IndexAndBucket(result.block, result.bucketId),
id -> new LongAVLTreeSet())
.add(result.combinedId);
}

resultIdsByBucket.forEach((indexAndBucket, resultIds) ->
loadMetadata(termId, indexAndBucket.bucket, indexAndBucket.block, resultIds));

resultIdsByBucket.clear();
}
}

private int[] getIncludeTermIds(EdgeSearchSpecification specsSet) {

final var reader = Objects.requireNonNull(indexes.getLexiconReader());

List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
final List<String> terms = specsSet.allIncludeSearchTerms();
final IntList ret = new IntArrayList(terms.size());

// Memoize calls to getTermData, as they're somewhat expensive and highly redundant
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
for (var term : terms) {
int id = reader.get(term);

if (id >= 0)
ret.add(id);
}

return ret.toIntArray();
}

private void loadMetadata(int termId, int bucket, IndexBlock indexBlock,
LongAVLTreeSet docIdsMissingMetadata)
{
EdgeIndexBucket index = indexes.getBucket(bucket);

if (docIdsMissingMetadata.isEmpty())
return;


long[] ids = docIdsMissingMetadata.toLongArray();
long[] metadata = index.getMetadata(indexBlock, termId, ids);

for (int i = 0; i < metadata.length; i++) {
if (metadata[i] == 0L)
continue;

termData.put(
new SearchQuery.ResultTerm(bucket, termId, ids[i]),
new EdgePageWordMetadata(metadata[i])
);

docIdsMissingMetadata.remove(ids[i]);
}
}

public void addResultScores(EdgeSearchResultItem searchResult) {
final var reader = Objects.requireNonNull(indexes.getLexiconReader());

double bestScore = 0;

for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
double setScore = 0;
int setSize = 0;
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
var termList = searchTermVariants.get(searchTermListIdx);

for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
String searchTerm = termList.get(termIdx);

final int termId = reader.get(searchTerm);

ResultTermData data = termMetadata.computeIfAbsent(
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
var key = new SearchQuery.ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId());
var metadata = termData.getOrDefault(key, blankMetadata);

EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(searchTermListIdx, searchTerm, metadata);

var score = data.asScore(searchTermListIdx, searchTerm);
searchResult.scores.add(score);
setScore += score.value();
setScore += score.termValue();
if (termIdx == 0) {
setScore += score.documentValue();
}

setSize++;
}
bestScore = Math.min(bestScore, setScore/setSize);
@ -303,64 +342,27 @@ public class EdgeIndexQueryService {
searchResult.setScore(bestScore);
}

private ResultTermData getTermData(ResultTerm resultTerm) {
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
final int termId = resultTerm.termId;
final long combinedUrlId = resultTerm.combinedUrlId;

return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
);
}

public long getDataCost() {
return dataCost;
}

record ResultTerm (int bucket, int termId, long combinedUrlId) {}
record ResultTermData (IndexBlock index,
boolean title,
boolean link,
boolean site,
boolean subject,
boolean name,
boolean high,
boolean mid,
boolean low
) {
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
}
}
}


private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
final List<Integer> excludes = new ArrayList<>();
final List<Integer> includes = new ArrayList<>();
private SearchTerms getSearchTerms(EdgeSearchSubquery request) {
final IntList excludes = new IntArrayList();
final IntList includes = new IntArrayList();

for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
return new SearchTerms();
}
includes.add(word.getAsInt());
}


for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + advice);
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
return new SearchTerms();
}
includes.add(word.getAsInt());
}
@ -369,7 +371,26 @@ public class EdgeIndexQueryService {
lookUpWord(exclude).ifPresent(excludes::add);
}

return new EdgeIndexSearchTerms(includes, excludes);
return new SearchTerms(includes, excludes);
}

public record SearchTerms(IntList includes, IntList excludes) {
public SearchTerms() {
this(IntList.of(), IntList.of());
}

public boolean isEmpty() {
return includes.isEmpty();
}

public int[] sortedDistinctIncludes(IntComparator comparator) {
if (includes.isEmpty())
return includes.toIntArray();

IntList list = new IntArrayList(new IntOpenHashSet(includes));
list.sort(comparator);
return list.toIntArray();
}
}
Some files were not shown because too many files have changed in this diff