October Release (#118)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/118
Viktor Lofgren 2022-10-19 15:00:04 +02:00
parent 9a7d052c43
commit df49ccbe59
186 changed files with 7472 additions and 2243 deletions

View File

@@ -175,7 +175,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+        // System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage"));
     }
@@ -249,7 +249,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/search?query=browse:wikipedia.local");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+        // System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
     }
@@ -259,7 +259,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/search?query=define:adiabatic");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+        // System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
     }
@@ -269,7 +269,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/search?query=3%2B3");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+        // System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("eval"));
     }

View File

@@ -0,0 +1,313 @@
package nu.marginalia;
import nu.marginalia.util.AndCardIntSet;
import org.openjdk.jmh.annotations.*;
import org.roaringbitmap.RoaringBitmap;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
public class BitSetTest {
@org.openjdk.jmh.annotations.State(Scope.Benchmark)
public static class State {
List<RoaringBitmap> roar = new ArrayList<>();
List<AndCardIntSet> acbs = new ArrayList<>();
List<RoaringBitmap> roarLow = new ArrayList<>();
List<RoaringBitmap> roarHigh = new ArrayList<>();
List<AndCardIntSet> acbsLow = new ArrayList<>();
List<AndCardIntSet> acbsHigh = new ArrayList<>();
@Setup(Level.Trial)
public void setUp() {
var rand = new Random();
for (int i = 0; i < 100; i++) {
int card = 1 + rand.nextInt(10);
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
}
acbsLow.add(cbs);
roarLow.add(rb);
}
for (int i = 0; i < 10; i++) {
int card = 1 + rand.nextInt(10000, 20000);
var rb = new RoaringBitmap();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
}
acbsHigh.add(AndCardIntSet.of(rb));
roarHigh.add(rb);
}
for (int i = 0; i < 100000; i++) {
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
acbs.add(cbs);
roar.add(rb);
}
for (int i = 0; i < 10000; i++) {
int card = 1 + rand.nextInt(10);
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
}
acbs.add(cbs);
roar.add(rb);
}
for (int i = 0; i < 1000; i++) {
int card = 1 + rand.nextInt(100);
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
}
acbs.add(cbs);
roar.add(rb);
}
for (int i = 0; i < 100; i++) {
int card = 1 + rand.nextInt(1000);
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
}
acbs.add(cbs);
roar.add(rb);
}
for (int i = 0; i < 100; i++) {
int card = 1 + rand.nextInt(10000);
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
}
acbs.add(cbs);
roar.add(rb);
}
for (int i = 0; i < 2; i++) {
int card = 1 + rand.nextInt(100000);
var rb = new RoaringBitmap();
var cbs = new AndCardIntSet();
for (int j = 0; j < card; j++) {
int val = rand.nextInt(1_000_000);
rb.add(val);
cbs.add(val);
}
acbs.add(cbs);
roar.add(rb);
}
Collections.shuffle(acbs);
Collections.shuffle(roar);
}
}
//
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// @Fork(value = 5, warmups = 5)
// public Object roaringCard(State state) {
// long val = 0;
//
// for (int i = 0; i < state.roar.size(); i++) {
// for (int j = i+1; j < state.roar.size(); j++) {
// val += RoaringBitmap.andCardinality(state.roar.get(i), state.roar.get(j));
// }
// }
//
// return val;
// }
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// @Fork(value = 2, warmups = 2)
// public Object roaringCardNorm(State state) {
// long val = 0;
//
// for (int i = 0; i < state.roar.size()/1000; i++) {
// for (int j = i+1; j < state.roar.size(); j++) {
//
// var a = state.roar.get(i);
// var b = state.roar.get(j);
// val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
// }
// }
//
// return val;
// }
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// @Fork(value = 5, warmups = 5)
// public Object cbsCard(State state) {
// long val = 0;
//
// for (int i = 0; i < state.roar.size(); i++) {
// for (int j = i+1; j < state.roar.size(); j++) {
// val += AndCardIntSet.andCardinality(state.acbs.get(i), state.acbs.get(j));
// }
// }
//
// return val;
// }
//
// @Benchmark
// @BenchmarkMode(Mode.Throughput)
// @Fork(value = 1, warmups = 1)
// public Object cbsCardNorm(State state) {
// double val = 0;
//
// for (int i = 0; i < state.roar.size()/1000; i++) {
// for (int j = i+1; j < state.roar.size(); j++) {
// var a = state.acbs.get(i);
// var b = state.acbs.get(j);
// val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.cardinality()*b.cardinality()));
// }
// }
//
// return val;
// }
@Benchmark
@BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
public Object cbsLowLow(State state) {
double val = 0;
for (int i = 0; i < state.acbsLow.size(); i++) {
for (int j = 0; j < state.acbsLow.size(); j++) {
var a = state.acbsLow.get(i);
var b = state.acbsLow.get(j);
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
}
}
return val;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
public Object cbsHighHigh(State state) {
double val = 0;
for (int i = 0; i < state.acbsHigh.size(); i++) {
for (int j = 0; j < state.acbsHigh.size(); j++) {
var a = state.acbsHigh.get(i);
var b = state.acbsHigh.get(j);
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
}
}
return val;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
public Object cbsHighLow(State state) {
double val = 0;
for (int i = 0; i < state.acbsHigh.size(); i++) {
for (int j = 0; j < state.acbsLow.size(); j++) {
var a = state.acbsHigh.get(i);
var b = state.acbsLow.get(j);
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
}
}
return val;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
public Object roarLowLow(State state) {
double val = 0;
for (int i = 0; i < state.roarLow.size(); i++) {
for (int j = 0; j < state.roarLow.size(); j++) {
var a = state.roarLow.get(i);
var b = state.roarLow.get(j);
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
}
}
return val;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
public Object roarHighLow(State state) {
double val = 0;
for (int i = 0; i < state.roarHigh.size(); i++) {
for (int j = 0; j < state.roarLow.size(); j++) {
var a = state.roarHigh.get(i);
var b = state.roarLow.get(j);
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
}
}
return val;
}
@Benchmark
@BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
public Object roarHighHigh(State state) {
double val = 0;
for (int i = 0; i < state.roarHigh.size(); i++) {
for (int j = 0; j < state.roarHigh.size(); j++) {
var a = state.roarHigh.get(i);
var b = state.roarHigh.get(j);
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
}
}
return val;
}
}
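
The benchmark class above ships without a runner in this diff. A minimal sketch of how it might be launched (standard org.openjdk.jmh.runner API; the runner class itself is not part of the commit):

package nu.marginalia;

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;

public class BitSetBenchmarkRunner {
    public static void main(String[] args) throws RunnerException {
        // Select all @Benchmark methods in BitSetTest above
        Options opt = new OptionsBuilder()
                .include(BitSetTest.class.getSimpleName())
                .build();

        new Runner(opt).run();
    }
}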

View File

@@ -1,85 +0,0 @@
package nu.marginalia;
import lombok.SneakyThrows;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.openjdk.jmh.annotations.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
public class ByteBufferBlockReadVsIndividualRead {
@State(Scope.Benchmark)
public static class ByteBufferState {
private MultimapFileLong mmf;
private Path file;
private static final int size = 800*1024*1024;
@Setup(Level.Iteration)
@SneakyThrows
public void setUp() {
file = Files.createTempFile("jmh", ".dat");
mmf = MultimapFileLong.forOutput(file, size);
for (int i = 0; i < size; i++) {
mmf.put(i, i);
}
}
@TearDown(Level.Iteration)
@SneakyThrows
public void tearDown() {
mmf.close();
Files.delete(file);
}
LongStream basicStream() {
return IntStream.range(0, size).mapToLong(mmf::get);
}
LongStream blockStream(int blockSize) {
long urlOffset = 0;
long endOffset = size;
long[] arry = new long[blockSize];
return LongStream
.iterate(urlOffset, i -> i< endOffset, i->i+blockSize)
.flatMap(pos -> {
int sz = (int)(Math.min(pos+blockSize, endOffset) - pos);
mmf.read(arry, sz, pos);
return Arrays.stream(arry, 0, sz);
});
}
}
// @Benchmark @BenchmarkMode(Mode.Throughput)
// @Fork(value = 1, warmups = 1)
// @Warmup(iterations = 1)
public long testBasic(ByteBufferState state) {
return state.basicStream().sum();
}
@Benchmark @BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 0)
public long testBlock128(ByteBufferState state) {
return state.blockStream(128).sum();
}
@Benchmark @BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 0)
public long testBlock1024(ByteBufferState state) {
return state.blockStream(1024).sum();
}
@Benchmark @BenchmarkMode(Mode.Throughput)
@Fork(value = 1, warmups = 1)
@Warmup(iterations = 0)
public long testBlock8192(ByteBufferState state) {
return state.blockStream(8192).sum();
}
}

View File

@@ -0,0 +1,205 @@
package nu.marginalia.util;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.set.hash.TIntHashSet;
import org.roaringbitmap.RoaringBitmap;
public class AndCardIntSet {
final TIntArrayList backingList;
long hash;
public AndCardIntSet() {
backingList = new TIntArrayList(16);
backingList.sort();
}
public static AndCardIntSet of(int... list) {
var set = new TIntHashSet(list);
TIntArrayList lst = new TIntArrayList(set);
lst.sort();
return new AndCardIntSet(lst);
}
public static AndCardIntSet of(RoaringBitmap bmap) {
TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
lst.addAll(bmap.toArray());
return new AndCardIntSet(lst);
}
private AndCardIntSet(TIntArrayList list) {
backingList = list;
hash = 0;
if (list.size() < 128) {
for (int v : list.toArray()) {
int bit = hasher.hashInt(v).asInt() % 64;
hash |= (1L << bit);
}
}
else {
hash = ~0L;
}
}
private static final HashFunction hasher = Hashing.murmur3_128(0);
public boolean add(int val) {
if (contains(val)) {
return false;
}
if (backingList.size() < 128) {
int bit = hasher.hashInt(val).asInt() % 64;
hash |= (1L << bit);
}
else {
hash = ~0L;
}
backingList.add(val);
backingList.sort();
return true;
}
public boolean contains(int val) {
return backingList.binarySearch(val) >= 0;
}
public int getCardinality() {
return backingList.size();
}
public static int andCardinality(AndCardIntSet a, AndCardIntSet b) {
if (!testHash(a,b)) {
return 0;
}
if (a.getCardinality() + b.getCardinality() < 10) {
return andLinearSmall(a, b);
}
return andLinear(a,b);
}
private static int andLinearSmall(AndCardIntSet a, AndCardIntSet b) {
int sum = 0;
for (int i = 0; i < a.getCardinality(); i++) {
for (int j = 0; j < b.getCardinality(); j++) {
if (a.backingList.getQuick(i) == b.backingList.getQuick(j))
sum++;
}
}
return sum;
}
private static int andLinear(AndCardIntSet a, AndCardIntSet b) {
int i = 0, j = 0;
int card = 0;
do {
int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
if (diff < 0) i++;
else if (diff > 0) j++;
else {
i++;
j++;
card++;
}
} while (i < a.getCardinality() && j < b.getCardinality());
return card;
}
private static boolean testHash(AndCardIntSet a, AndCardIntSet b) {
return (a.hash & b.hash) != 0;
}
public boolean cardinalityExceeds(int val) {
return getCardinality() >= val;
}
public static AndCardIntSet and(AndCardIntSet a, AndCardIntSet b) {
int i = 0;
int j = 0;
TIntArrayList andVals = new TIntArrayList(1 + (int)Math.sqrt(a.getCardinality()));
while (i < a.getCardinality() && j < b.getCardinality()) {
int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
if (diff < 0) i++;
else if (diff > 0) j++;
else {
andVals.add(a.backingList.getQuick(i));
i++;
j++;
}
}
return new AndCardIntSet(andVals);
}
public static double weightedProduct(float[] weights, AndCardIntSet a, AndCardIntSet b) {
int i = 0;
int j = 0;
double sum = 0;
if (a.getCardinality() + b.getCardinality() < 10) {
return weightedProductSmall(weights, a, b);
}
do {
int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
if (diff < 0) i++;
else if (diff > 0) j++;
else {
sum += weights[a.backingList.getQuick(i)];
i++;
j++;
}
} while (i < a.getCardinality() && j < b.getCardinality());
return sum;
}
private static double weightedProductSmall(float[] weights, AndCardIntSet a, AndCardIntSet b) {
double sum = 0;
for (int i = 0; i < a.getCardinality(); i++) {
for (int j = 0; j < b.getCardinality(); j++) {
int av = a.backingList.getQuick(i);
int bv = b.backingList.getQuick(j);
if (av == bv)
sum+=weights[av];
}
}
return sum;
}
public double mulAndSum(float[] weights) {
double sum = 0;
for (int i = 0; i < backingList.size(); i++) {
sum += weights[backingList.getQuick(i)];
}
return sum;
}
public int[] toArray() {
return backingList.toArray();
}
public TIntArrayList values() {
return backingList;
}
}
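
A quick usage sketch of the class above (not part of the commit): andCardinality first compares the two 64-bit hash fingerprints, so intersections of disjoint sets are usually rejected without touching the backing lists.

package nu.marginalia;

import nu.marginalia.util.AndCardIntSet;

public class AndCardIntSetDemo {
    public static void main(String[] args) {
        var a = AndCardIntSet.of(1, 5, 9, 100);
        var b = AndCardIntSet.of(5, 100, 2000);

        // Shared elements 5 and 100 -> prints 2
        System.out.println(AndCardIntSet.andCardinality(a, b));

        // Usually short-circuited by the hash test alone -> prints 0
        System.out.println(AndCardIntSet.andCardinality(a, AndCardIntSet.of(7)));
    }
}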

View File

@@ -0,0 +1,52 @@
package nu.marginalia.util;
public class BrailleBlockPunchCards {
public static String printBits(int val, int bits) {
StringBuilder builder = new StringBuilder();
for (int b = 0; b < bits; b+=8, val>>>=8) {
builder.append((char)('\u2800'+bin2brail(val)));
}
return builder.toString();
}
/* The braille block in unicode U2800 is neat because it contains
* 8 "bits", but for historical reasons, they're addressed in a bit
* of an awkward way. Braille used to be a 2x3 grid of six dots, but it
* was extended to 2x4 with eight dots.
*
* It's addressed as follows
*
* 0 3
* 1 4
* 2 5
* 6 7 <-- extended braille
*
*
* We want to use it as a dot matrix to represent bits. To do that we need
* to do this transformation:
*
* 0 1 2 3 4 5 6 7 native order bits
* | | | \ _\__\/ |
* | | | / \ \ \ |
* 0 1 2 6 3 4 5 7 braille order bits
*
* 01 02 04 08 10 20 40 80
* 01+02+04 +80 : &0x87
* << 10+20+40 : &0x70, <<1
* 08 >> >> >> : &0x08, >>3
*
* Or in other words we do
* (v & 0x87)
* | ((v & 0x70) >> 1)
* | ((v & 0x08) << 3)
*
* Thanks for coming to my TED talk.
*/
private static char bin2brail(int v) {
return (char)((v & 0x87) | ((v & 0x70) >> 1) | ((v & 0x08) << 3));
}
}
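
A short sketch (not in the commit) of what printBits produces; each byte of the value becomes one 8-dot glyph, least significant byte first:

package nu.marginalia.util;

public class BrailleBlockPunchCardsDemo {
    public static void main(String[] args) {
        // bits 0 and 2 set -> braille dots 1 and 3 -> "⠅" (U+2805)
        System.out.println(BrailleBlockPunchCards.printBits(5, 8));

        // low byte 0x00 -> blank cell, high byte 0xFF -> full cell: "⠀⣿"
        System.out.println(BrailleBlockPunchCards.printBits(0xFF00, 16));
    }
}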

View File

@@ -1,5 +1,7 @@
 package nu.marginalia.util;

+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -14,13 +16,13 @@ public class ListChunker {
      *
      * @see List#subList
      */
-    public static <T> List<List<T>> chopList(List<T> data, int size) {
+    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
         if (data.isEmpty())
             return Collections.emptyList();
         else if (data.size() < size)
             return List.of(data);

-        final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
+        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
         for (int i = 0; i < data.size(); i+=size) {
             ret.add(data.subList(i, Math.min(data.size(), i+size)));

View File

@@ -0,0 +1,33 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;
/*
* End-of-page mark that's used as a sentinel to verify that
* the BTreeWriter's caller actually writes as much as they say
* they want to. (Failing to do so will corrupt the tree)
*
*/
public class BTreeDogEar {
private MultimapFileLongSlice sentinelSlice;
public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
if (header.numEntries() > 3) {
sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
sentinelSlice.put(0, 4L);
sentinelSlice.put(1, 5L);
sentinelSlice.put(2, 1L);
}
}
public boolean verify() {
if (sentinelSlice == null)
return true;

// true iff the caller's writes covered the sentinel values (4, 5, 1) planted at the end
return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2);
}
}

View File

@@ -0,0 +1,146 @@
package nu.marginalia.util.btree;
import java.util.Arrays;
public class BTreeQueryBuffer {
public final long[] data;
public int end;
private int read = 0;
private int write = 0;
public BTreeQueryBuffer(int size) {
this.data = new long[size];
this.end = size;
}
public BTreeQueryBuffer(long [] data, int size) {
this.data = data;
this.end = size;
}
private BTreeQueryBuffer(long [] data) {
this.data = data;
this.end = data.length;
}
public BTreeQueryBuffer[] split(int... splitPoints) {
BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];
ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
for (int i = 1; i < splitPoints.length; i++) {
ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
}
ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));
return ret;
}
public void gather(BTreeQueryBuffer... buffers) {
int start = 0;
for (var buffer : buffers) {
System.arraycopy(buffer.data, 0, data, start, buffer.end);
start += buffer.end;
}
this.read = 0;
this.write = 0;
this.end = start;
}
public long[] copyData() {
return Arrays.copyOf(data, end);
}
public void retainAll() {
read = write = end;
}
public boolean isEmpty() {
return end == 0;
}
public int size() {
return end;
}
public long currentValue() {
return data[read];
}
public boolean rejectAndAdvance() {
return ++read < end;
}
public boolean retainAndAdvance() {
if (read != write) {
long tmp = data[write];
data[write] = data[read];
data[read] = tmp;
}
write++;
return ++read < end;
}
public boolean hasMore() {
return read < end;
}
public void finalizeFiltering() {
end = write;
read = 0;
write = 0;
}
public void startFilterForRange(int pos, int end) {
read = write = pos;
this.end = end;
}
public void reset() {
end = data.length;
read = 0;
write = 0;
}
public void zero() {
end = 0;
read = 0;
write = 0;
Arrays.fill(data, 0);
}
public void uniq() {
if (end <= 1) return;
long prev = currentValue();
retainAndAdvance();
while (hasMore()) {
long val = currentValue();
if (prev == val) {
rejectAndAdvance();
} else {
retainAndAdvance();
prev = val;
}
}
finalizeFiltering();
}
public String toString() {
return getClass().getSimpleName() + "[" +
"read = " + read +
",write = " + write +
",end = " + end +
",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
}
}
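
To make the read/write cursor protocol concrete, a small usage sketch (not part of the commit): values are kept with retainAndAdvance(), dropped with rejectAndAdvance(), and finalizeFiltering() shrinks the buffer to the retained prefix.

package nu.marginalia.util.btree;

import java.util.Arrays;

public class BTreeQueryBufferDemo {
    public static void main(String[] args) {
        var buffer = new BTreeQueryBuffer(new long[] {1, 2, 3, 4, 5, 6}, 6);

        while (buffer.hasMore()) {
            if (buffer.currentValue() % 2 == 0) buffer.retainAndAdvance();
            else buffer.rejectAndAdvance();
        }
        buffer.finalizeFiltering();

        // Prints [2, 4, 6]
        System.out.println(Arrays.toString(buffer.copyData()));
    }
}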

View File

@@ -1,5 +1,7 @@
 package nu.marginalia.util.btree;

+import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
+import lombok.SneakyThrows;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
@@ -14,70 +16,275 @@ public class BTreeReader {
     private final MultimapSearcher indexSearcher;
     private final MultimapSearcher dataSearcher;
+    private final BTreeHeader header;

-    public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
+    public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
         this.file = file;
         this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
         this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
         this.ctx = ctx;
+        this.header = header;
     }

-    public BTreeHeader getHeader(long fileOffset) {
+    public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
+        this.file = file;
+        this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
+        this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
+        this.ctx = ctx;
+        this.header = createHeader(file, offset);
+    }
+
+    public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
         return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
     }

+    public BTreeHeader getHeader() {
+        return header;
+    }
+
+    public int numEntries() {
+        return header.numEntries();
+    }
+
+    @SneakyThrows
+    public void retainEntries(BTreeQueryBuffer buffer) {
+        if (header.layers() == 0) {
+            BTreePointer pointer = new BTreePointer(header);
+            while (buffer.hasMore()) {
+                pointer.retainData(buffer);
+            }
+        }
+        retainSingle(buffer);
+    }
+
+    @SneakyThrows
+    public void rejectEntries(BTreeQueryBuffer buffer) {
+        if (header.layers() == 0) {
+            BTreePointer pointer = new BTreePointer(header);
+            while (buffer.hasMore()) {
+                pointer.rejectData(buffer);
+            }
+        }
+        rejectSingle(buffer);
+    }
+
+    private void retainSingle(BTreeQueryBuffer buffer) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        for (; buffer.hasMore(); pointer.resetToRoot()) {
+            long val = buffer.currentValue() & ctx.equalityMask();
+
+            if (!pointer.walkToData(val)) {
+                buffer.rejectAndAdvance();
+                continue;
+            }
+
+            pointer.retainData(buffer);
+        }
+    }
+
+    private void rejectSingle(BTreeQueryBuffer buffer) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        for (; buffer.hasMore(); pointer.resetToRoot()) {
+            long val = buffer.currentValue() & ctx.equalityMask();
+
+            if (pointer.walkToData(val) && pointer.containsData(val)) {
+                buffer.rejectAndAdvance();
+            }
+            else {
+                buffer.retainAndAdvance();
+            }
+        }
+    }
+
     /**
      *
      * @return file offset of entry matching keyRaw, negative if absent
      */
-    public long findEntry(BTreeHeader header, final long keyRaw) {
-        final int blockSize = ctx.BLOCK_SIZE_WORDS();
-
+    public long findEntry(final long keyRaw) {
         final long key = keyRaw & ctx.equalityMask();
-        final long dataAddress = header.dataOffsetLongs();
-
-        final long searchStart;
-        final long numEntries;
-
-        if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
-            searchStart = dataAddress;
-            numEntries = header.numEntries();
-        }
-        else {
-            long dataLayerOffset = searchIndex(header, key);
-            if (dataLayerOffset < 0) {
-                return dataLayerOffset;
-            }
-
-            searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
-            numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
-        }
-
-        return dataSearcher.binarySearch(key, searchStart, numEntries);
-    }
-
-    private long searchIndex(BTreeHeader header, long key) {
-        final int blockSize = ctx.BLOCK_SIZE_WORDS();
-        final long indexAddress = header.indexOffsetLongs();
-
-        long layerOffset = 0;
-
-        for (int i = header.layers() - 1; i >= 0; --i) {
-            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
-
-            final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
-            if (nextLayerOffset < 0)
-                return nextLayerOffset;
-
-            layerOffset = blockSize * (nextLayerOffset + layerOffset);
-        }
-
-        return layerOffset;
-    }
-
-    private long relativePositionInIndex(long key, long start, long n) {
-        return indexSearcher.binarySearchUpper(key, start, n) - start;
-    }
+
+        BTreePointer ip = new BTreePointer(header);
+
+        while (!ip.isDataLayer())
+            ip.walkToChild(key);
+
+        return ip.findData(key);
+    }
+
+    public void readData(long[] data, int n, long pos) {
+        file.read(data, n, header.dataOffsetLongs() + pos);
+    }
+
+    public long[] queryData(long[] urls, int offset) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        long[] ret = new long[urls.length];
+
+        for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
+            if (pointer.walkToData(urls[i])) {
+                long dataAddress = pointer.findData(urls[i]);
+                if (dataAddress >= 0) {
+                    ret[i] = file.get(dataAddress + offset);
+                }
+            }
+        }
+
+        return ret;
+    }
+
+    /** Find the range of values so that prefixStart <= n < prefixNext */
+    public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
+        long lowerBoundStart = lowerBound(prefixStart);
+        long lowerBoundEnd = lowerBound(prefixNext);
+
+        return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
+    }
+
+    private long lowerBound(long key) {
+        key &= ctx.equalityMask();
+
+        BTreePointer ip = new BTreePointer(header);
+
+        while (!ip.isDataLayer())
+            ip.walkToChild(key);
+
+        return ip.findDataLower(key);
+    }
+
+    private class BTreePointer {
+        private final long[] layerOffsets;
+
+        private int layer;
+        private long offset;
+        private long boundary;
+
+        public String toString() {
+            return getClass().getSimpleName() + "[" +
+                "layer = " + layer + " ," +
+                "offset = " + offset + "]";
+        }
+
+        public BTreePointer(BTreeHeader header) {
+            layer = header.layers() - 1;
+            offset = 0;
+            layerOffsets = header.getRelativeLayerOffsets(ctx);
+            boundary = Long.MAX_VALUE;
+        }
+
+        public void resetToRoot() {
+            this.layer = header.layers() - 1;
+            this.offset = 0;
+            this.boundary = Long.MAX_VALUE;
+        }
+
+        public int layer() {
+            return layer;
+        }
+
+        public boolean walkToChild(long key) {
+            final long indexAddress = header.indexOffsetLongs();
+
+            final long indexLayerBlockOffset = layerOffsets[layer] + offset;
+
+            final long searchStart = indexAddress + indexLayerBlockOffset;
+            final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
+
+            if (nextLayerOffset < 0)
+                return false;
+
+            layer--;
+            boundary = file.get(searchStart + offset);
+            offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
+
+            return true;
+        }
+
+        public boolean walkToData(long key) {
+            while (!isDataLayer()) {
+                if (!walkToChild(key)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        public boolean isDataLayer() {
+            return layer < 0;
+        }
+
+        public boolean containsData(long key) {
+            return findData(key) >= 0;
+        }
+
+        public long findData(long key) {
+            if (layer > 0) {
+                throw new IllegalStateException("Looking for data in an index layer");
+            }
+
+            long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
+            int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
+
             return dataSearcher.binarySearch(key, searchStart, numEntries);
         }
+
+        public long findDataLower(long key) {
+            if (layer > 0) {
+                throw new IllegalStateException("Looking for data in an index layer");
+            }
+
+            long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
+            int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
+
+            return dataSearcher.binarySearchLower(key, searchStart, numEntries);
+        }
+
+        public void retainData(BTreeQueryBuffer buffer) {
+            long dataOffset = findData(buffer.currentValue());
+            if (dataOffset >= 0) {
+                buffer.retainAndAdvance();
+
+                long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
+                long relOffset = dataOffset - blockBase;
+
+                int numEntries =
+                        min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
+
+                if (buffer.currentValue() <= boundary) {
+                    file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
+                }
+            }
+            else {
+                buffer.rejectAndAdvance();
+            }
+        }
+
+        public void rejectData(BTreeQueryBuffer buffer) {
+            long dataOffset = findData(buffer.currentValue());
+            if (dataOffset >= 0) {
+                buffer.rejectAndAdvance();
+
+                long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
+                long relOffset = dataOffset - blockBase;
+
+                int numEntries =
+                        min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
+
+                if (buffer.currentValue() <= boundary) {
+                    file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
+                }
+            }
+            else {
+                buffer.retainAndAdvance();
+            }
+        }
+    }
 }
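
In rough outline, the new API binds the header at construction time instead of threading it through every call. A usage sketch, assuming a MultimapFileLong file, a BTreeContext ctx, and a key are already at hand:

// header is read from the file at the given offset by the new constructor
BTreeReader reader = new BTreeReader(file, ctx, /* header offset */ 0L);

long entryOffset = reader.findEntry(key);  // negative when the key is absent
if (entryOffset >= 0) {
    // adjacent data words for the entry can now be fetched via readData()
}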

View File

@@ -3,6 +3,8 @@ package nu.marginalia.util.btree;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;
@@ -10,6 +12,7 @@ import java.io.IOException;
 public class BTreeWriter {
     private final BTreeContext ctx;
     private final MultimapFileLongSlice map;
+    private final Logger logger = LoggerFactory.getLogger(getClass());

     public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
         this.map = map;
@@ -39,7 +42,16 @@ public class BTreeWriter {
         header.write(map, offset);

-        writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
+        var slice = map.atOffset(header.dataOffsetLongs());
+
+        BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);
+
+        writeIndexCallback.write(slice);
+
+        if (!dogEar.verify()) {
+            logger.error("Dog ear was not overwritten: {}", header);
+        }
+
         if (header.layers() < 1) { // The data is too small to benefit from indexing
             return ctx.calculateSize(numEntries);

View File

@@ -1,136 +0,0 @@
package nu.marginalia.util.btree;
import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;
import static java.lang.Math.min;
public class CachingBTreeReader {
private final MultimapFileLong file;
public final BTreeContext ctx;
private final MultimapSearcher dataSearcher;
public CachingBTreeReader(MultimapFileLong file, BTreeContext ctx) {
this.file = file;
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
this.ctx = ctx;
}
public BTreeHeader getHeader(long fileOffset) {
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
}
public BTreeCachedIndex prepareCache(BTreeHeader header) {
return new BTreeCachedIndex(header);
}
/**
*
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
BTreeHeader header = cache.header;
final int blockSize = ctx.BLOCK_SIZE_WORDS();
final long key = keyRaw & ctx.equalityMask();
final long dataAddress = header.dataOffsetLongs();
final long searchStart;
final long numEntries;
if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
searchStart = dataAddress;
numEntries = header.numEntries();
}
else {
cache.load();
long dataLayerOffset = searchIndex(header, cache, key);
if (dataLayerOffset < 0) {
return dataLayerOffset;
}
searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
}
return dataSearcher.binarySearch(key, searchStart, numEntries);
}
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
long layerOffset = 0;
for (int i = header.layers() - 1; i >= 0; --i) {
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
final long nextLayerOffset = cache.relativePositionInIndex(key, (int) indexLayerBlockOffset, blockSize);
if (nextLayerOffset < 0)
return nextLayerOffset;
layerOffset = blockSize * (nextLayerOffset + layerOffset);
}
return layerOffset;
}
/** A cache for the BTree index data that will drastically reduce the number of disk reads
* for repeated queries against the same tree. The memory consumption is typically very low
* and the disk access pattern for reading the entire index relatively cheap.
*/
public class BTreeCachedIndex {
long[] indexData;
final BTreeHeader header;
final int indexedDataSize;
public BTreeCachedIndex(BTreeHeader header) {
this.header = header;
indexedDataSize = header.numEntries();
}
public void load() {
if (indexData != null)
return;
int size = (int)(header.dataOffsetLongs() - header.indexOffsetLongs());
indexData = new long[size];
file.read(indexData, header.indexOffsetLongs());
}
long relativePositionInIndex(long key, int fromIndex, int n) {
int low = 0;
int high = n - 1;
while (low <= high) {
int mid = (low + high) >>> 1;
long midVal = indexData[fromIndex + mid];
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return mid;
}
return low;
}
public long sizeBytes() {
return isLoaded() ? 8L*indexData.length : 0;
}
public int getIndexedDataSize() {
return indexedDataSize;
}
public boolean isLoaded() {
return indexData != null;
}
}
}

View File

@@ -19,7 +19,7 @@ public record BTreeContext(int MAX_LAYERS,
     }

     public int numIndexLayers(int numEntries) {
-        if (numEntries <= BLOCK_SIZE_WORDS*2) {
+        if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) {
             return 0;
         }
         for (int i = 1; i < MAX_LAYERS; i++) {

View File

@@ -26,7 +26,6 @@ public class DictionaryData {
         if (rb == -1) {
             int end = activeBank.getEnd();
-            logger.debug("Switching bank @ {}", end);

             var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
             rb = newBank.add(key);

View File

@@ -16,7 +16,7 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
  * Spiritually influenced by GNU Trove's hash maps
  * LGPL 2.1
  */
-public class DictionaryHashMap {
+public class DictionaryHashMap implements DictionaryMap {
     private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);

     private static final Gauge probe_count_metrics
             = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
@@ -81,6 +81,7 @@ public class DictionaryHashMap {
         }
     }

+    @Override
     public int size() {
         return sz.get();
     }
@@ -97,6 +98,7 @@ public class DictionaryHashMap {
         buffers[buffer].put(bufferIdx, val);
     }

+    @Override
     public int put(long key) {
         long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
@@ -143,6 +145,7 @@ public class DictionaryHashMap {
         return di;
     }

+    @Override
     public int get(long key) {
         final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
         final long cell = hash % hashTableSize;

View File

@@ -0,0 +1,9 @@
package nu.marginalia.util.dict;
public interface DictionaryMap {
int size();
int put(long key);
int get(long key);
}

View File

@@ -72,7 +72,7 @@ public enum UnicodeRanges {
         int count = 0;
         int max = sensitive ? 15 : 100;

-        for (int i = 0; i < text.length(); i++) {
+        for (int i = 0; i < Math.min(2000, text.length()); i++) {
             char c = text.charAt(i);
             if (c >= min && c <= max) {
                 if (count++ > max) {

View File

@@ -88,6 +88,9 @@ public class WordPatterns {
     }

     public static boolean hasWordQualities(String s) {
+        if (s.isBlank())
+            return false;
+
         int start = 0;
         int end = s.length();
         if (s.charAt(0) == '#') start++;
@@ -95,13 +98,14 @@ public class WordPatterns {
         for (int i = start; i < end; i++) {
             char c = s.charAt(i);
-            if (!("_@.'+-".indexOf(c) >= 0)
+            if (("_@.'+-".indexOf(c) < 0)
                 && !(c >= 'a' && c <= 'z')
                 && !(c >= 'A' && c <= 'Z')
                 && !(c >= '0' && c <= '9')
                 && !(c >= '\u00C0' && c <= '\u00D6')
                 && !(c >= '\u00D8' && c <= '\u00f6')
-                && !(c >= '\u00f8' && c <= '\u00ff')) {
+                && !(c >= '\u00f8' && c <= '\u00ff'))
+            {
                 return false;
             }
         }
@@ -119,10 +123,14 @@ public class WordPatterns {
         if (!filter(s)) {
             return true;
         }
-        if (topWords.contains(s.toLowerCase())) {
+        if (isTopWord(s)) {
             return true;
         }

         return false;
     }

+    public static boolean isTopWord(String s) {
+        return topWords.contains(s.toLowerCase());
+    }
+
 }

View File

@@ -2,8 +2,10 @@ package nu.marginalia.util.language.processing;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
@@ -20,14 +22,9 @@ public class DocumentKeywordExtractor {
     private final NameCounter nameCounter;
     private final SubjectCounter subjectCounter;

-    private final TermFrequencyDict dict;
-    private final double docCount;
-
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
-        this.dict = dict;
-        docCount = dict.docCount();
-
         keywordExtractor = new KeywordExtractor();

         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
@@ -36,69 +33,105 @@ public class DocumentKeywordExtractor {
     }

-    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
+    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {

         List<WordRep> titleWords = extractTitleWords(documentLanguageData);

-        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);

-        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
-        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
-
-        Collection<String> artifacts = getArtifacts(documentLanguageData);
+        tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
+
+        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
+        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
+        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
+
+        List<String> artifacts = getArtifacts(documentLanguageData);
+
+        keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple);

         return new EdgePageWordSet(
-                createWords(IndexBlock.Subjects, subjects),
-                createWords(IndexBlock.Title, titleWords),
-                createWords(IndexBlock.NamesWords, wordsNamesAll),
-                createWords(IndexBlock.Tfidf_Top, topKeywords),
-                createWords(IndexBlock.Tfidf_Middle, midKeywords),
-                new EdgePageWords(IndexBlock.Artifacts, artifacts)
+                createWords(keywordMetadata, IndexBlock.Title, titleWords),
+                EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
         );
     }

-    public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
+    public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {

         List<WordRep> titleWords = extractTitleWords(documentLanguageData);

-        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
-        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        getWordPositions(keywordMetadata, documentLanguageData);
+
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);

-        List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
-        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
-        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
-
-        Collection<String> artifacts = getArtifacts(documentLanguageData);
+        List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
+
+        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
+        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
+        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
+
+        List<String> artifacts = getArtifacts(documentLanguageData);

         var wordSet = new EdgePageWordSet(
-                createWords(IndexBlock.Subjects, subjects),
-                createWords(IndexBlock.Title, titleWords),
-                createWords(IndexBlock.NamesWords, wordsNamesAll),
-                createWords(IndexBlock.Tfidf_Top, topKeywords),
-                createWords(IndexBlock.Tfidf_Middle, midKeywords),
-                createWords(IndexBlock.Tfidf_Lower, lowKeywords),
-                new EdgePageWords(IndexBlock.Artifacts, artifacts)
+                createWords(keywordMetadata, IndexBlock.Title, titleWords),
+                createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf),
+                createWords(keywordMetadata, IndexBlock.Subjects, subjects),
+                EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
         );

-        getSimpleWords(wordSet, documentLanguageData,
+        getSimpleWords(keywordMetadata, wordSet, documentLanguageData,
                 IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);

         return wordSet;
     }

-    private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
+    public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
+        Map<String, Integer> ret = keywordMetadata.positionMask();
+
+        int posCtr = 0;
+        for (var sent : dld.titleSentences) {
+            int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
+
+            for (var word : sent) {
+                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
+            }
+
+            for (var span : keywordExtractor.getNames(sent)) {
+                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
+            }
+        }
+        posCtr+=4;
+        for (var sent : dld.sentences) {
+            int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
+
+            for (var word : sent) {
+                ret.merge(word.stemmed(), posBit, this::bitwiseOr);
+            }
+
+            for (var span : keywordExtractor.getNames(sent)) {
+                ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
+            }
+
+            posCtr++;
+        }
+    }
+
+    private int bitwiseOr(int a, int b) {
+        return a | b;
+    }
+
+    private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
+
+        EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);

         int start = 0;
         int lengthGoal = 32;

-        for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
+        for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) {
             IndexBlock block = blocks[blockIdx];
-            Set<String> words = new HashSet<>(lengthGoal+100);
+            Set<EdgePageWords.Entry> words = new HashSet<>(lengthGoal+100);

             int pos;
             int length = 0;
@@ -110,55 +143,26 @@ public class DocumentKeywordExtractor {
                     if (!word.isStopWord()) {
                         String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
                         if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
-                            words.add(w);
+                            words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed())));
                         }
                     }
                 }
+
+                for (var names : keywordExtractor.getNames(sent)) {
+                    var rep = new WordRep(sent, names);
+                    String w = AsciiFlattener.flattenUnicode(rep.word);
+
+                    words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed)));
+                }
             }
             wordSet.append(block, words);
             start = pos;
             lengthGoal+=32;
         }
-
-        if (start < documentLanguageData.sentences.length) {
-
-            Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
-
-            for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
-                var sent = documentLanguageData.sentences[pos];
-                for (var word : sent) {
-                    if (!word.isStopWord()) {
-                        String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
-                        if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
-                            counts.merge(w, 1, Integer::sum);
-                        }
-                    }
-                }
-            }
-
-            Set<String> lastSet;
-            if (counts.size() < 1024) {
-                lastSet = counts.keySet();
-            }
-            else {
-                lastSet = counts.entrySet().stream()
-                        .sorted(Comparator.comparing(e -> {
-                            double N = docCount; // Number of documents in term freq dictionary
-
-                            // Caveat: This is actually the *negated* term score, because the second logarithm has
-                            // its parameter inverted (log(a^b) = b log(a); here b = -1)
-                            return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
-                        }))
-                        .map(Map.Entry::getKey)
-                        .limit(1024)
-                        .collect(Collectors.toCollection(LinkedHashSet::new));
-            }
-
-            wordSet.append(blocks[blocks.length - 1], lastSet);
-        }
     }

     private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");

-    private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
+    private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
         Set<String> reps = new HashSet<>();

         for (var sent : documentLanguageData.sentences) {
@@ -183,7 +187,7 @@ public class DocumentKeywordExtractor {
             }
         }

-        return reps;
+        return new ArrayList<>(reps);
     }

     private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
@@ -193,7 +197,21 @@ public class DocumentKeywordExtractor {
                 .collect(Collectors.toList());
     }

-    public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
-        return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
+    public EdgePageWords createWords(KeywordMetadata metadata,
+                                     IndexBlock block,
+                                     Collection<WordRep> words) {
+
+        Set<EdgePageWords.Entry> entries = new HashSet<>(words.size());
+
+        for (var word : words) {
+
+            String flatWord = AsciiFlattener.flattenUnicode(word.word);
+            if (!WordPatterns.hasWordQualities(flatWord)) {
+                continue;
+            }
+
+            entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed)));
+        }
+
+        return new EdgePageWords(block, entries);
     }
 }

View File

@@ -1,15 +1,19 @@
 package nu.marginalia.util.language.processing;

+import com.github.jknack.handlebars.internal.lang3.StringUtils;
+import gnu.trove.map.hash.TObjectIntHashMap;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;

+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Pattern;
+import java.util.List;
+
+import static java.lang.Math.max;

 public class KeywordCounter {
     private final KeywordExtractor keywordExtractor;
@@ -19,72 +23,78 @@ public class KeywordCounter {
     public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
         this.keywordExtractor = keywordExtractor;
-        this.docCount = (double) dict.docCount();
+        this.docCount = dict.docCount();
     }

-    public WordHistogram countHisto(DocumentLanguageData dld) {
-        HashMap<String, Integer> counts = new HashMap<>(15000);
+    public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
+        TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f);
         HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);

         for (var sent : dld.sentences) {
             var keywords = keywordExtractor.getKeywordsFromSentence(sent);
             for (var span : keywords) {
-                if (span.size() == 1 &&
-                        WordPatterns.isStopWord(sent.words[span.start]))
+                if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) {
                     continue;
+                }

-                String stemmed = sent.constructStemmedWordFromSpan(span);
-                counts.merge(stemmed, 1, Integer::sum);
-                instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
+                var rep = new WordRep(sent, span);
+
+                counts.adjustOrPutValue(rep.stemmed, 1, 1);
+
+                var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
+                if (instanceSet.size() < 250) {
+                    instanceSet.add(rep);
+                }
             }
         }

-        double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
-
-        Set<WordRep> h5 = new HashSet<>(2500);
-        Set<WordRep> h10 = new HashSet<>(500);
-        Set<WordRep> h15 = new HashSet<>(500);
-
-        int doubleWordCount = 0;
-
-        for (var entry : counts.entrySet()) {
-            double value = getTermValue(entry, maxC);
-
-            double avgCnt = entry.getValue();
-            String wordStemmed = entry.getKey();
-
-            Set<WordRep> histogram;
-            if (value < -3 && avgCnt>1) histogram = h15;
-            else if (value < -1.75 && avgCnt>1) histogram = h10;
-            else if (value < -1 &&
-                    (!wordStemmed.contains("_") || doubleWordCount++ < 50))
-                histogram = h5;
-            else continue;
-
-            histogram.addAll(instances.get(wordStemmed));
-        }
-
-        return new WordHistogram(h5, h10, h15);
-    }
-
-    private static final Pattern separator = Pattern.compile("_");
-
-    public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
-        String key = e.getKey();
-        if (key.contains("_")) {
-            String[] parts = separator.split(e.getKey());
+        HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
+        List<WordRep> tfIdfHigh = new ArrayList<>();
+
+        int maxVal = maxValue(counts);
+
+        counts.forEachEntry((key, cnt) -> {
+            int value = getTermValue(key, cnt, maxVal);
+
+            tfIdf.put(key, new WordFrequencyData(cnt, value));
+
+            if (cnt > 1 && value > 100) {
+                tfIdfHigh.addAll(instances.get(key));
+            }
+
+            return true;
+        });
+
+        return tfIdfHigh;
+    }
+
+    private int maxValue(TObjectIntHashMap<?> map) {
+        int maxC = 0;
+
+        for (int c : map.values()) {
+            maxC = max(c, maxC);
+        }
+
+        return maxC;
+    }
+
+    public int getTermValue(String key, int count, double maxValue) {
+        if (key.indexOf('_') >= 0) {
+            String[] parts = StringUtils.split(key, '_');
             double totalValue = 0.;
             for (String part : parts) {
-                totalValue += value(part, e.getValue(), maxValue);
+                totalValue += value(part, count, maxValue);
             }
-            return totalValue / parts.length;
+            return normalizeValue(totalValue / parts.length);
         }
         else {
-            return value(key, e.getValue(), maxValue);
+            return normalizeValue(value(key, count, maxValue));
         }
     }

+    int normalizeValue(double v) {
+        return (int)(-v*75);
+    }
+
     double value(String key, double value, double maxValue) {
         double freq = dict.getTermFreqStemmed(key);
         if (freq < 1) {
@@ -93,5 +103,5 @@ public class KeywordCounter {
         return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
     }

-    public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
+    public record WordFrequencyData(int count, int tfIdfNormalized) { }
 }
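
For a feel of the scoring above (a worked sketch, not part of the commit): value() is negative for terms that are rare in the dictionary, and normalizeValue() flips the sign and scales it into the int range that countHisto compares against its > 100 threshold.

double docCount = 1_000_000;
double freq = 100;                 // dictionary frequency of the term
double count = 5, maxValue = 20;   // in-document count vs. the document's max count

double v = (0.1 + 0.9 * count / maxValue) * Math.log(freq / docCount);
int normalized = (int) (-v * 75);  // ≈ 224 here; rarer terms score higher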

View File

@@ -1,64 +0,0 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class LongNameCounter {
private final KeywordExtractor keywordExtractor;
private final TermFrequencyDict dict;
private final double docCount;
public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
this.dict = dict;
docCount = (double) dict.docCount();
this.keywordExtractor = keywordExtractor;
}
public List<WordRep> count(DocumentLanguageData dld) {
HashMap<String, Double> counts = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNamesStrict(sent);
for (var span : keywords) {
var stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1., Double::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}
return counts.entrySet().stream().filter(e -> termSize(e.getKey()) > 1)
.sorted(Comparator.comparing(this::getTermValue))
.limit(Math.min(50, counts.size()/3))
.map(Map.Entry::getKey)
.flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
}
int termSize(String word) {
return 1 + (int) word.chars().filter(c -> c == '_').count();
}
final Pattern separator = Pattern.compile("_");
public double getTermValue(Map.Entry<String, Double> e) {
String[] parts = separator.split(e.getKey());
double totalValue = 0.;
for (String part : parts) {
totalValue += value(part, e.getValue());
}
return totalValue / Math.sqrt(parts.length);
}
double value(String key, double value) {
return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
}
}

View File

@@ -37,7 +37,8 @@ public class NameCounter {
                 .sorted(Comparator.comparing(e -> -e.getValue()))
                 .limit(150)
                 .map(Map.Entry::getKey)
-                .flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
+                .flatMap(w -> instances.get(w).stream())
+                .collect(Collectors.toList());
     }
 }

View File

@@ -1,6 +1,7 @@
 package nu.marginalia.util.language.processing;

 import com.github.datquocnguyen.RDRPOSTagger;
+import com.github.jknack.handlebars.internal.lang3.StringUtils;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.AllArgsConstructor;
@@ -125,11 +126,45 @@ public class SentenceExtractor {
         return counts;
     }

-    private static final Pattern dotPattern = Pattern.compile("\\.+$");
     private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
-    private static final Pattern spacesPattern = Pattern.compile("\\s+");
-    private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
+    // private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
+
+    private boolean isBadChar(char c) {
+        if (c >= 'a' && c <= 'z') return false;
+        if (c >= 'A' && c <= 'Z') return false;
+        if (c >= '0' && c <= '9') return false;
+        if ("_#@.".indexOf(c) >= 0) return false;
+        if (c >= '\u00C0' && c <= '\u00D6') return false;
+        if (c >= '\u00D8' && c <= '\u00F6') return false;
+        if (c >= '\u00F8' && c <= '\u00FF') return false;
+
+        return true;
+    }
+
+    private String sanitizeString(String s) {
+        char[] newChars = new char[s.length()];
+        int pi = 0;
+        for (int i = 0; i < newChars.length; i++) {
+            char c = s.charAt(i);
+            if (!isBadChar(c)) {
+                newChars[pi++] = c;
+            }
+            else {
+                newChars[pi++] = ' ';
+            }
+        }
+
+        s = new String(newChars, 0, pi);
+
+        if (s.startsWith(".")) {
+            s = s.substring(1);
+            if (s.isBlank())
+                return "";
+        }
+
+        return s;
+    }

     public DocumentSentence extractSentence(String text) {
         var wordsAndSeps = splitSegment(text);
@@ -139,7 +174,7 @@ public class SentenceExtractor {
         var lc = toLc(wordsAndSeps.words);

         return new DocumentSentence(
-            badCharPattern.matcher(text).replaceAll(" "), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
+            sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
         );
     }
@@ -161,7 +196,7 @@ public class SentenceExtractor {
             sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
         }
         catch (Exception ex) {
-            sentences = textNormalizedSpaces.split("[.]");
+            sentences = StringUtils.split(textNormalizedSpaces, '.');
         }

         if (sentences.length > 250) {
@@ -196,8 +231,8 @@ public class SentenceExtractor {
                 separators[i] = Arrays.copyOf(separators[i], 250);
             }
             for (int j = 0; j < tokens[i].length; j++) {
-                if (tokens[i][j].endsWith(".")) {
-                    tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
+                while (tokens[i][j].endsWith(".")) {
+                    tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
                 }
             }
         }
@@ -216,7 +251,7 @@ public class SentenceExtractor {
         DocumentSentence[] ret = new DocumentSentence[sentences.length];
         for (int i = 0; i < ret.length; i++) {
-            ret[i] = new DocumentSentence(badCharPattern.matcher(sentences[i]).replaceAll(" "), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
+            ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
         }
         return ret;
     }
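
The regex-based cleanup is replaced by a hand-rolled character whitelist, presumably to avoid regex overhead on this hot path. A runnable sketch reproducing the whitelist and its effect (the example string is arbitrary):

class SanitizeSketch {
    // Mirrors the whitelist from the diff: ASCII alphanumerics, "_#@." and the
    // Latin-1 letter ranges survive; every other character maps to a space
    static boolean isBadChar(char c) {
        if (c >= 'a' && c <= 'z') return false;
        if (c >= 'A' && c <= 'Z') return false;
        if (c >= '0' && c <= '9') return false;
        if ("_#@.".indexOf(c) >= 0) return false;
        if (c >= '\u00C0' && c <= '\u00D6') return false;
        if (c >= '\u00D8' && c <= '\u00F6') return false;
        if (c >= '\u00F8' && c <= '\u00FF') return false;
        return true;
    }

    public static void main(String[] args) {
        String in = "héllo, wörld! (42)";
        StringBuilder out = new StringBuilder();
        for (char c : in.toCharArray()) out.append(isBadChar(c) ? ' ' : c);
        System.out.println(out); // "héllo  wörld   42 " -- punctuation blanked, accented letters kept
    }
}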

View File

@@ -5,9 +5,7 @@ import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;

-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;

 public class SubjectCounter {
@@ -27,7 +25,9 @@ public class SubjectCounter {
     public List<WordRep> count(DocumentLanguageData dld) {

-        Map<WordRep, Integer> counts = new HashMap<>();
+        Map<String, Integer> counts = new HashMap<>();
+        Map<String, Set<WordRep>> instances = new HashMap<>();

         for (var sentence : dld.sentences) {
             for (WordSpan kw : keywordExtractor.getNames(sentence)) {
                 if (kw.end + 2 >= sentence.length()) {
@@ -41,7 +41,13 @@ public class SubjectCounter {
                 String nextNextTag = sentence.posTags[kw.end+1];

                 if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
-                    counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
+                    var span = new WordSpan(kw.start, kw.end);
+                    var rep = new WordRep(sentence, span);
+
+                    String stemmed = rep.stemmed;
+
+                    counts.merge(stemmed, -1, Integer::sum);
+                    instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
                 }
             }
         }
@@ -49,8 +55,8 @@ public class SubjectCounter {
         int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);

         return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-                .filter(e -> e.getValue()<-2 && e.getValue()<best*0.75)
-                .map(Map.Entry::getKey)
+                .filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
+                .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
                 .collect(Collectors.toList());
     }

View File

@@ -0,0 +1,58 @@
package nu.marginalia.util.language.processing.model;
import nu.marginalia.util.language.processing.KeywordCounter;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
public record KeywordMetadata(HashSet<String> titleKeywords,
HashSet<String> subjectKeywords,
HashSet<String> namesKeywords,
HashMap<String, KeywordCounter.WordFrequencyData> wordsTfIdf,
HashMap<String, Integer> positionMask,
EnumSet<EdgePageWordFlags> flagsTemplate,
int quality
)
{
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
public KeywordMetadata(double quality, EnumSet<EdgePageWordFlags> flags) {
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
new HashMap<>(15_000),
new HashMap<>(10_000),
flags,
(int)(-quality));
}
public KeywordMetadata(double quality) {
this(quality, EnumSet.noneOf(EdgePageWordFlags.class));
}
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
if (subjectKeywords.contains(stemmed))
flags.add(EdgePageWordFlags.Subjects);
if (namesKeywords.contains(stemmed))
flags.add(EdgePageWordFlags.NamesWords);
if (titleKeywords.contains(stemmed))
flags.add(EdgePageWordFlags.Title);
int positions = positionMask.getOrDefault(stemmed, 0);
return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode();
}
public int quality() {
return -quality;
}
}
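
forWord() folds the collected keyword sets and tf-idf data into a single long via EdgePageWordMetadata.encode(). Below is a self-contained sketch of the same bit-packing idea; the field widths are invented for illustration and make no claim to match the real EdgePageWordMetadata layout.

class WordMetadataPackingSketch {
    // Hypothetical layout: count in bits 0-7, tf-idf in bits 8-15,
    // quality in bits 16-23, flag bits from 24 upward. Illustrative only.
    static long encode(int tfIdf, int count, int quality, boolean isSubject) {
        long v = 0;
        v |= (count & 0xFFL);
        v |= (tfIdf & 0xFFL) << 8;
        v |= (quality & 0xFFL) << 16;
        if (isSubject) v |= 1L << 24;
        return v;
    }

    public static void main(String[] args) {
        long encoded = encode(230, 7, 5, true);
        System.out.println(Long.toHexString(encoded)); // 105e607: all fields in one word
    }
}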

View File

@@ -1,21 +1,22 @@
 package nu.marginalia.util.language.processing.model;

 import lombok.AllArgsConstructor;
-import lombok.EqualsAndHashCode;
 import lombok.Getter;
 import org.jetbrains.annotations.NotNull;

 import java.util.Objects;

-@AllArgsConstructor @EqualsAndHashCode @Getter
+@AllArgsConstructor @Getter
 public class WordRep implements Comparable<WordRep> {
     public WordRep(DocumentSentence sent, WordSpan span) {
         word = sent.constructWordFromSpan(span);
         stemmed = sent.constructStemmedWordFromSpan(span);
         length = span.end - span.start;
         hashCode = Objects.hash(word);
     }
     public final int length;
     public final String word;
     public final String stemmed;
@@ -34,4 +35,12 @@ public class WordRep implements Comparable<WordRep> {
     public int hashCode() {
         return hashCode;
     }
+
+    public boolean equals(Object other) {
+        if (other == this) return true;
+        if (other instanceof WordRep wr) {
+            return Objects.equals(wr.word, word);
+        }
+        return false;
+    }
 }

View File

@@ -2,6 +2,7 @@ package nu.marginalia.util.multimap;

 import com.upserve.uppend.blobs.NativeIO;
 import lombok.SneakyThrows;
+import nu.marginalia.util.btree.BTreeQueryBuffer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -100,8 +101,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
     public MultimapSearcherBase createSearcher() {
         return new MultimapSearcherBase(this);
     }
-    public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
-        return new MultimapSorter(this, tmpFile, internalSortLimit);
+    public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) {
+        return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize);
     }

     @SneakyThrows
@@ -340,6 +341,49 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
     }
@Override
public void write(LongBuffer vals, int n, long idx) {
if (idx+n >= mappedSize) {
grow(idx+n);
}
int iN = (int)((idx + n) / bufferSize);
for (int i = 0; i < n; ) {
int i0 = (int)((idx + i) / bufferSize);
int bufferOffset = (int) ((idx+i) % bufferSize);
var buffer = buffers.get(i0);
final int l;
if (i0 < iN) l = bufferSize - bufferOffset;
else l = Math.min(n - i, bufferSize - bufferOffset);
buffer.put(bufferOffset, vals, vals.position() + i, l);
i+=l;
}
}
@Override
public void swapn(int n, long idx1, long idx2) {
for (int i = 0; i < n; i++)
swap(idx1+i, idx2+i);
}
private void swap(long idx1, long idx2) {
LongBuffer buff1 = buffers.get((int)(idx1) / bufferSize);
final int o1 = (int) (idx1) % bufferSize;
LongBuffer buff2 = buffers.get((int)(idx2) / bufferSize);
final int o2 = (int) (idx2) % bufferSize;
long tmp = buff1.get(o1);
buff1.put(o1, buff2.get(o2));
buff2.put(o2, tmp);
}
    @Override
    public void setRange(long idx, int n, long val) {
        if (n == 0) return;
@@ -410,6 +454,387 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
    }
@Override
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
if (fromIndex + n*step >= mappedSize)
grow(fromIndex + n*step);
long low = 0;
long high = n - 1;
if (fromIndex/bufferSize == (fromIndex+step*n)/bufferSize) {
int idx = (int)(fromIndex / bufferSize);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid*step;
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid*step;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid*step;
}
}
return -1L-(fromIndex + high*step);
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
if (fromIndex + n >= mappedSize)
grow(fromIndex + n);
long low = 0;
long high = n - 1;
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
int idx = (int)(fromIndex / bufferSize);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
return -1L-(fromIndex + high);
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n) {
if (fromIndex + n >= mappedSize)
grow(fromIndex + n);
long low = 0;
long high = n - 1;
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
int idx = (int)(fromIndex / bufferSize);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get(idx).get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
return -1L-(fromIndex + high);
}
@Override
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
if (fromIndex + n >= mappedSize)
grow(fromIndex + n);
long low = 0;
long high = n - 1;
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
int idx = (int)(fromIndex / bufferSize);
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get(idx).get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
else {
while (low <= high) {
long mid = (low + high) >>> 1;
long off = fromIndex + mid;
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
if (midVal < key)
low = mid + 1;
else if (midVal > key)
high = mid - 1;
else
return fromIndex + mid;
}
}
return fromIndex + low;
}
private boolean isSameBuffer(long a, long b) {
return a / bufferSize == b/bufferSize;
}
@Override
public long quickSortPartition(int wordSize, long low, long high) {
if (high >= mappedSize)
grow(high + wordSize - 1);
if (isSameBuffer(low, high + wordSize - 1)) {
// Specialization that circumvents the need for expensive calls to
// MultimapFileLong.get() in the most common scenario
return quickSortPartitionSameBuffer(wordSize, low, high);
}
else {
return quickSortPartitionDifferentBuffers(wordSize, low, high);
}
}
@Override
public void insertionSort(int wordSize, long start, int n) {
if (start + n + wordSize - 1 >= mappedSize)
grow(start + n + wordSize - 1);
if (n == 1) {
return;
}
if (isSameBuffer(start, start + (long)n*wordSize-1L)) {
final var buffer = buffers.get((int) (start / bufferSize));
int off = (int) (start % bufferSize);
for (int i = 1; i < n; i++) {
for (int j = i; j > 0; j--) {
int a = off + wordSize*(j-1);
int b = off + wordSize*j;
if (buffer.get(a) > buffer.get(b)) {
for (int w = 0; w < wordSize; w++) {
long tmp = buffer.get(a+w);
buffer.put(a+w, buffer.get(b+w));
buffer.put(b+w, tmp);
}
}
else break;
}
}
}
else for (int i = 1; i < n; i++) {
for (int j = i; j > 0; j--) {
long a = start + (long)wordSize*(j-1);
long b = start + (long)wordSize*j;
if (get(a) > get(b)) {
swap(a, b);
}
else {
break;
}
}
}
}
private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) {
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
long pivot = get(pivotPoint);
long i = low - wordSize;
long j = high + wordSize;
for (;;) {
do {
i+=wordSize;
} while (get(i) < pivot);
do {
j-=wordSize;
}
while (get(j) > pivot);
if (i >= j) return j;
else swapn(wordSize, i, j);
}
}
private long quickSortPartitionSameBuffer(int wordSize, long low, long high) {
final var buffer = buffers.get((int) (low / bufferSize));
int pivotPoint = (int) ((low + high) / (2L*wordSize)) * wordSize % bufferSize;
long pivot = buffer.get(pivotPoint);
int j = (int) (high) % bufferSize + wordSize;
int i = (int) (low) % bufferSize - wordSize;
long j0 = high + wordSize - j;
for (;;) {
do {
i+=wordSize;
} while (buffer.get(i) < pivot);
do {
j-=wordSize;
}
while (buffer.get(j) > pivot);
if (i >= j) return j0 + j;
else {
for (int w = 0; w < wordSize; w++) {
long tmp = buffer.get(i+w);
buffer.put(i+w, buffer.get(j+w));
buffer.put(j+w, tmp);
}
}
}
}
public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
final long end = searchStart + stepSize * numEntries;
if (end < mappedSize) {
grow(end);
}
long bv = buffer.currentValue() & mask;
long av = get(searchStart) & mask;
long pos = searchStart;
int bi = (int)(searchStart / bufferSize);
int bo = (int)(searchStart % bufferSize);
LongBuffer data = buffers.get(bi);
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
else if (bv == av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
pos += stepSize;
if (pos < end) {
bo += stepSize;
if (bo >= bufferSize) {
data = buffers.get(++bi);
bo = 0;
}
av = data.get(bo) & mask;
}
else {
break;
}
}
}
public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
final long end = searchStart + stepSize * numEntries;
if (end < mappedSize) {
grow(end);
}
long bv = buffer.currentValue() & mask;
long av = get(searchStart) & mask;
long pos = searchStart;
int bi = (int)(searchStart / bufferSize);
int bo = (int)(searchStart % bufferSize);
LongBuffer data = buffers.get(bi);
while (bv <= boundary && buffer.hasMore()) {
if (bv < av) {
if (!buffer.retainAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
else if (bv == av) {
if (!buffer.rejectAndAdvance()) break;
bv = buffer.currentValue() & mask;
continue;
}
pos += stepSize;
if (pos < end) {
bo += stepSize;
if (bo >= bufferSize) {
data = buffers.get(++bi);
bo = 0;
}
av = data.get(bo) & mask;
}
else {
break;
}
}
}
    @Override
    public void close() throws IOException {
@@ -424,6 +849,4 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
        System.runFinalization();
        System.gc();
    }
}
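
To make the new retain/reject primitives concrete: each walks a sorted memory-mapped range and a sorted query buffer in lockstep, keeping (retain) or dropping (reject) the query values that have a match in the range. A minimal in-memory analogue, with plain arrays standing in for the mapped buffers (the names here are illustrative, not the project's API):

import java.util.ArrayList;
import java.util.List;

class RetainSketch {
    // Keeps the values of `query` that also occur in sorted `data`,
    // advancing two cursors in lockstep like MultimapFileLong.retain()
    static List<Long> retain(long[] query, long[] data) {
        List<Long> kept = new ArrayList<>();
        int qi = 0, di = 0;
        while (qi < query.length && di < data.length) {
            if (query[qi] < data[di]) qi++;                        // no match possible: skip (reject)
            else if (query[qi] == data[di]) kept.add(query[qi++]); // match: retain
            else di++;                                             // advance the data cursor
        }
        return kept;
    }

    public static void main(String[] args) {
        System.out.println(retain(new long[]{1, 3, 5, 8}, new long[]{2, 3, 4, 5, 9}));
        // -> [3, 5]
    }
}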

View File

@@ -61,6 +61,17 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
        map.write(vals, idx+off);
    }
@Override
public void write(LongBuffer vals, int n, long idx) {
map.write(vals, n,idx+off);
}
@Override
public void swapn(int n, long idx1, long idx2) {
map.swapn(n, idx1+off, idx2+off);
}
    @Override
    public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
            throws IOException {
@@ -75,4 +86,35 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
        return new MultimapFileLongOffsetSlice(map, this.off + off);
    }
@Override
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
throw new UnsupportedOperationException();
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
throw new UnsupportedOperationException();
}
@Override
public long binarySearchInternal(long key, long fromIndex, long n) {
throw new UnsupportedOperationException();
}
@Override
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
throw new UnsupportedOperationException();
}
@Override
public long quickSortPartition(int wordSize, long low, long highInclusive) {
return map.quickSortPartition(wordSize, low+off, highInclusive+off);
}
@Override
public void insertionSort(int wordSize, long start, int n) {
map.insertionSort(wordSize, start+off, n);
}
}

View File

@@ -25,9 +25,23 @@ public interface MultimapFileLongSlice {
     void write(LongBuffer vals, long idx);

+    void write(LongBuffer vals, int n, long idx);
+
+    void swapn(int n, long idx1, long idx2);
+
     void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;

     default MultimapFileLongSlice atOffset(long off) {
         return new MultimapFileLongOffsetSlice(this, off);
     }
+
+    long binarySearchInternal(long key, long fromIndex, int step, long n, long mask);
+
+    long binarySearchInternal(long key, long fromIndex, long n, long mask);
+
+    long binarySearchInternal(long key, long fromIndex, long n);
+
+    long binarySearchUpperInternal(long key, long fromIndex, long n);
+
+    long quickSortPartition(int wordSize, long low, long highInclusive);
+
+    void insertionSort(int wordSize, long start, int n);
 }

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.util.multimap;

 public interface MultimapSearcher {
-    long binarySearchUpper(long key, long fromIndex, long n);
+    long binarySearchLower(long key, long fromIndex, long n);
     long binarySearch(long key, long fromIndex, long n);

     static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
@@ -25,8 +25,8 @@ class SimpleMultimapSearcher implements MultimapSearcher {
     }

     @Override
-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        return base.binarySearchUpper(key, fromIndex, n);
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return base.binarySearchLower(key, fromIndex, n);
     }

     @Override
@@ -46,8 +46,8 @@ class MaskedMultimapSearcher implements MultimapSearcher {
     }

     @Override
-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        return base.binarySearchUpper(key, fromIndex, n, mask);
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return base.binarySearchLower(key, fromIndex, n, mask);
     }

     @Override
@@ -69,8 +69,8 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher {
     }

     @Override
-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        return base.binarySearchUpper(key, fromIndex, step, n, mask);
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return base.binarySearchLower(key, fromIndex, step, n, mask);
     }

     @Override

View File

@@ -29,26 +29,12 @@ public class MultimapSearcherBase {
         return false;
     }

-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        long low = 0;
-        long high = n - 1;
-
-        while (low <= high) {
-            long mid = (low + high) >>> 1;
-            long midVal = get(fromIndex + mid);
-
-            if (midVal < key)
-                low = mid + 1;
-            else if (midVal > key)
-                high = mid - 1;
-            else
-                return fromIndex + mid;
-        }
-        return fromIndex + low;
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return mmf.binarySearchUpperInternal(key, fromIndex, n);
     }

-    public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
+    public long binarySearchLower(long key, long fromIndex, long n, long mask) {
         long low = 0;
         long high = n - 1;
@@ -67,7 +53,7 @@ public class MultimapSearcherBase {
     }

-    public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
+    public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) {
         long low = 0;
         long high = n - 1;
@@ -82,62 +68,19 @@ public class MultimapSearcherBase {
             else
                 return fromIndex + mid*step;
         }
-        return fromIndex + low;
+        return fromIndex + low*step;
     }

     public long binarySearch(long key, long fromIndex, long n) {
-        long low = 0;
-        long high = n - 1;
-
-        while (low <= high) {
-            long mid = (low + high) >>> 1;
-            long midVal = get(fromIndex + mid);
-
-            if (midVal < key)
-                low = mid + 1;
-            else if (midVal > key)
-                high = mid - 1;
-            else
-                return fromIndex + mid;
-        }
-        return -1;
+        return mmf.binarySearchInternal(key, fromIndex, n);
     }

     public long binarySearch(long key, long fromIndex, long n, long mask) {
-        long low = 0;
-        long high = n - 1;
-
-        while (low <= high) {
-            long mid = (low + high) >>> 1;
-            long midVal = get(fromIndex + mid) & mask;
-
-            if (midVal < key)
-                low = mid + 1;
-            else if (midVal > key)
-                high = mid - 1;
-            else
-                return fromIndex + mid;
-        }
-        return -1;
+        return mmf.binarySearchInternal(key, fromIndex, n, mask);
     }

     public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
-        long low = 0;
-        long high = n - 1;
-
-        while (low <= high) {
-            long mid = (low + high) >>> 1;
-            long midVal = get(fromIndex + mid*step) & mask;
-
-            if (midVal < key)
-                low = mid + 1;
-            else if (midVal > key)
-                high = mid - 1;
-            else
-                return fromIndex + mid*step;
-        }
-        return -1;
+        return mmf.binarySearchInternal(key, fromIndex, step, n, mask);
     }
 }
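
One naming subtlety worth flagging: the renamed binarySearchLower delegates to binarySearchUpperInternal, whose contract is to return the insertion point (fromIndex + low) when the key is absent, whereas plain binarySearch signals absence with a negative value. A self-contained sketch of the insertion-point contract on an ordinary long[]:

class LowerBoundSketch {
    // Returns the index of `key` if present, else the index where it would be
    // inserted, matching the insertion-point contract described in the diff
    static long search(long[] a, long key) {
        long low = 0, high = a.length - 1;
        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = a[(int) mid];
            if (midVal < key) low = mid + 1;
            else if (midVal > key) high = mid - 1;
            else return mid;
        }
        return low; // insertion point, not -1
    }

    public static void main(String[] args) {
        long[] sorted = {10, 20, 30, 40};
        System.out.println(search(sorted, 30)); // 2: exact hit
        System.out.println(search(sorted, 25)); // 2: where 25 would be inserted
    }
}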

View File

@@ -1,56 +1,85 @@
 package nu.marginalia.util.multimap;

+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import java.io.IOException;
 import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
 import java.nio.LongBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Arrays;

 import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;

 public class MultimapSorter {
     private final Path tmpFileDir;
-    private final int internalSortLimit;
     private final MultimapFileLongSlice multimapFileLong;
-    private final long[] buffer;
+    private final LongBuffer buffer;
+    private final int internalSortLimit;
+    private final int wordSize;

-    public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
+    private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class);
+
+    public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) {
         this.multimapFileLong = multimapFileLong;
         this.tmpFileDir = tmpFileDir;
         this.internalSortLimit = internalSortLimit;
-        buffer = new long[internalSortLimit];
+        this.wordSize = wordSize;
+        buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer();
     }

-    public void sort(long start, int length) throws IOException {
-        if (length <= internalSortLimit) {
-            multimapFileLong.read(buffer, length, start);
-            Arrays.sort(buffer, 0, length);
-            multimapFileLong.write(buffer, length, start);
+    public void sortRange(long start, long end) throws IOException {
+        if (end - start < internalSortLimit) {
+            quickSortLH(start, end - wordSize);
         }
         else {
-            externalSort(start, length);
+            mergeSort(start, (int) (end - start));
+        }
+
+        for (long lp = start + wordSize; lp < end; lp += wordSize) {
+            if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) {
+                logger.error("Sort contract breached [{}:{} ({}), ws={}, <isl={}, bc={}]",
+                        start, end,
+                        end - start,
+                        wordSize, end - start < internalSortLimit,
+                        buffer.capacity());
+            }
         }
     }

+    public void mergeSort(long start, int lengthLongs) throws IOException {
+        if (lengthLongs == 1)
+            return;
+
+        if (lengthLongs < buffer.capacity()) {
+            mergeSort(start, lengthLongs, buffer);
+        }
+        else {
+            Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+lengthLongs), ".dat");
+            try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
+                var workBuffer =
+                        channel.map(FileChannel.MapMode.READ_WRITE, 0, wordSize * lengthLongs * WORD_SIZE)
+                                .asLongBuffer();
+                mergeSort(start, lengthLongs, workBuffer);
+            }
+            finally {
+                tmpFile.toFile().delete();
+            }
+        }
+    }

-    private void externalSort(long start, int length) throws IOException {
-        Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+length), ".dat");
-        try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
-            var workBuffer =
-                    channel.map(FileChannel.MapMode.READ_WRITE, 0, length * WORD_SIZE)
-                            .asLongBuffer();
-
-            int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(internalSortLimit));
+    private void mergeSort(long start, int length, LongBuffer workBuffer) throws IOException {
+        int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(buffer.capacity()));

-            // Do in-memory sorting up until internalSortLimit first
-            for (int i = 0; i < length; i += width) {
-                sort(start + i, Math.min(width, length-i));
-            }
+        // Do in-memory sorting up until internalSortLimit first
+        for (int i = 0; i < length; i += width) {
+            quickSort(start + i, Math.min(width, length-i));
+        }

-            // Then merge sort on disk for the rest
-            for (; width < length; width*=2) {
-                for (int i = 0; i < length; i += 2*width) {
+        // Then finish with merge sort
+        for (; width < length; width*=2) {
+            for (int i = 0; i < length; i += 2*width) {
@@ -58,30 +87,61 @@ public class MultimapSorter {
             }

             workBuffer.clear();
-            multimapFileLong.write(workBuffer, start);
+            multimapFileLong.write(workBuffer, length, start);
         }
     }
-        finally {
-            tmpFile.toFile().delete();
-        }
-    }

     void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
-        int i = left;
-        int j = right;
+        long idxL = left;
+        long idxR = right;

-        for (int k = left; k < end; k++) {
-            final long bufferI = multimapFileLong.get(offset+i);
-            final long bufferJ = multimapFileLong.get(offset+j);
+        for (int putPos = left; putPos < end; putPos+= wordSize) {
+            final long bufferL = multimapFileLong.get(offset+idxL);
+            final long bufferR = multimapFileLong.get(offset+idxR);

-            if (i < right && (j >= end || bufferI < bufferJ)) {
-                workBuffer.put(k, bufferI);
-                i++;
+            if (idxL < right && (idxR >= end || bufferL < bufferR)) {
+                workBuffer.put(putPos, bufferL);
+                for (int s = 1; s < wordSize; s++) {
+                    workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s));
+                }
+                idxL+= wordSize;
             }
             else {
-                workBuffer.put(k, bufferJ);
-                j++;
+                workBuffer.put(putPos, bufferR);
+                for (int s = 1; s < wordSize; s++) {
+                    workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s));
+                }
+                idxR+= wordSize;
             }
         }
     }
+
+    public void insertionSort(long start, int n) {
+        multimapFileLong.insertionSort(wordSize, start, n);
+    }
+
+    private void swap(long a, long b) {
+        multimapFileLong.swapn(wordSize, a, b);
+    }
+
+    public void quickSort(long start, long length) {
+        quickSortLH(start, start + length - wordSize);
+    }
+
+    public void quickSortLH(long low, long highInclusive) {
+        if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
+            if (highInclusive - low < 32) {
+                multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize));
+            }
+            else {
+                long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
+
+                quickSortLH(low, p);
+                quickSortLH(p + wordSize, highInclusive);
+            }
+        }
+    }
 }
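
The mergeSort above is a bottom-up merge sort: runs of `width` longs are first sorted in place, then runs are merged at doubling widths until the whole range is ordered. The same control flow on an in-memory array, as a runnable sketch (runWidth plays the role of the internal sort limit):

import java.util.Arrays;

class WidthDoublingSketch {
    // Bottom-up merge sort: sort fixed-width runs first, then merge runs of
    // doubling width, mirroring MultimapSorter.mergeSort's structure on disk
    static void sort(long[] a, int runWidth) {
        for (int i = 0; i < a.length; i += runWidth)
            Arrays.sort(a, i, Math.min(i + runWidth, a.length));

        long[] work = new long[a.length];
        for (int width = runWidth; width < a.length; width *= 2) {
            for (int i = 0; i < a.length; i += 2 * width)
                merge(a, i, Math.min(i + width, a.length), Math.min(i + 2 * width, a.length), work);
            System.arraycopy(work, 0, a, 0, a.length);
        }
    }

    static void merge(long[] a, int left, int right, int end, long[] work) {
        int i = left, j = right;
        for (int k = left; k < end; k++)
            work[k] = (i < right && (j >= end || a[i] <= a[j])) ? a[i++] : a[j++];
    }

    public static void main(String[] args) {
        long[] v = {9, 4, 7, 1, 8, 2, 6, 3};
        sort(v, 2);
        System.out.println(Arrays.toString(v)); // [1, 2, 3, 4, 6, 7, 8, 9]
    }
}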

View File

@@ -11,27 +11,16 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.sql.SQLException;
-import java.util.HashSet;
-import java.util.Set;
 import java.util.concurrent.LinkedBlockingQueue;

 public class UpdateDomainRanksTool2 {
     private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);

-    public Set<String> originDomains = new HashSet<>();
-    public Set<Integer> originDomainIds = new HashSet<>();
     public final long domainIdMax = -1;
     public int domainCount;
     private volatile static int rankMax;

-    public int maxId() {
-        return (int) domainIdMax;
-    }
-    public int domainCount() {
-        return domainCount;
-    }
-
     static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
     volatile static boolean running = true;
@@ -44,23 +33,14 @@ public class UpdateDomainRanksTool2 {
         var uploader = new Thread(() -> uploadThread(conn), "Uploader");

         logger.info("Ranking");
-        // "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
-        // "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
         var ds = new DatabaseModule().provideConnection();
         var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
         var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
-        // var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
-        // var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");

         var rankVector = rpr.pageRankVector();
-        var norm = rankVector.norm();
         rankMax = rpr.size();

         uploader.start();
-        rankMax = rpr.size();

         rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
             try {
                 uploadQueue.put(i);

View File

@@ -0,0 +1,298 @@
package nu.marginalia.util.tool;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
import nu.marginalia.util.AndCardIntSet;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.roaringbitmap.RoaringBitmap;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import static nu.marginalia.util.AndCardIntSet.*;
public class EdgeDomainLinkConsineSimilarityMain {
ArrayList<Integer> idsList = new ArrayList<>(100_000);
ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000);
TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000);
TIntIntHashMap aliasMap = new TIntIntHashMap(100_000, 0.75f, -1, -1);
TIntHashSet indexed = new TIntHashSet(100_000);
float weights[];
private HikariDataSource dataSource;
public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException {
this.dataSource = dataSource;
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000);
try (
var conn = dataSource.getConnection();
var aliasStmt = conn.prepareStatement("SELECT ID, DOMAIN_ALIAS FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NOT NULL");
var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0");
var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
ResultSet rsp;
aliasStmt.setFetchSize(10_000);
rsp = aliasStmt.executeQuery();
while (rsp.next()) {
aliasMap.put(rsp.getInt(1), rsp.getInt(2));
}
indexedStmt.setFetchSize(10_000);
rsp = indexedStmt.executeQuery();
while (rsp.next()) {
indexed.add(rsp.getInt(1));
}
linksStmt.setFetchSize(10_000);
rsp = linksStmt.executeQuery();
while (rsp.next()) {
int source = deAlias(rsp.getInt(1));
int dest = deAlias(rsp.getInt(2));
tmpMap.computeIfAbsent(dest, this::createBitmapWithSelf).add(source);
}
}
tmpMap.entrySet().stream()
.filter(e -> isEligible(e.getValue()))
.forEach(e -> {
var val = of(e.getValue());
idsList.add(e.getKey());
itemsList.add(val);
dToSMap.put(e.getKey(), val);
});
weights = new float[1 + idsList.stream().mapToInt(i -> i).max().orElse(0)];
for (int i = 0; i < idsList.size(); i++) {
weights[idsList.get(i)] = getWeight(idsList.get(i));
}
}
private boolean isEligible(RoaringBitmap value) {
int cardinality = value.getCardinality();
return cardinality < 10000;
}
private int deAlias(int id) {
int val = aliasMap.get(id);
if (val < 0)
return id;
return val;
}
LinkedBlockingDeque<DomainSimilarities> similaritiesLinkedBlockingDeque = new LinkedBlockingDeque<>(10);
volatile boolean running;
@SneakyThrows
public void tryDomains(String... domainName) {
var dataStoreDao = new EdgeDataStoreDaoImpl(dataSource);
System.out.println(Arrays.toString(domainName));
int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new)
.map(dataStoreDao::getDomainId)
.mapToInt(EdgeId::id)
.map(this::deAlias)
.toArray();
for (int domainId : domainIds) {
findAdjacentDtoS(domainId, similarities -> {
for (var similarity : similarities.similarities()) {
if (indexed.contains(similarity.domainId)) System.out.print("*");
System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
}
});
}
}
private String prettyPercent(double val) {
return String.format("%2.2f%%", 100. * val);
}
@SneakyThrows
public void loadAll() {
running = true;
var thread = new Thread(this::insertThreadRun);
thread.start();
idsList.parallelStream()
.filter(id -> !aliasMap.containsKey(id))
.forEach(id -> findAdjacent(id, this::addToQueue));
running = false;
thread.join();
}
@SneakyThrows
void addToQueue(DomainSimilarities similarities) {
similaritiesLinkedBlockingDeque.putLast(similarities);
}
public void insertThreadRun() {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement(
"""
INSERT INTO EC_DOMAIN_NEIGHBORS_2
(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
VALUES (?, ?, ?)
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS))
""")
) {
while (running || !similaritiesLinkedBlockingDeque.isEmpty()) {
var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS);
if (item == null) continue;
for (var similarity : item.similarities) {
stmt.setInt(1, item.domainId);
stmt.setInt(2, similarity.domainId);
stmt.setDouble(3, similarity.value);
stmt.addBatch();
}
stmt.executeBatch();
}
} catch (SQLException | InterruptedException e) {
throw new RuntimeException(e);
}
}
public RoaringBitmap createBitmapWithSelf(int val) {
var bm = new RoaringBitmap();
bm.add(val);
return bm;
}
public void findAdjacent(int domainId, Consumer<DomainSimilarities> andThen) {
findAdjacentDtoS(domainId, andThen);
}
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
double andCardinality = andCardinality(a, b);
andCardinality /= Math.sqrt(a.getCardinality());
andCardinality /= Math.sqrt(b.getCardinality());
return andCardinality;
}
double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
}
float getWeight(int i) {
var vector = dToSMap.get(i);
if (vector == null) return 1.0f;
return 1.0f / (float) Math.log(2+vector.getCardinality());
}
record DomainSimilarities(int domainId, List<DomainSimilarity> similarities) {};
record DomainSimilarity(int domainId, double value) {};
@SneakyThrows
private void findAdjacentDtoS(int domainId, Consumer<DomainSimilarities> andThen) {
var vector = dToSMap.get(domainId);
if (vector == null || !vector.cardinalityExceeds(10)) {
return;
}
System.out.println("DtoS " + domainId);
List<DomainSimilarity> similarities = new ArrayList<>(1000);
        /* The similarity measure is  |a ∩ b| / sqrt(|a| |b|).  Since |a ∩ b| is at
         * most min(|a|, |b|), a candidate vector b needs a cardinality of at least
         * k^2 |a| for its similarity against a to be able to reach the threshold k.
         * With k = 0.1 that gives the 0.01 factor below.
         */
        int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality()));
for (int i = 0; i < itemsList.size(); i++) {
int id = idsList.get(i);
if (id == domainId)
continue;
var otherVec = itemsList.get(i);
if (otherVec.getCardinality() < cardMin)
continue;
double similarity = cosineSimilarity(vector, otherVec);
if (similarity > 0.1) {
var recalculated = expensiveCosineSimilarity(vector, otherVec);
if (recalculated > 0.1) {
similarities.add(new DomainSimilarity(id, recalculated));
}
}
}
if (similarities.size() > 128) {
similarities.sort(Comparator.comparing(DomainSimilarity::value));
similarities.subList(0, similarities.size() - 128).clear();
}
andThen.accept(new DomainSimilarities(domainId, similarities));
}
// @SneakyThrows
// private void findAdjacentDtoS(Consumer<DomainSimilarities> andThen, int... domainIds) {
// var vectors = Arrays.stream(domainIds).mapToObj(dToSMap::get)
// .filter(Objects::nonNull)
// .filter(vec -> vec.cardinalityExceeds(10))
// .toArray(AndCardIntSet[]::new);
// Set<Integer> domainIdsSet = new HashSet<>(Arrays.stream(domainIds).boxed().toList());
//
// if (vectors.length != domainIds.length)
// return;
//
// List<DomainSimilarity> similarities = dToSMap.entrySet().parallelStream()
// .filter(e -> !domainIdsSet.contains(e.getKey()) && indexed.contains(e.getKey()))
// .flatMap(entry -> {
//
// double similarity = 0.;
// for (var vector : vectors) {
// similarity += cosineSimilarity(vector, entry.getValue());
// }
//
// if (similarity > 0.1 * vectors.length) {
// double recalculated = 0;
// for (var vector : vectors) {
// recalculated += expensiveCosineSimilarity(vector, entry.getValue());
// }
// if (recalculated > 0.1 * vectors.length) {
// return Stream.of(new DomainSimilarity(entry.getKey(), recalculated));
// }
// }
// return Stream.empty();
// }).sorted(Comparator.comparing(DomainSimilarity::value))
// .toList();
//
// andThen.accept(new DomainSimilarities(domainIds[0], similarities));
// }
public static void main(String[] args) throws SQLException {
DatabaseModule dm = new DatabaseModule();
var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection());
if (args.length == 0) {
main.loadAll();
}
else {
main.tryDomains(args);
}
}
}
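
The two-stage test above first applies the cheap unweighted cosine over link sets and only then the weighted variant. A standalone sketch of the cheap version, with java.util.Set standing in for AndCardIntSet and made-up link IDs:

import java.util.Set;

class SetCosineSketch {
    // Cosine similarity between two sets viewed as 0/1 vectors:
    // |a ∩ b| / sqrt(|a| * |b|), the first-pass test from the class above
    static double cosine(Set<Integer> a, Set<Integer> b) {
        long common = a.stream().filter(b::contains).count();
        return common / Math.sqrt((double) a.size() * b.size());
    }

    public static void main(String[] args) {
        Set<Integer> linksToA = Set.of(1, 2, 3, 4);
        Set<Integer> linksToB = Set.of(2, 3, 4, 5, 6);
        System.out.printf("%.3f%n", cosine(linksToA, linksToB)); // 3/sqrt(20) ≈ 0.671
    }
}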

View File

@@ -2,8 +2,14 @@ package nu.marginalia.wmsa.api.model;

 import lombok.AllArgsConstructor;
 import lombok.Getter;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
 import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;

+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
 @AllArgsConstructor @Getter
 public class ApiSearchResult {
     public String url;
@@ -11,10 +17,30 @@ public class ApiSearchResult {
     public String description;
     public double quality;

+    public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
+
     public ApiSearchResult(EdgeUrlDetails url) {
         this.url = url.url.toString();
         this.title = url.getTitle();
         this.description = url.getDescription();
         this.quality = url.getTermScore();
+
+        if (url.resultItem != null) {
+            var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));
+
+            outer:
+            for (var entries : bySet.values()) {
+                List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
+                for (var entry : entries) {
+                    var metadata = entry.metadata();
+                    if (metadata.isEmpty())
+                        continue outer;
+
+                    Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
+                    lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(), metadata.count(), flags));
+                }
+                details.add(lst);
+            }
+        }
     }
 }
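
The grouping step above buckets keyword scores by the subquery ("set") they came from, producing one inner list per subquery. A runnable illustration of Collectors.groupingBy used this way; the Score record is a made-up stand-in for EdgeSearchResultKeywordScore:

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

class GroupScoresSketch {
    // Stand-in: a keyword score tagged with the subquery ("set") it came from
    record Score(int set, String keyword, int tfIdf) {}

    public static void main(String[] args) {
        List<Score> scores = List.of(
                new Score(0, "linux", 120), new Score(0, "kernel", 80),
                new Score(1, "linux", 120));

        Map<Integer, List<Score>> bySet =
                scores.stream().collect(Collectors.groupingBy(Score::set));

        // One inner list per subquery, mirroring List<List<ApiSearchResultQueryDetails>>
        bySet.values().forEach(System.out::println);
    }
}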

View File

@@ -0,0 +1,16 @@
package nu.marginalia.wmsa.api.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import java.util.Set;
@AllArgsConstructor @Getter
public class ApiSearchResultQueryDetails {
String keyword;
int tfIdf;
int count;
Set<String> flagsUnstableAPI;
}

View File

@@ -5,6 +5,7 @@ import nu.marginalia.wmsa.auth.AuthMain;
 import nu.marginalia.wmsa.configuration.command.*;
 import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
 import nu.marginalia.wmsa.edge.dating.DatingMain;
+import nu.marginalia.wmsa.edge.explorer.ExplorerMain;
 import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
 import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
 import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
@@ -37,6 +38,7 @@ public enum ServiceDescriptor {
     ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),
     DATING("dating", 5070, DatingMain.class),
+    EXPLORER("explorer", 5071, ExplorerMain.class),

     TEST_1("test-1", 0, null),
     TEST_2("test-2", 0, null);
@@ -77,7 +79,8 @@ public enum ServiceDescriptor {
     public static void main(String... args) {
         MainMapLookup.setMainArguments(args);
-        Map<String, Command> functions = Stream.of(new ListCommand(),
+        Map<String, Command> functions = Stream.of(
+                new ListCommand(),
                 new StartCommand(),
                 new ConvertCommand(),
                 new CrawlCommand(),

View File

@@ -12,6 +12,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import spark.Request;
 import spark.Response;
+import spark.Spark;

 import java.sql.SQLException;
@@ -85,6 +86,12 @@ public class ScreenshotService {
     }

     private Object serveSvgPlaceholder(Response response, int id) {
+
+        var domainName = edgeDataStoreDao.getDomain(new EdgeId<>(id)).map(Object::toString);
+        if (domainName.isEmpty()) {
+            Spark.halt(404);
+        }
+
         response.type("image/svg+xml");

         return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
             "<svg\n" +
@@ -111,6 +118,6 @@ public class ScreenshotService {
             "       style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
             "       x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
             "  </g>\n" +
-            "</svg>\n", edgeDataStoreDao.getDomain(new EdgeId<>(id)));
+            "</svg>\n", domainName.get());
     }
 }

View File

@@ -0,0 +1,69 @@
package nu.marginalia.wmsa.edge.converting;
import com.github.luben.zstd.ZstdOutputStream;
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
public class ConversionLog implements AutoCloseable, Interpreter {
private final PrintWriter writer;
public ConversionLog(Path rootDir) throws IOException {
String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC));
Path logFile = rootDir.resolve(fileName);
writer = new PrintWriter(new ZstdOutputStream(
new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE))));
}
@Override
public void close() throws IOException {
writer.close();
}
@Override
public void loadUrl(EdgeUrl[] url) {}
@Override
public void loadDomain(EdgeDomain[] domain) {}
@Override
public void loadRssFeed(EdgeUrl[] rssFeed) {}
@Override
public void loadDomainLink(DomainLink[] links) {}
@Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {}
@Override
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
@Override
public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason());
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
@Override
public void loadDomainRedirect(DomainLink link) {}
}
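
The log is a single PrintWriter over a zstd-compressed stream shared by the converter's worker threads, which is presumably why loadProcessedDocumentWithError is synchronized. A minimal sketch of the same writer stack, assuming zstd-jni (com.github.luben.zstd) is on the classpath; the URL and reason strings are invented:

import com.github.luben.zstd.ZstdOutputStream;

import java.io.BufferedOutputStream;
import java.io.PrintWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

class ZstdLogSketch {
    public static void main(String[] args) throws Exception {
        Path logFile = Files.createTempFile("conversion-log", ".zstd");
        // Same stack as ConversionLog: PrintWriter over zstd over a buffered stream
        try (var writer = new PrintWriter(new ZstdOutputStream(
                new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE))))) {
            writer.printf("%s\t%s%n", "https://example.com/page", "ProcessingError");
        }
        System.out.println("wrote " + Files.size(logFile) + " compressed bytes");
    }
}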

View File

@@ -54,5 +54,4 @@ public class ConvertedDomainReader {
         return ret;
     }
 }

View File

@@ -5,9 +5,9 @@ import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
 import nu.marginalia.util.ParallelPipe;
+import nu.marginalia.wmsa.edge.converting.compiler.InstructionsCompiler;
 import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
 import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
-import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
 import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
 import nu.marginalia.wmsa.edge.crawling.WorkLog;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
@@ -47,11 +47,15 @@ public class ConverterMain {
             Gson gson
             ) throws Exception {
-        instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);
-
         logger.info("Starting pipe");

-        try (WorkLog processLog = plan.createProcessWorkLog()) {
+        try (WorkLog processLog = plan.createProcessWorkLog();
+             ConversionLog log = new ConversionLog(plan.process.getDir())) {
+
+            instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
+
             var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {

                 @Override

View File

@@ -24,10 +24,13 @@ import java.util.List;

 public class LoadInstructionWriter {

+    private ConversionLog log;
     private final Path outputDir;
     private final Gson gson;
     private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);

-    public LoadInstructionWriter(Path outputDir, Gson gson) {
+    public LoadInstructionWriter(ConversionLog log, Path outputDir, Gson gson) {
+        this.log = log;
         this.outputDir = outputDir;
         this.gson = gson;
@@ -35,6 +38,7 @@ public class LoadInstructionWriter {
             throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
         }
     }

     public String accept(String id, List<Instruction> instructionList) throws IOException {
         Path outputFile = getOutputFile(id);
@@ -48,6 +52,8 @@ public class LoadInstructionWriter {
         logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);

         for (var instr : instructionList) {
+            instr.apply(log);
+
             outputStream.append(instr.tag().name());
             outputStream.append(' ');
             gson.toJson(instr, outputStream);
@@ -66,6 +72,7 @@ public class LoadInstructionWriter {
         if (!Files.exists(destDir)) {
             Files.createDirectories(destDir);
         }
+
         return destDir.resolve(id + ".pzstd");
     }

View File

@@ -70,7 +70,11 @@ public class ReindexTriggerMain {
    };
    client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
+   if (!Boolean.getBoolean("no-preconvert")) {
        client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
+   }
    for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
        client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
    }
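The preconvert step is now skippable through a JVM system property. A note on the semantics, since Boolean.getBoolean is easy to misread (the invocation below is illustrative, not from the commit):

// Illustrative invocation, not part of the commit:
//   java -Dno-preconvert=true ... ReindexTriggerMain <host>
// Boolean.getBoolean("no-preconvert") reads the *system property* of that
// name and is true only if it is present and equal to "true"; with no flag
// given, the preconvert call still runs, as before.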

View File

@@ -0,0 +1,58 @@
package nu.marginalia.wmsa.edge.converting.compiler;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import java.util.List;
public class DocumentsCompiler {
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
for (var doc : documents) {
compileDocumentDetails(ret, doc);
}
for (var doc : documents) {
compileWords(ret, doc);
}
}
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
var details = doc.details;
if (details != null) {
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
}
else {
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason));
}
}
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
var words = doc.words;
if (words != null) {
var wordsArray = words.values().stream()
.filter(this::filterNonTransients)
.map(DocumentKeywords::new)
.toArray(DocumentKeywords[]::new);
ret.add(new LoadKeywords(doc.url, wordsArray));
}
}
private boolean filterNonTransients(EdgePageWords words) {
return words.block.type != IndexBlockType.TRANSIENT;
}
}

View File

@@ -0,0 +1,23 @@
package nu.marginalia.wmsa.edge.converting.compiler;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadRssFeed;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.List;
import java.util.Objects;
public class FeedsCompiler {
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
.filter(Objects::nonNull)
.flatMap(dets -> dets.feedLinks.stream())
.distinct()
.toArray(EdgeUrl[]::new);
ret.add(new LoadRssFeed(feeds));
}
}

View File

@@ -0,0 +1,57 @@
package nu.marginalia.wmsa.edge.converting.compiler;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDomain;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import java.util.ArrayList;
import java.util.List;
public class InstructionsCompiler {
private final UrlsCompiler urlsCompiler;
private final DocumentsCompiler documentsCompiler;
private final FeedsCompiler feedsCompiler;
private final LinksCompiler linksCompiler;
private final RedirectCompiler redirectCompiler;
@Inject
public InstructionsCompiler(UrlsCompiler urlsCompiler,
DocumentsCompiler documentsCompiler,
FeedsCompiler feedsCompiler,
LinksCompiler linksCompiler,
RedirectCompiler redirectCompiler)
{
this.urlsCompiler = urlsCompiler;
this.documentsCompiler = documentsCompiler;
this.feedsCompiler = feedsCompiler;
this.linksCompiler = linksCompiler;
this.redirectCompiler = redirectCompiler;
}
public List<Instruction> compile(ProcessedDomain domain) {
List<Instruction> ret = new ArrayList<>(domain.size()*4);
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
if (domain.documents != null) {
urlsCompiler.compile(ret, domain.documents);
documentsCompiler.compile(ret, domain.documents);
feedsCompiler.compile(ret, domain.documents);
linksCompiler.compile(ret, domain.domain, domain.documents);
}
if (domain.redirect != null) {
redirectCompiler.compile(ret, domain.domain, domain.redirect);
}
return ret;
}
}
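The monolithic compiler (deleted further down in this diff) is thus split into five single-purpose sub-compilers composed through Guice. A minimal hand-wired sketch of how the pieces fit together outside the injector, assuming a ProcessedDomain in hand; the sub-compilers have default no-arg constructors as shown above:

// Hand-wired composition sketch (Guice does this wiring in production):
InstructionsCompiler compiler = new InstructionsCompiler(
        new UrlsCompiler(),
        new DocumentsCompiler(),
        new FeedsCompiler(),
        new LinksCompiler(),
        new RedirectCompiler());
List<Instruction> instructions = compiler.compile(processedDomain);
// Note the compile order: domains and URLs are emitted first, so the
// loader can resolve ids before document and link instructions arrive.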

View File

@@ -0,0 +1,26 @@
package nu.marginalia.wmsa.edge.converting.compiler;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import java.util.List;
import java.util.Objects;
public class LinksCompiler {
public void compile(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
DomainLink[] links = documents.stream().map(doc -> doc.details)
.filter(Objects::nonNull)
.flatMap(dets -> dets.linksExternal.stream())
.map(link -> link.domain)
.distinct()
.map(domain -> new DomainLink(from, domain))
.toArray(DomainLink[]::new);
ret.add(new LoadDomainLink(links));
}
}

View File

@@ -0,0 +1,19 @@
package nu.marginalia.wmsa.edge.converting.compiler;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainRedirect;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import java.util.List;
public class RedirectCompiler {
public void compile(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
ret.add(new LoadDomain(to));
ret.add(new LoadDomainLink(new DomainLink(from, to)));
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
}
}

View File

@@ -0,0 +1,49 @@
package nu.marginalia.wmsa.edge.converting.compiler;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadUrl;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
public class UrlsCompiler {
private static final int MAX_INTERNAL_LINKS = 25;
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
for (var doc : documents) {
seenUrls.add(doc.url);
if (doc.details != null) {
for (var url : doc.details.linksExternal) {
if (seenDomains.add(url.domain)) {
seenUrls.add(url);
}
}
if (doc.isOk()) {
// Don't load more than a few from linksInternal, grows too big for no reason
var linksToAdd = new ArrayList<>(doc.details.linksInternal);
if (linksToAdd.size() > MAX_INTERNAL_LINKS) {
linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear();
}
seenUrls.addAll(linksToAdd);
}
}
}
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
}
}
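The internal-link cap uses the subList(...).clear() idiom, which truncates the backing list in place instead of copying it. In isolation, with illustrative values:

List<String> linksToAdd = new ArrayList<>(List.of("a", "b", "c", "d"));
int max = 2; // stands in for MAX_INTERNAL_LINKS
if (linksToAdd.size() > max) {
    linksToAdd.subList(max, linksToAdd.size()).clear(); // drops "c", "d"
}
// linksToAdd is now [a, b]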

View File

@@ -1,17 +1,47 @@
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import java.util.Arrays;
-public record DocumentKeywords(IndexBlock block, String... keywords) {
+public record DocumentKeywords(IndexBlock block,
+                               String[] keywords,
+                               long[] metadata) {
    public DocumentKeywords(EdgePageWords words) {
-       this(words.block, words.words.toArray(String[]::new));
+       this(words.block,
+            words.words.toArray(String[]::new),
+            words.metadata.toArray());
    }
    @Override
    public String toString() {
-       return getClass().getSimpleName()+"["+block +", "+Arrays.toString(keywords)+"]";
+       StringBuilder sb = new StringBuilder();
+       sb.append(getClass().getSimpleName());
+       sb.append('[').append(block).append(", ");
+       for (int i = 0; i < keywords.length; i++) {
+           sb.append("\n\t ");
+           if (metadata[i] != 0) {
+               sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i]));
+           }
+           else {
+               sb.append(keywords[i]);
+           }
+       }
+       return sb.append("\n]").toString();
    }
+   public boolean isEmpty() {
+       return keywords.length == 0;
+   }
+   public int size() {
+       return keywords.length;
+   }
+   public DocumentKeywords subList(int start, int end) {
+       return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
+   }
}
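keywords and metadata are parallel arrays, so subList slices both in lockstep. A hypothetical batching loop a loader might build on it (the chunk size is an assumed value, not from this commit):

int BATCH = 1000; // assumed chunk size
for (int start = 0; start < keywords.size(); start += BATCH) {
    int end = Math.min(start + BATCH, keywords.size());
    DocumentKeywords chunk = keywords.subList(start, end);
    // write `chunk`; keywords[i] and metadata[i] stay aligned
}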

View File

@@ -8,7 +8,8 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
public record LoadProcessedDocumentWithError(EdgeUrl url,
-                                            EdgeUrlState state) implements Instruction
+                                            EdgeUrlState state,
+                                            String reason) implements Instruction
{
    @Override
    public void apply(Interpreter interpreter) {

View File

@@ -25,34 +25,13 @@ public class SqlLoadUrls {
    @Inject
    public SqlLoadUrls(HikariDataSource dataSource) {
        this.dataSource = dataSource;
-       try (var conn = dataSource.getConnection()) {
-           try (var stmt = conn.createStatement()) {
-               stmt.execute("DROP PROCEDURE IF EXISTS INSERT_URL");
-               stmt.execute("""
-                       CREATE PROCEDURE INSERT_URL (
-                           IN PROTO VARCHAR(255),
-                           IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
-                           IN PORT INT,
-                           IN PATH VARCHAR(255),
-                           IN PARAM VARCHAR(255),
-                           IN PATH_HASH BIGINT
-                       )
-                       BEGIN
-                           INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
-                       END
-                       """);
-           }
-       }
-       catch (SQLException ex) {
-           throw new RuntimeException("Failed to set up loader", ex);
-       }
    }
    public void load(LoaderData data, EdgeUrl[] urls) {
        Set<EdgeDomain> affectedDomains = new HashSet<>();
        try (var conn = dataSource.getConnection();
-            var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
+            var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)");
             var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
        )
        {
@@ -67,7 +46,7 @@ public class SqlLoadUrls {
            affectedDomains.add(url.domain);
            insertCall.setString(1, url.proto);
-           insertCall.setString(2, url.domain.toString());
+           insertCall.setInt(2, data.getDomainId(url.domain));
            if (url.port != null) {
                insertCall.setInt(3, url.port);
            }
@@ -79,7 +58,7 @@ public class SqlLoadUrls {
            insertCall.setLong(6, hashPath(url.path, url.param));
            insertCall.addBatch();
-           if (cnt++ == 250) {
+           if (cnt++ == 1000) {
                var ret = insertCall.executeBatch();
                conn.commit();
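Net effect: the INSERT_URL stored procedure is gone. The loader now resolves the domain id in Java via data.getDomainId(...) and issues a plain batched INSERT IGNORE, with the flush threshold raised from 250 to 1000 rows. The batching shape, reduced to a skeleton (auto-commit assumed off, as implied by the explicit conn.commit(); the counter reset is assumed, since the hunk cuts off):

int cnt = 0;
for (var url : urls) {
    // ... bind the six parameters as above ...
    insertCall.addBatch();
    if (cnt++ == 1000) {           // flush roughly every 1000 rows
        insertCall.executeBatch();
        conn.commit();
        cnt = 0;                   // assumed reset, not visible in the hunk
    }
}
insertCall.executeBatch();         // final partial batch
conn.commit();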

View File

@@ -1,11 +1,18 @@
package nu.marginalia.wmsa.edge.converting.model;
+import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
public class DisqualifiedException extends Exception {
    public final DisqualificationReason reason;
    public DisqualifiedException(DisqualificationReason reason) {
        this.reason = reason;
    }
+   public DisqualifiedException(CrawlerDocumentStatus crawlerStatus) {
+       this.reason = DisqualificationReason.fromCrawlerStatus(crawlerStatus);
+   }
    @Override
    public Throwable fillInStackTrace() {
        return this;
@@ -18,6 +25,22 @@ public class DisqualifiedException extends Exception {
        STATUS,
        QUALITY,
        ACCEPTABLE_ADS,
-       FORBIDDEN
+       FORBIDDEN,
+       SHORT_CIRCUIT,
+       PROCESSING_EXCEPTION,
+       BAD_CONTENT_TYPE,
+       BAD_CHARSET,
+       REDIRECT,
+       ROBOTS_TXT,
+       ERROR,
+       Timeout, // Don't you dare
+       BAD_CANONICAL
+       ;
+       public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {
+           return DisqualificationReason.valueOf(crawlerStatus.name());
+       }
    }
}
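fromCrawlerStatus depends on name parity: every CrawlerDocumentStatus constant must exist verbatim in DisqualificationReason, or valueOf throws IllegalArgumentException at runtime. That is presumably why the oddly-cased Timeout, warning comment and all, mirrors the crawler enum's casing. For example (assuming BAD_CHARSET exists on the crawler side, as the mirrored constant suggests):

var r = DisqualificationReason.fromCrawlerStatus(CrawlerDocumentStatus.BAD_CHARSET);
// r == DisqualificationReason.BAD_CHARSET
// Renaming Timeout -> TIMEOUT here would still compile, then fail with
// IllegalArgumentException the first time the crawler reports a timeout.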

View File

@@ -17,6 +17,10 @@ public class ProcessedDocument {
    public EdgeUrlState state;
    public String stateReason;
+   public boolean isOk() {
+       return EdgeUrlState.OK == state;
+   }
    public OptionalDouble quality() {
        if (details != null) {
            return OptionalDouble.of(details.quality);

View File

@@ -7,6 +7,7 @@ import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
@@ -81,32 +82,12 @@ public class DocumentProcessor {
        return ret;
    }
    public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
        ProcessedDocument ret = new ProcessedDocument();
        try {
-           ret.url = getDocumentUrl(crawledDocument);
-           ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
-           if (ret.state == EdgeUrlState.OK) {
-               if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
-                   throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
-               }
-               if (isAcceptedContentType(crawledDocument)) {
-                   var detailsWords = createDetails(crawledDomain, crawledDocument);
-                   ret.details = detailsWords.details();
-                   ret.words = detailsWords.words();
-               }
-               else {
-                   throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
-               }
-           }
-           else {
-               throw new DisqualifiedException(DisqualificationReason.STATUS);
-           }
+           processDocument(crawledDocument, crawledDomain, ret);
        }
        catch (DisqualifiedException ex) {
            ret.state = EdgeUrlState.DISQUALIFIED;
@@ -115,6 +96,7 @@ public class DocumentProcessor {
        }
        catch (Exception ex) {
            ret.state = EdgeUrlState.DISQUALIFIED;
+           ret.stateReason = DisqualificationReason.PROCESSING_EXCEPTION.toString();
            logger.info("Failed to convert " + crawledDocument.url, ex);
            ex.printStackTrace();
        }
@@ -122,6 +104,32 @@ public class DocumentProcessor {
        return ret;
    }
+   private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
+       var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
+       if (crawlerStatus != CrawlerDocumentStatus.OK) {
+           throw new DisqualifiedException(crawlerStatus);
+       }
+       if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
+           throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
+       }
+       if (!isAcceptedContentType(crawledDocument)) {
+           throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
+       }
+       ret.url = getDocumentUrl(crawledDocument);
+       ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
+       var detailsWithWordsLinks = createDetails(crawledDomain, crawledDocument);
+       ret.details = detailsWithWordsLinks.details();
+       ret.words = detailsWithWordsLinks.words();
+   }
    private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
            throws URISyntaxException
    {
@@ -193,9 +201,11 @@ public class DocumentProcessor {
        ret.standard = getHtmlStandard(doc);
        ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-       ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
+       ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
        ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
+       KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality);
        EdgePageWordSet words;
        if (shouldDoSimpleProcessing(url, ret)) {
            /* Some documents we'll index, but only superficially. This is a compromise
@@ -203,12 +213,12 @@ public class DocumentProcessor {
               queries. This also saves a lot of processing power.
             */
            ret.features = Set.of(HtmlFeature.UNKNOWN);
-           words = keywordExtractor.extractKeywordsMinimal(dld);
+           words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata);
            ret.description = "";
        }
        else {
            ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
-           words = keywordExtractor.extractKeywords(dld);
+           words = keywordExtractor.extractKeywords(dld, keywordMetadata);
            ret.description = getDescription(doc);
        }
@@ -239,6 +249,10 @@ public class DocumentProcessor {
            return true;
        }
+       // Annoying wordpress crap
+       if (url.path.startsWith("/tag/") && url.path.endsWith("/")) {
+           return true;
+       }
        return false;
    }
@@ -262,7 +276,7 @@ public class DocumentProcessor {
        ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
-       words.append(IndexBlock.Meta, tagWords);
+       words.appendWithNoMeta(IndexBlock.Meta, tagWords);
    }
    private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -296,14 +310,21 @@ public class DocumentProcessor {
                .ifPresent(lp::acceptFeed);
        }
+       createLinkKeywords(words, lp);
+       createFileLinkKeywords(words, lp, domain);
+   }
+   private void createLinkKeywords(EdgePageWordSet words, LinkProcessor lp) {
        final Set<String> linkTerms = new HashSet<>();
        for (var fd : lp.getForeignDomains()) {
            linkTerms.add("links:"+fd.toString().toLowerCase());
            linkTerms.add("links:"+fd.getDomain().toLowerCase());
        }
-       words.append(IndexBlock.Meta, linkTerms);
+       words.appendWithNoMeta(IndexBlock.Meta, linkTerms);
+   }
+   private void createFileLinkKeywords(EdgePageWordSet words, LinkProcessor lp, EdgeDomain domain) {
        Set<String> fileKeywords = new HashSet<>(100);
        for (var link : lp.getNonIndexableUrls()) {
@@ -314,8 +335,8 @@ public class DocumentProcessor {
            synthesizeFilenameKeyword(fileKeywords, link);
        }
-       words.append(IndexBlock.Artifacts, fileKeywords);
+       words.appendWithNoMeta(IndexBlock.Artifacts, fileKeywords);
    }
    private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
@@ -364,5 +385,7 @@ public class DocumentProcessor {
        return doc.text().length();
    }
-   private record DetailsWithWords(ProcessedDocumentDetails details, EdgePageWordSet words) {}
+   private record DetailsWithWords(ProcessedDocumentDetails details,
+                                   EdgePageWordSet words) {}
}

View File

@@ -3,17 +3,22 @@ package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.google.inject.name.Named;
+import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
+import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import java.util.*;
+import java.util.stream.Collectors;
import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;
@@ -47,6 +52,8 @@ public class DomainProcessor {
            fixBadCanonicalTags(crawledDomain.doc);
+           InternalLinkGraph internalLinkGraph = new InternalLinkGraph();
            DocumentDisqualifier disqualifier = new DocumentDisqualifier();
            for (var doc : crawledDomain.doc) {
                if (disqualifier.isQualified()) {
@@ -54,6 +61,9 @@ public class DomainProcessor {
                    if (processedDoc.url != null) {
                        ret.documents.add(processedDoc);
+                       internalLinkGraph.accept(processedDoc);
                        processedDoc.quality().ifPresent(disqualifier::offer);
                    }
                    else if ("LANGUAGE".equals(processedDoc.stateReason)) {
@@ -62,24 +72,16 @@ public class DomainProcessor {
                }
                else { // Short-circuit processing if quality is too low
                    var stub = documentProcessor.makeDisqualifiedStub(doc);
+                   stub.stateReason = DisqualifiedException.DisqualificationReason.SHORT_CIRCUIT.toString();
                    if (stub.url != null) {
                        ret.documents.add(stub);
                    }
                }
            }
-           Set<String> commonSiteWords = new HashSet<>(10);
-           commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
-           commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
-           if (!commonSiteWords.isEmpty()) {
-               for (var doc : ret.documents) {
-                   if (doc.words != null) {
-                       doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
-                   }
-               }
-           }
+           flagCommonSiteWords(ret);
+           flagAdjacentSiteWords(internalLinkGraph, ret);
        }
        else {
            ret.documents = Collections.emptyList();
@@ -90,6 +92,70 @@ public class DomainProcessor {
        return ret;
    }
+   private void flagCommonSiteWords(ProcessedDomain processedDomain) {
+       Set<String> commonSiteWords = new HashSet<>(10);
+       commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Tfidf_High, IndexBlock.Subjects));
+       commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Title));
+       if (commonSiteWords.isEmpty()) {
+           return;
+       }
+       for (var doc : processedDomain.documents) {
+           if (doc.words != null) {
+               for (var block : IndexBlock.values()) {
+                   if (block.type == IndexBlockType.PAGE_DATA) {
+                       doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.Site, commonSiteWords);
+                   }
+               }
+           }
+       }
+   }
+   private void flagAdjacentSiteWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) {
+       var invertedGraph = internalLinkGraph.trimAndInvert();
+       Map<EdgeUrl, Set<String>> linkedKeywords = new HashMap<>(100);
+       invertedGraph.forEach((url, linkingUrls) -> {
+           Map<String, Integer> keywords = new HashMap<>(100);
+           for (var linkingUrl : linkingUrls) {
+               for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) {
+                   keywords.merge(keyword, 1, Integer::sum);
+               }
+           }
+           var words = keywords.entrySet().stream()
+                   .filter(e -> e.getValue() > 3)
+                   .map(Map.Entry::getKey)
+                   .filter(internalLinkGraph.getCandidateKeywords(url)::contains)
+                   .collect(Collectors.toSet());
+           if (!words.isEmpty()) {
+               linkedKeywords.put(url, words);
+           }
+       });
+       for (var doc : processedDomain.documents) {
+           if (doc.words == null)
+               continue;
+           final Set<String> keywords = linkedKeywords.get(doc.url);
+           if (keywords == null)
+               continue;
+           for (var block : IndexBlock.values()) {
+               if (block.type == IndexBlockType.PAGE_DATA) {
+                   doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.SiteAdjacent, keywords);
+               }
+           }
+       }
+   }
    private void fixBadCanonicalTags(List<CrawledDocument> docs) {
        Map<String, Set<String>> seenCanonicals = new HashMap<>();
        Set<String> seenUrls = new HashSet<>();
@@ -162,7 +228,8 @@ public class DomainProcessor {
        }
        boolean isQualified() {
-           return count < 25 || goodCount*10 >= count;
+           return true;
+           // return count < 25 || goodCount*10 >= count;
        }
    }
}
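flagAdjacentSiteWords thus promotes a keyword on page U only when more than three pages linking to U carry it in their top blocks, and U itself already lists it as a candidate. The counting core, isolated with plain strings and illustrative data:

Map<String, Integer> counts = new HashMap<>();
List<Set<String>> linkingPagesTopWords = List.of(
        Set.of("sailing", "knots"), Set.of("sailing"),
        Set.of("sailing", "rigging"), Set.of("sailing"));
for (Set<String> words : linkingPagesTopWords) {
    for (String w : words) {
        counts.merge(w, 1, Integer::sum);
    }
}
// counts = {sailing=4, knots=1, rigging=1}; only "sailing" passes the > 3 cut.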

View File

@@ -1,116 +0,0 @@
package nu.marginalia.wmsa.edge.converting.processor;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.*;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.*;
public class InstructionsCompiler {
public List<Instruction> compile(ProcessedDomain domain) {
List<Instruction> ret = new ArrayList<>(domain.size()*4);
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
if (domain.documents != null) {
compileUrls(ret, domain.documents);
compileDocuments(ret, domain.documents);
compileFeeds(ret, domain.documents);
compileLinks(ret, domain.domain, domain.documents);
}
if (domain.redirect != null) {
compileRedirect(ret, domain.domain, domain.redirect);
}
return ret;
}
private void compileRedirect(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
ret.add(new LoadDomain(to));
ret.add(new LoadDomainLink(new DomainLink(from, to)));
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
}
private void compileUrls(List<Instruction> ret, List<ProcessedDocument> documents) {
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
for (var doc : documents) {
seenUrls.add(doc.url);
if (doc.details != null) {
for (var url : doc.details.linksExternal) {
seenDomains.add(url.domain);
}
seenUrls.addAll(doc.details.linksExternal);
seenUrls.addAll(doc.details.linksInternal);
}
}
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
}
private void compileLinks(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
DomainLink[] links = documents.stream().map(doc -> doc.details)
.filter(Objects::nonNull)
.flatMap(dets -> dets.linksExternal.stream())
.map(link -> link.domain)
.distinct()
.map(domain -> new DomainLink(from, domain))
.toArray(DomainLink[]::new);
ret.add(new LoadDomainLink(links));
}
private void compileFeeds(List<Instruction> ret, List<ProcessedDocument> documents) {
EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
.filter(Objects::nonNull)
.flatMap(dets -> dets.feedLinks.stream())
.distinct()
.toArray(EdgeUrl[]::new);
ret.add(new LoadRssFeed(feeds));
}
private void compileDocuments(List<Instruction> ret, List<ProcessedDocument> documents) {
for (var doc : documents) {
compileDocumentDetails(ret, doc);
}
for (var doc : documents) {
compileWords(ret, doc);
}
}
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
var details = doc.details;
if (details != null) {
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
}
else {
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state));
}
}
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
var words = doc.words;
if (words != null) {
var wordsArray = words.values().stream()
.map(DocumentKeywords::new)
.toArray(DocumentKeywords[]::new);
ret.add(new LoadKeywords(doc.url, wordsArray));
}
}
}

View File

@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import crawlercommons.utils.Strings;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
import org.jsoup.nodes.Document;
@@ -23,13 +24,12 @@ public class DocumentValuator {
    );
-   public double getQuality(EdgeHtmlStandard htmlStandard, Document doc, DocumentLanguageData dld) throws DisqualifiedException {
+   public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
        double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
-       double scriptPenalty = getScriptPenalty(doc);
-       int textBodyLength = doc.text().length();
-       int rawLength = doc.html().length();
+       double scriptPenalty = getScriptPenalty(parsedDocument);
+       int textBodyLength = parsedDocument.text().length();
+       int rawLength = crawledDocument.documentBody.length();
        if (textBodyLength == 0) {
            throw new DisqualifiedException(LENGTH);

View File

@@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -43,13 +40,15 @@ public class FeatureExtractor {
    private final RecipeDetector recipeDetector;
    private final TextileCraftDetector textileCraftDetector;
    private final WoodworkingDetector woodworkingDetector;
+   private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
    @Inject
-   public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
+   public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
        this.adblockSimulator = adblockSimulator;
        this.recipeDetector = recipeDetector;
        this.textileCraftDetector = textileCraftDetector;
        this.woodworkingDetector = woodworkingDetector;
+       this.googleAnwersSpamDetector = googleAnwersSpamDetector;
    }
    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
@@ -57,6 +56,10 @@ public class FeatureExtractor {
        final Elements scriptTags = doc.getElementsByTag("script");
+       if (googleAnwersSpamDetector.testP(doc) > 0.5) {
+           features.add(HtmlFeature.GA_SPAM);
+       }
        for (var scriptTag : scriptTags) {
            if (isJavascriptTag(scriptTag)) {
                features.add(HtmlFeature.JS);

View File

@@ -7,14 +7,14 @@ public enum HtmlFeature {
    JS("special:scripts"),
    AFFILIATE_LINK( "special:affiliate"),
    TRACKING("special:tracking"),
    COOKIES("special:cookies"),
    CATEGORY_FOOD("category:food"),
    ADVERTISEMENT("special:ads"),
    CATEGORY_CRAFTS("category:crafts"),
+   GA_SPAM("special:gaspam"),
    UNKNOWN("special:uncategorized")
    ;

View File

@@ -0,0 +1,54 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.util.*;
public class InternalLinkGraph {
private final Map<EdgeUrl, Set<EdgeUrl>> internalLinkGraph = new HashMap<>(1000);
private final Set<EdgeUrl> goodUrls = new HashSet<>(1000);
private final Map<EdgeUrl, Set<String>> topKeywordsByUrl = new HashMap<>(1000);
private final Map<EdgeUrl, Set<String>> candidateKeywordsByUrl = new HashMap<>(1000);
public void accept(ProcessedDocument doc) {
if (doc.details == null || doc.details.linksInternal == null)
return;
goodUrls.add(doc.url);
internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal));
Set<String> topKeywords = new HashSet<>(doc.words.get(IndexBlock.Tfidf_High).words);
topKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
topKeywordsByUrl.put(doc.url, topKeywords);
Set<String> candidateKeywords = new HashSet<>(topKeywords);
candidateKeywords.addAll(doc.words.get(IndexBlock.Tfidf_High).words);
candidateKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
candidateKeywordsByUrl.put(doc.url, candidateKeywords);
}
public Map<EdgeUrl, Set<EdgeUrl>> trimAndInvert() {
internalLinkGraph.values().forEach(dest -> dest.retainAll(goodUrls));
Map<EdgeUrl, Set<EdgeUrl>> inverted = new HashMap<>(goodUrls.size());
internalLinkGraph.forEach((source, dests) -> {
dests.forEach(dest -> inverted.computeIfAbsent(dest,
d->new HashSet<>(25))
.add(source));
});
internalLinkGraph.clear();
return inverted;
}
public Set<String> getKeywords(EdgeUrl url) {
return topKeywordsByUrl.getOrDefault(url, Collections.emptySet());
}
public Set<String> getCandidateKeywords(EdgeUrl url) {
return candidateKeywordsByUrl.getOrDefault(url, Collections.emptySet());
}
}
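trimAndInvert turns the forward graph (page to the pages it links to) into an inbound graph (page to the pages linking to it), after dropping links to URLs that never produced a good document, and clears the forward map to free memory. The inversion in miniature, with strings standing in for EdgeUrl:

Map<String, Set<String>> forward = Map.of(
        "A", Set.of("B", "C"),
        "B", Set.of("C"));
Map<String, Set<String>> inbound = new HashMap<>();
forward.forEach((src, dests) ->
        dests.forEach(dst ->
                inbound.computeIfAbsent(dst, d -> new HashSet<>()).add(src)));
// inbound = {B=[A], C=[A, B]}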

View File

@@ -5,7 +5,6 @@ import com.google.common.base.Strings;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jetbrains.annotations.Contract;
-import org.jetbrains.annotations.Nullable;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.slf4j.Logger;
@@ -202,7 +201,6 @@ public class LinkParser {
        return binarySuffixList.stream().anyMatch(str::endsWith);
    }
-   @Nullable
    public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
        var baseTags = parsed.getElementsByTag("base");

View File

@@ -1,9 +1,13 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
+import org.apache.commons.lang3.StringUtils;
import javax.annotation.Nullable;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.StringJoiner;
import java.util.regex.Pattern;
-import java.util.stream.Collectors;
public class QueryParams {
@@ -15,10 +19,28 @@ public class QueryParams {
        return null;
    }
-   var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
-           .filter(param -> QueryParams.isPermittedParam(path, param))
-           .sorted()
-           .collect(Collectors.joining("&"));
+   String ret;
+   if (queryParams.indexOf('&') >= 0) {
+       List<String> parts = new ArrayList<>();
+       for (var part : StringUtils.split(queryParams, '&')) {
+           if (QueryParams.isPermittedParam(path, part)) {
+               parts.add(part);
+           }
+       }
+       if (parts.size() > 1) {
+           parts.sort(Comparator.naturalOrder());
+       }
+       StringJoiner retJoiner = new StringJoiner("&");
+       parts.forEach(retJoiner::add);
+       ret = retJoiner.toString();
+   }
+   else if (isPermittedParam(path, queryParams)) {
+       ret = queryParams;
+   }
+   else {
+       return null;
+   }
    if (ret.isBlank())
        return null;
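The rewrite keeps the old semantics (drop disallowed parameters, sort the survivors, rejoin with &) while avoiding the stream and regex-split machinery, and it short-circuits the common single-parameter case entirely. Expected behavior, assuming isPermittedParam rejects a parameter like sessionid:

// input "b=2&a=1"        -> "a=1&b=2"   (split, filter, sort, rejoin)
// input "page=3"         -> "page=3"    (no '&': no split, no sort, no join)
// input "sessionid=xyz"  -> null        (assuming sessionid is not permitted)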

View File

@@ -0,0 +1,36 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import org.jsoup.nodes.Document;
import java.util.List;
public class GoogleAnwersSpamDetector {
private final List<String> prefixes = List.of("What", "Why", "How", "When", "Is");
public double testP(Document doc) {
if (trialTag(doc, "h1")) return 1;
if (trialTag(doc, "h2")) return 1;
if (trialTag(doc, "h3")) return 1;
return 0;
}
private boolean trialTag(Document doc, String tagName) {
int positive = 0;
int total = 0;
for (var elem : doc.getElementsByTag(tagName)) {
String text = elem.text();
for (var prefix : prefixes) {
if (text.startsWith(prefix)) {
positive++;
break;
}
}
total ++;
}
return positive > 4 && positive / (double) total > 0.5;
}
}
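A usage sketch with jsoup, the HTML parser the converter already uses; the heading texts are made up for illustration:

Document doc = Jsoup.parse("""
        <h2>What Is Foo?</h2><h2>Why Bar?</h2><h2>How To Baz</h2>
        <h2>When To Qux?</h2><h2>Is Quux Worth It?</h2>""");
double p = new GoogleAnwersSpamDetector().testP(doc);
// p == 1.0: all five h2 headings start with a question prefix, so the h2
// trial passes both gates (positive > 4 and ratio > 0.5).
// Note the && short-circuits when positive <= 4, so a page with no
// headings never reaches the division.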

View File

@@ -29,7 +29,7 @@ public class CrawlJobExtractorMain {
            """
            SELECT ID
            FROM EC_DOMAIN
-           WHERE URL_PART=?
+           WHERE DOMAIN_NAME=?
            """;
    private static final String domainsSql =

View File

@@ -11,6 +11,17 @@ import java.util.regex.Pattern;
public class UrlBlocklist {
    private final List<Predicate<String>> patterns = new ArrayList<>();
+   private record UrlPatternContains(String contains, Pattern pattern) implements Predicate<String> {
+       public boolean test(String s) {
+           return s.contains(contains) && pattern.matcher(s).find();
+       }
+   }
+   private record UrlPatternMinLength(int minLength, Pattern pattern) implements Predicate<String> {
+       public boolean test(String s) {
+           return s.length() >= minLength && pattern.matcher(s).find();
+       }
+   }
    // domains that have a lot of links but we know we don't want to crawl
    private final Set<String> badDomains = Set.of("t.co", "facebook.com",
            "instagram.com", "youtube.com",
@@ -18,18 +29,24 @@ public class UrlBlocklist {
    public UrlBlocklist() {
        // Don't deep-crawl git repos
-       patterns.add(Pattern.compile("\\.git/.+").asPredicate());
-       patterns.add(Pattern.compile("wp-content/upload").asPredicate());
+       patterns.add(s -> s.contains(".git/"));
+       patterns.add(s -> s.contains("wp-content/upload"));
+       patterns.add(s -> s.contains("-download-free"));
        // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
-       patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
+       patterns.add(new UrlPatternMinLength(48, Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)")));
        // link farms &c
-       patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
-       patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
-       patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
-       patterns.add(Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$").asPredicate());
-       patterns.add(Pattern.compile(".*-download-free$").asPredicate());
+       patterns.add(new UrlPatternContains("/download", Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$")));
+       patterns.add(new UrlPatternContains("/permalink", Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$")));
+       patterns.add(new UrlPatternContains("/webrx", Pattern.compile("webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+       patterns.add(new UrlPatternContains("/lib", Pattern.compile("lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+       patterns.add(new UrlPatternContains("/pdf", Pattern.compile("pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+       patterns.add(new UrlPatternContains("/book", Pattern.compile("book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+       patterns.add(new UrlPatternContains("/720p", Pattern.compile("720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+       patterns.add(new UrlPatternContains("/node", Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$")));
    }
    public boolean isUrlBlocked(EdgeUrl url) {
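Each expensive regex is now guarded by a cheap prescreen: a substring check (UrlPatternContains) or a length floor (UrlPatternMinLength), so most URLs never reach the regex engine. For instance the git-hash rule, written as if inside the class (the records are private); the example URLs are illustrative:

Predicate<String> gitHash = new UrlPatternMinLength(48,
        Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)"));
gitHash.test("https://example.com/x");
        // false: 21 chars < 48, the regex is never run
gitHash.test("https://example.com/objects/0123456789abcdef0123456789abcdef/");
        // true: length gate passes, and the 32-hex segment matches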

View File

@@ -31,6 +31,8 @@ public class CrawlerRetreiver {
    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500);
    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);
+   private static final int MAX_ERRORS = 10;
    private final LinkedList<EdgeUrl> queue = new LinkedList<>();
    private final HttpFetcher fetcher;
@@ -50,6 +52,8 @@ public class CrawlerRetreiver {
    private static final IpBlockList ipBlocklist;
    private static final UrlBlocklist urlBlocklist = new UrlBlocklist();
+   int errorCount = 0;
    static {
        try {
            ipBlocklist = new IpBlockList(new GeoIpBlocklist());
@@ -137,7 +141,7 @@ public class CrawlerRetreiver {
        int fetchedCount = 0;
-       while (!queue.isEmpty() && visited.size() < depth) {
+       while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) {
            var top = queue.removeFirst();
            if (!robotsRules.isAllowed(top.toString())) {
@@ -179,6 +183,10 @@ public class CrawlerRetreiver {
            EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
        }
+       if ("ERROR".equals(d.crawlerStatus)) {
+           errorCount++;
+       }
    }
    long crawledTime = System.currentTimeMillis() - startTime;
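The new fields add a per-domain circuit breaker: once ten fetches come back with crawlerStatus "ERROR", the loop condition fails and the rest of the queue is abandoned. The control flow, reduced to a model (fetchNext stands in for the fetch block, which is not shown in full here):

// Model of the abort logic, with MAX_ERRORS = 10 as above:
while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS) {
    var doc = fetchNext();                  // stand-in for the real fetch path
    if ("ERROR".equals(doc.crawlerStatus)) {
        errorCount++;                       // the tenth error ends the crawl
    }
}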

View File

@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import java.util.List;
+import java.util.Optional;
@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
@@ -23,7 +24,7 @@ public interface EdgeDataStoreDao {
    List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);
-   EdgeDomain getDomain(EdgeId<EdgeDomain> id);
+   Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id);
}

View File

@@ -93,7 +93,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                WORDS_TOTAL, FORMAT, FEATURES,
                IP, DOMAIN_STATE,
                DATA_HASH
-               FROM EC_URL_VIEW WHERE ID IN
+               FROM EC_URL_VIEW
+               WHERE TITLE IS NOT NULL
+               AND ID IN
                """ + idString)) {
            stmt.setFetchSize(ids.size());
@@ -113,7 +115,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                    EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
                    Integer.MAX_VALUE, // rankingId
                    Double.MAX_VALUE, // termScore
-                   1 // resultsFromSameDomain
+                   1, // resultsFromSameDomain
+                   "", // positions
+                   null // result item
            );
            if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                    && Strings.isNullOrEmpty(val.description)
@@ -309,18 +313,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
    @Override
    @SneakyThrows
-   public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
+   public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
        try (var connection = dataSource.getConnection()) {
            try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                stmt.setInt(1, id.id());
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
-                   return new EdgeDomain(rsp.getString(1));
+                   return Optional.of(new EdgeDomain(rsp.getString(1)));
                }
-               throw new NoSuchElementException();
+               return Optional.empty();
            }
        }
    }
}
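getDomain no longer throws NoSuchElementException for unknown ids; absence is now part of the signature. A hypothetical call site after the change:

String name = dao.getDomain(id)
        .map(EdgeDomain::toString)
        .orElse("[unknown domain]"); // previously: try/catch NoSuchElementException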

View File

@@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.set.hash.TIntHashSet;
-import io.prometheus.client.Counter;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
import org.slf4j.Logger;
@@ -18,8 +17,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(getClass());
-   private static final Counter wmsa_blacklist_intercept = Counter.build("wmsa_blacklist_intercept",
-           "wmsa_blacklist_intercept").register();
    @Inject
    public EdgeDomainBlacklistImpl(HikariDataSource dataSource) {
        this.dataSource = dataSource;
@@ -65,7 +62,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
    @Override
    public boolean isBlacklisted(int domainId) {
        if (spamDomainSet.contains(domainId)) {
-           wmsa_blacklist_intercept.inc();
            return true;
        }

View File

@@ -0,0 +1,34 @@
package nu.marginalia.wmsa.edge.explorer;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;
import spark.Spark;
public class ExplorerMain extends MainClass {
final ExplorerService service;
@Inject
public ExplorerMain(ExplorerService service) {
this.service = service;
}
public static void main(String... args) {
init(ServiceDescriptor.EXPLORER, args);
Spark.staticFileLocation("/static/explore/");
Injector injector = Guice.createInjector(
new ConfigurationModule(),
new DatabaseModule()
);
injector.getInstance(ExplorerMain.class);
injector.getInstance(Initialization.class).setReady();
}
}

View File

@@ -0,0 +1,253 @@
package nu.marginalia.wmsa.edge.explorer;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
import nu.marginalia.wmsa.resource_store.StaticResources;
import org.jetbrains.annotations.NotNull;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.sql.SQLException;
import java.util.*;
public class ExplorerService extends Service {
private final MustacheRenderer<Object> renderer;
private final HikariDataSource dataSource;
private final StaticResources staticResources;
record SearchResult(
String domain,
String url,
double relatedness,
boolean hasMore,
boolean active,
boolean indexed) implements Comparable<SearchResult> {
@Override
public int compareTo(@NotNull SearchResult o) {
return (int)(o.relatedness - relatedness);
}
}
record SearchResults(String query, String message, String aliasDomain, List<SearchResult> resultList) { }
@SneakyThrows
@Inject
public ExplorerService(@Named("service-host") String ip,
@Named("service-port") Integer port,
Initialization initialization,
MetricsServer metricsServer,
RendererFactory rendererFactory,
HikariDataSource dataSource,
StaticResources staticResources
) {
super(ip, port, initialization, metricsServer);
renderer = rendererFactory.renderer("explorer/explorer");
this.dataSource = dataSource;
this.staticResources = staticResources;
Spark.get("/public/", this::serveIndex, this::render);
Spark.get("/public/search", this::search, this::render);
Spark.get("/public/:resource", this::serveStatic);
}
private Object serveStatic(Request request, Response response) {
String resource = request.params("resource");
staticResources.serveStatic("explore", resource, request, response);
return "";
}
public String render(Object results) {
return renderer.render(results);
}
private SearchResults serveIndex(Request request, Response response) {
return new SearchResults("", "", null, Collections.emptyList());
}
private SearchResults search(Request request, Response response) throws SQLException {
String query = request.queryParams("domain");
query = trimUrlJunk(query);
DomainIdInformation domainId = getDomainId(query);
if (!domainId.isPresent()) {
return new SearchResults(query,
"Could not find such a domain (maybe try adding/removing www?)",
null, Collections.emptyList());
}
var relatedDomains = getRelatedDomains(domainId);
if (relatedDomains.isEmpty()) {
String message = """
I've got nothing. This may either be due to the website being far out in the periphery of Marginalia's
search engine index, or it may be due to the website being too big.
A few hundred of the biggest websites are excluded for performance reasons. They are usually
not very interesting to look at either as everyone links to them and there's no real pattern to discern.
""";
return new SearchResults(query, message, domainId.alias, relatedDomains);
}
return new SearchResults(query, "", domainId.alias, relatedDomains);
}
private List<SearchResult> getRelatedDomains(DomainIdInformation domainIdInformation) throws SQLException {
List<SearchResult> ret = new ArrayList<>();
Set<String> seen = new HashSet<>();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT
NV.NEIGHBOR_NAME,
NV.RELATEDNESS,
(LV.DOMAIN_ID IS NOT NULL),
(STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
INDEXED > 0
FROM EC_NEIGHBORS_VIEW NV
LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID)
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
WHERE NV.DOMAIN_ID=?
GROUP BY NV.NEIGHBOR_ID
ORDER BY NV.RELATEDNESS DESC
""");
var stmtRev = conn.prepareStatement("""
SELECT
NV.DOMAIN_NAME,
NV.RELATEDNESS,
(LV.NEIGHBOR_ID IS NOT NULL),
(STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
INDEXED > 0
FROM EC_NEIGHBORS_VIEW NV
LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID)
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID
WHERE NV.NEIGHBOR_ID=?
GROUP BY NV.DOMAIN_ID
ORDER BY NV.RELATEDNESS DESC
"""
);
) {
stmt.setInt(1, domainIdInformation.domainId);
var rsp = stmt.executeQuery();
while (rsp.next()) {
String domainName = rsp.getString(1);
double relatedness = rsp.getDouble(2);
boolean hasMore = rsp.getBoolean(3);
boolean active = rsp.getBoolean(4);
boolean indexed = rsp.getBoolean(5);
seen.add(domainName);
String url = "http://" + domainName + "/";
if (domainName.length() < 48 && domainName.contains(".")) {
ret.add(new SearchResult(
domainName,
url,
relatedness,
hasMore,
active,
indexed
));
}
}
stmtRev.setInt(1, domainIdInformation.domainId);
rsp = stmtRev.executeQuery();
while (rsp.next()) {
String domainName = rsp.getString(1);
double relatedness = rsp.getDouble(2);
boolean hasMore = rsp.getBoolean(3);
boolean active = rsp.getBoolean(4);
boolean indexed = rsp.getBoolean(5);
String url = "http://" + domainName + "/";
if (!seen.add(domainName))
continue;
if (domainName.length() < 48 && domainName.contains(".")) {
ret.add(new SearchResult(
domainName,
url,
relatedness,
hasMore,
active,
indexed
));
}
}
}
Comparator<SearchResult> comp = SearchResult::compareTo;
comp = comp.thenComparing(SearchResult::domain);
ret.sort(comp);
return ret;
}
private DomainIdInformation getDomainId(String query) throws SQLException {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME
FROM EC_DOMAIN DOMAIN
LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID
WHERE DOMAIN.DOMAIN_NAME=?
""")) {
stmt.setString(1, query);
var rsp = stmt.executeQuery();
if (rsp.next()) {
return new DomainIdInformation(
rsp.getInt(1),
rsp.getBoolean(2),
rsp.getString(3)
);
}
}
return new DomainIdInformation(-1, false, null);
}
private String trimUrlJunk(String query) {
if (query.startsWith("http://")) {
query = query.substring(7);
}
if (query.startsWith("https://")) {
query = query.substring(8);
}
int lastSlash = query.indexOf('/');
if (lastSlash > 0) {
query = query.substring(0, lastSlash);
}
return query;
}
record DomainIdInformation(int domainId, boolean indexed, String alias) {
boolean isPresent() {
return domainId >= 0;
}
}
}
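For illustration, a minimal smoke test of the routes registered above. This is a sketch, not part of the commit; the localhost:8080 base URL is an assumption about a locally running service.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
public class ExplorerSmokeTest {
public static void main(String[] args) throws Exception {
var client = HttpClient.newHttpClient();
// /public/search reads the "domain" query parameter, strips scheme
// and path via trimUrlJunk, and resolves the domain in EC_DOMAIN
var request = HttpRequest.newBuilder()
.uri(URI.create("http://localhost:8080/public/search?domain=marginalia.nu")) // assumed host/port
.GET()
.build();
var response = client.send(request, HttpResponse.BodyHandlers.ofString());
System.out.println(response.statusCode());
System.out.println(response.body());
}
}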

View File

@@ -1,20 +1,19 @@
package nu.marginalia.wmsa.edge.index;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
-import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryRankLimitingFilter;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
-import java.util.Comparator;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -103,54 +102,65 @@ public class EdgeIndexBucket {
return indexReader != null;
}
-public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
+public IndexQuery getQuery(LongPredicate filter, IndexQueryParams params) {
if (null == indexReader) {
-logger.warn("Index reader not neady {}", block);
+logger.warn("Index reader not ready {}", params.block());
return new IndexQuery(Collections.emptyList());
}
-final int[] orderedIncludes = searchTerms.includes
-.stream()
-.sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
-.distinct()
-.mapToInt(Integer::intValue)
-.toArray();
+final int[] orderedIncludes = params.searchTerms()
+.sortedDistinctIncludes((a, b) -> compareKeywords(params.block(), a, b));
-IndexQueryFactory.IndexQueryBuilder query;
-query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
+IndexQueryFactory.IndexQueryBuilder query = createQueryBuilder(orderedIncludes[0], params);
if (query == null) {
return new IndexQuery(Collections.emptyList());
}
-query.filter(filter);
+query.addInclusionFilter(new QueryFilterStepFromPredicate(filter));
+if (params.rankLimit() != null) {
+query.addInclusionFilter(new QueryRankLimitingFilter(params.rankLimit()));
+}
for (int i = 1; i < orderedIncludes.length; i++) {
query = query.also(orderedIncludes[i]);
}
-for (int term : searchTerms.excludes) {
+for (int term : params.searchTerms().excludes()) {
query = query.not(term);
}
return query.build();
}
+private IndexQueryFactory.IndexQueryBuilder createQueryBuilder(int firstKeyword, IndexQueryParams params) {
+if (params.targetDomains() != null && !params.targetDomains().isEmpty()) {
+return indexReader.findWordForDomainList(params.block(), params.targetDomains(), firstKeyword);
+}
+return indexReader.findWord(params.block(), params.qualityLimit(), firstKeyword);
+}
+private int compareKeywords(IndexBlock block, int a, int b) {
+return Long.compare(
+indexReader.numHits(block, a),
+indexReader.numHits(block, b)
+);
+}
-public IndexQuery getDomainQuery(IndexQueryCachePool pool, int wordId, ResultDomainDeduplicator localFilter) {
-var query = indexReader.findDomain(pool, wordId);
+public IndexQuery getDomainQuery(int wordId, ResultDomainDeduplicator localFilter) {
+var query = indexReader.findDomain(wordId);
query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue));
return query;
}
-public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
-return indexReader.getBlockForResult(cachePool, termId, urlId);
+/** Replaces the values of ids with their associated metadata, or 0L if absent */
+public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
+return indexReader.getMetadata(block, termId, ids);
}
-public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
-return indexReader.isTermInBucket(cachePool, block, termId, urlId);
-}
}

View File

@@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import java.io.IOException;
@@ -18,9 +18,6 @@ public class EdgeIndexControl {
}
public void regenerateIndex(int id) {
-System.runFinalization();
-System.gc();
for (IndexBlock block : IndexBlock.values()) {
try {
servicesFactory.convertIndex(id, block);

View File

@@ -9,6 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
@@ -39,7 +40,9 @@ public class EdgeIndexService extends Service {
EdgeIndexOpsService opsService,
EdgeIndexLexiconService lexiconService,
-EdgeIndexQueryService indexQueryService)
+EdgeIndexQueryService indexQueryService,
+EdgeIndexDomainQueryService domainQueryService
+)
{
super(ip, port, init, metricsServer);
@@ -51,7 +54,7 @@ public class EdgeIndexService extends Service {
Spark.post("/words/", lexiconService::putWords);
Spark.post("/search/", indexQueryService::search, gson::toJson);
-Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
+Spark.post("/search-domain/", domainQueryService::searchDomain, gson::toJson);
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);

View File

@@ -103,9 +103,9 @@ public class IndexServicesFactory {
public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
var converter = new SearchIndexConverter(block, id, tmpFileDir,
-preconverterOutputFile.get(id, block.ordinal()),
-indexWriteWordsFile.get(id, block.id),
-indexWriteUrlsFile.get(id, block.id),
+preconverterOutputFile.get(id, block),
+indexWriteWordsFile.get(id, block),
+indexWriteUrlsFile.get(id, block),
partitioner,
domainBlacklist
);
@@ -118,7 +118,7 @@ public class IndexServicesFactory {
for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
for (IndexBlock block : IndexBlock.values()) {
-shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block.ordinal()));
+shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block));
}
}
@@ -129,7 +129,7 @@ public class IndexServicesFactory {
);
}
-private File getPreconverterOutputFile(int index, int block) {
+private File getPreconverterOutputFile(int index, IndexBlock block) {
return preconverterOutputFile.get(index, block);
}
@@ -141,7 +141,7 @@ public class IndexServicesFactory {
indexMap.put(block, createSearchIndex(id, block));
}
catch (Exception ex) {
-logger.error("Could not create index {}-{}", id, block);
+logger.error("Could not create index {}-{} ({})", id, block, ex.getMessage());
}
}
return new SearchIndexReader(indexMap);
@@ -150,8 +150,8 @@ public class IndexServicesFactory {
private SearchIndex createSearchIndex(int bucketId, IndexBlock block) {
try {
return new SearchIndex("IndexReader"+bucketId+":"+ block.name(),
-indexReadUrlsFile.get(bucketId, block.id),
-indexReadWordsFile.get(bucketId, block.id));
+indexReadUrlsFile.get(bucketId, block),
+indexReadWordsFile.get(bucketId, block));
} catch (IOException e) {
throw new RuntimeException(e);
}
@@ -159,7 +159,8 @@ public class IndexServicesFactory {
public Callable<Boolean> switchFilesJob(int id) {
return () -> {
-for (int block = 0; block < IndexBlock.values().length; block++) {
+for (var block : IndexBlock.values()) {
if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
Files.move(
@@ -172,6 +173,7 @@ public class IndexServicesFactory {
StandardCopyOption.REPLACE_EXISTING);
}
}
+
return true;
};
}
@@ -205,8 +207,8 @@ class PartitionedDataFile {
this.pattern = pattern;
}
-public File get(int id) {
-Path partitionDir = partition.resolve(Integer.toString(id));
+public File get(Object id) {
+Path partitionDir = partition.resolve(id.toString());
if (!partitionDir.toFile().exists()) {
partitionDir.toFile().mkdir();
}
@@ -223,13 +225,13 @@ class DoublePartitionedDataFile {
this.pattern = pattern;
}
-public File get(int id, int id2) {
-Path partitionDir = partition.resolve(Integer.toString(id));
+public File get(Object id, Object id2) {
+Path partitionDir = partition.resolve(id.toString());
if (!partitionDir.toFile().exists()) {
partitionDir.toFile().mkdir();
}
-partitionDir = partitionDir.resolve(Integer.toString(id2));
+partitionDir = partitionDir.resolve(id2.toString());
if (!partitionDir.toFile().exists()) {
partitionDir.toFile().mkdir();
}

View File

@@ -47,6 +47,9 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
wordSetBuilder.setIndex(wordSet.block().ordinal());
wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
+for (var meta : wordSet.metadata()) {
+wordSetBuilder.addMeta(meta);
+}
keywordBuilder.addWordSet(wordSetBuilder.build());
var req = keywordBuilder.build();

View File

@@ -21,7 +21,6 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
-import java.util.List;
@Singleton
public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@@ -53,9 +52,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
return;
}
-for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {
-var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
+for (var chunk : ListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) {
+var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());
indexWriter.put(header, entry);
@@ -63,19 +62,22 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
}
-private long[] getOrInsertWordIds(List<String> words) {
-long[] ids = new long[words.size()];
-int putId = 0;
+private long[] getOrInsertWordIds(String[] words, long[] meta) {
+long[] ids = new long[words.length*2];
+int putIdx = 0;
-for (String word : words) {
+for (int i = 0; i < words.length; i++) {
+String word = words[i];
long id = lexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
-ids[putId++] = id;
+ids[putIdx++] = id;
+ids[putIdx++] = meta[i];
}
}
-if (putId != words.size()) {
-ids = Arrays.copyOf(ids, putId);
+if (putIdx != words.length*2) {
+ids = Arrays.copyOf(ids, putIdx);
}
return ids;
}

View File

@@ -20,12 +20,14 @@ import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
-import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
public class SearchIndexConverter {
-public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
+public static final int ENTRY_URL_OFFSET = 0;
+public static final int ENTRY_METADATA_OFFSET = 1;
+public static final int ENTRY_SIZE = 2;
+public static final BTreeContext urlsBTreeContext = new BTreeContext(5, ENTRY_SIZE, ~0, 8);
-private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
+private final long[] tmpWordsBuffer = SearchIndexJournalReader.createAdequateTempBuffer();
private final Path tmpFileDir;
@@ -72,7 +74,7 @@ public class SearchIndexConverter {
return;
}
-logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
+logger.info("Converting {} ({}) {} {}", block.ordinal(), block, inputFile, journalReader.fileHeader);
var lock = partitioner.getReadLock();
try {
@@ -80,10 +82,10 @@ public class SearchIndexConverter {
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
-logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
+logger.info("Creating word index table {} for block {}", outputFileWords, block.ordinal());
WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);
-logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
+logger.info("Creating word urls table {} for block {}", outputFileUrls, block.ordinal());
createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);
Files.delete(tmpUrlsFile);
@@ -111,10 +113,10 @@ public class SearchIndexConverter {
final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
-for (int i = 0; i < entryData.size(); i++) {
-int wordId = (int) entryData.get(i);
+for (var record : entryData) {
+int wordId = record.wordId();
if (wordId < 0 || wordId >= topWord) {
-logger.warn("Bad wordId {}", wordId);
+logger.warn("Bad word {}", record);
}
wordsTableWriter.acceptWord(wordId);
}
@@ -138,7 +140,7 @@ public class SearchIndexConverter {
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
-try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
+try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, ENTRY_SIZE * numberOfWordsTotal, 10_000_000)) {
int[] wordWriteOffset = new int[wordOffsetsTable.length()];
for (var entry : journalReader) {
@@ -146,21 +148,29 @@ public class SearchIndexConverter {
var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
-for (int i = 0; i < entryData.size(); i++) {
-int wordId = (int) entryData.get(i);
+for (var record : entryData) {
+int wordId = record.wordId();
+long metadata = record.metadata();
-if (wordId >= wordWriteOffset.length)
+if (wordId >= wordWriteOffset.length) {
+logger.warn("Overflowing wordId {}", wordId);
continue;
+}
if (wordId < 0) {
logger.warn("Negative wordId {}", wordId);
}
final long urlInternal = translateUrl(entry.docId());
-if (wordId > 0) {
-rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
-} else {
-rwf.put(wordWriteOffset[wordId]++, urlInternal);
-}
+long offset;
+if (wordId > 0) offset = wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId];
+else offset = wordWriteOffset[wordId];
+rwf.put(offset + ENTRY_URL_OFFSET, urlInternal);
+rwf.put(offset + ENTRY_METADATA_OFFSET, metadata);
+wordWriteOffset[wordId] += ENTRY_SIZE;
}
}
@@ -171,9 +181,9 @@ public class SearchIndexConverter {
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
if (wordOffsetsTable.length() > 0) {
-var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
-wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
+var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit, ENTRY_SIZE);
+wordOffsetsTable.forEachRange(urlTmpFileSorter::sortRange);
urlsTmpFileMap.force();
} else {
@@ -187,7 +197,7 @@ public class SearchIndexConverter {
wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
// Note: The return value is accumulated into accumulatorIdx!
-return writer.write(accumulatorIdx, length,
+return writer.write(accumulatorIdx, length/ENTRY_SIZE,
slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
});
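As a quick sanity check of the new posting layout, the offset arithmetic works out as below. This is a sketch with made-up values, following the ENTRY_* constants introduced above.

// each posting occupies ENTRY_SIZE = 2 consecutive longs: [ url | metadata ]
long wordBase = 0; // start of a word's posting range (made-up value)
int postingIdx = 3; // the fourth posting of that word
long urlSlot = wordBase + (long) postingIdx * ENTRY_SIZE + ENTRY_URL_OFFSET; // 6
long metadataSlot = wordBase + (long) postingIdx * ENTRY_SIZE + ENTRY_METADATA_OFFSET; // 7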

View File

@@ -9,7 +9,6 @@ import gnu.trove.set.hash.TIntHashSet;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank;
-import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import org.slf4j.Logger;
@@ -87,8 +86,25 @@ public class SearchIndexDao {
@SneakyThrows
public TIntList getStandardDomains() {
-var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
-return spr.pageRankWithPeripheralNodes(spr.size()/2);
+TIntArrayList results = new TIntArrayList();
+try (var connection = dataSource.getConnection();
+var stmt = connection.prepareStatement(
+"""
+SELECT ID FROM EC_DOMAIN
+WHERE INDEXED>0
+AND STATE='ACTIVE'
+AND DOMAIN_ALIAS IS NULL
+ORDER BY ID ASC
+""");
+) {
+var rs = stmt.executeQuery();
+while (rs.next()) {
+results.add(rs.getInt(1));
+}
+}
+return results;
}
@SneakyThrows

View File

@@ -110,11 +110,12 @@ public class SearchIndexPartitioner {
return true;
if (academiaRanking.hasBucket(bucketId, domainId))
return true;
-if (standardRanking.hasBucket(bucketId, domainId))
-return true;
if (specialDomainRanking.hasBucket(bucketId, domainId))
return true;
+if (standardRanking.hasBucket(bucketId, domainId))
+return true;
return DYNAMIC_BUCKET_LENGTH == bucketId;
}
@@ -148,15 +149,15 @@ public class SearchIndexPartitioner {
if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
return academiaRanking.translateId(id);
}
-if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
-return standardRanking.translateId(id);
-}
if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
return specialDomainRanking.translateId(id);
}
-if (retroRanking != null) {
-return retroRanking.translateId(id);
+// standard gets passed straight through
+if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
+return id;
}
return id;
}

View File

@@ -52,7 +52,7 @@ public class SearchIndexPreconverter {
var lock = partitioner.getReadLock();
try {
lock.lock();
-ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
+ByteBuffer buffer = ByteBuffer.allocateDirect(65536);
for (var entry : indexJournalReader) {
if (!partitioner.isGoodUrl(entry.urlId())
|| spamDomains.contains(entry.domainId())) {
@@ -93,7 +93,7 @@ public class SearchIndexPreconverter {
}
public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) {
-return shard.block == entry.header.block().id
+return shard.block == entry.header.block().ordinal()
&& partitioner.filterUnsafe(entry.domainId(), shard.bucket);
}

View File

@@ -23,10 +23,10 @@ public class WordIndexOffsetsTable {
for (int i = 1; i < table.length; i++) {
long start = table[i-1];
-int length = (int) (table[i] - start);
-if (length != 0) {
-o.accept(start, length);
+long end = table[i];
+if (start != end) {
+o.accept(start, end);
}
}
}
@@ -58,7 +58,7 @@ public class WordIndexOffsetsTable {
}
public interface OffsetTableEntryConsumer {
-void accept(long start, int length) throws IOException;
+void accept(long start, long end) throws IOException;
}
public interface OffsetTableEntryFoldConsumer {

View File

@@ -8,8 +8,10 @@ import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.*;
+import java.io.File;
+import java.io.IOException;
+import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE;
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
public class WordsTableWriter {
@@ -23,8 +25,10 @@ public class WordsTableWriter {
}
public void acceptWord(int wordId) {
+for (int i = 0; i < ENTRY_SIZE; i++) {
table.lengths().increment(wordId);
+}
}
public WordIndexOffsetsTable getTable() {
return table.offsets();
@@ -58,7 +62,7 @@ public class WordsTableWriter {
mapSlice.put(idx++, (long)length<<32);
mapSlice.put(idx++, 0);
-urlFileOffset += (urlsBTreeContext.calculateSize(length));
+urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
}
for (int i = 1; i < offsetTable.length; i++) {
@@ -68,7 +72,7 @@ public class WordsTableWriter {
mapSlice.put(idx++, (long)length << 32 | i);
mapSlice.put(idx++, urlFileOffset);
-urlFileOffset += (urlsBTreeContext.calculateSize(length));
+urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
}
}
}

View File

@@ -12,6 +12,8 @@ import org.jetbrains.annotations.NotNull;
import java.nio.ByteBuffer;
import java.util.Iterator;
+import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.ENTRY_SIZE;
+import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
@@ -23,6 +25,10 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
private final MultimapFileLongSlice map;
private final long committedSize;
+public static long[] createAdequateTempBuffer() {
+return new long[MAX_LENGTH*ENTRY_SIZE];
+}
public SearchIndexJournalReader(MultimapFileLong map) {
fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
@@ -92,7 +98,7 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
public IndexBlock block() {
return header.block();
}
-public int wordCount() { return header.entrySize(); }
+public int wordCount() { return header.entrySize() / ENTRY_SIZE; }
public SearchIndexJournalEntry readEntry() {
long[] dest = new long[header.entrySize()];

View File

@@ -26,7 +26,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
private RandomAccessFile raf;
private FileChannel channel;
-public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
+public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4;
private final ByteBuffer byteBuffer;
private long pos;
@@ -83,7 +83,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
byteBuffer.clear();
byteBuffer.putInt(entryData.size());
-byteBuffer.putInt(header.block().id);
+byteBuffer.putInt(header.block().ordinal());
byteBuffer.putLong(header.documentId());
entryData.write(byteBuffer);

View File

@@ -2,12 +2,14 @@ package nu.marginalia.wmsa.edge.index.journal.model;
import java.nio.ByteBuffer;
import java.util.Arrays;
+import java.util.Iterator;
-public class SearchIndexJournalEntry {
+public class SearchIndexJournalEntry implements Iterable<SearchIndexJournalEntry.Record> {
private final int size;
private final long[] underlyingArray;
public static final int MAX_LENGTH = 1000;
+public static final int ENTRY_SIZE = 2;
public SearchIndexJournalEntry(long[] underlyingArray) {
this.size = underlyingArray.length;
@@ -46,4 +48,24 @@ public class SearchIndexJournalEntry {
return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
}
+public Iterator<Record> iterator() {
+return new EntryIterator();
+}
+private class EntryIterator implements Iterator<Record> {
+int pos = -ENTRY_SIZE;
+public boolean hasNext() {
+return pos + ENTRY_SIZE < size;
+}
+@Override
+public Record next() {
+pos+=ENTRY_SIZE;
+return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
+}
+}
+public record Record(int wordId, long metadata) {}
}
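To make the new layout concrete, here is a small sketch (sample values only) of how an entry's backing array interleaves (wordId, metadata) pairs and how the new Record iterator walks them:

// sample data: two (wordId, metadata) pairs, ENTRY_SIZE longs per record
long[] data = {
101L, 0x0000_0001L, // wordId 101, its 64-bit metadata word
102L, 0x0000_0003L, // wordId 102, its 64-bit metadata word
};
var entry = new SearchIndexJournalEntry(data);
for (var record : entry) {
System.out.println(record.wordId() + " -> " + Long.toHexString(record.metadata()));
}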

View File

@@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
import io.prometheus.client.Gauge;
import lombok.SneakyThrows;
import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -16,7 +17,7 @@ import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
public class KeywordLexicon implements AutoCloseable {
-private final DictionaryHashMap reverseIndex;
+private final DictionaryMap reverseIndex;
private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -30,7 +31,7 @@ public class KeywordLexicon implements AutoCloseable {
private final KeywordLexiconJournal journal;
@SneakyThrows
-public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) {
+public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryMap reverseIndexHashMap) {
journal = keywordLexiconJournal;
reverseIndex = reverseIndexHashMap;

View File

@ -1,16 +0,0 @@
package nu.marginalia.wmsa.edge.index.model;
import lombok.AllArgsConstructor;
import java.util.ArrayList;
import java.util.List;
@AllArgsConstructor
public class EdgeIndexSearchTerms {
public List<Integer> includes = new ArrayList<>();
public List<Integer> excludes = new ArrayList<>();
public boolean isEmpty() {
return includes.isEmpty();
}
}

View File

@ -0,0 +1,32 @@
package nu.marginalia.wmsa.edge.index.model;
import java.util.EnumSet;
public enum EdgePageWordFlags {
Title,
Subjects,
NamesWords,
Site,
SiteAdjacent,
Simple;
public int asBit() {
return 1 << ordinal();
}
public boolean isPresent(long value) {
return (asBit() & value) > 0;
}
public static EnumSet<EdgePageWordFlags> decode(long encodedValue) {
EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class);
for (EdgePageWordFlags f : values()) {
if ((encodedValue & f.asBit()) > 0) {
ret.add(f);
}
}
return ret;
}
}
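A brief sketch of how the flag bits compose and decompose, using the enum as declared above (sample values; assumes java.util.EnumSet is imported as in the class):

// Title is bit 0, Subjects bit 1, ..., Simple bit 5
long encoded = EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Site.asBit();
assert EdgePageWordFlags.Title.isPresent(encoded);
assert !EdgePageWordFlags.Subjects.isPresent(encoded);
// decode() recovers the full set from the low bits of a metadata word
EnumSet<EdgePageWordFlags> decoded = EdgePageWordFlags.decode(encoded);
// decoded == EnumSet.of(EdgePageWordFlags.Title, EdgePageWordFlags.Site)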

View File

@ -0,0 +1,90 @@
package nu.marginalia.wmsa.edge.index.model;
import nu.marginalia.util.BrailleBlockPunchCards;
import java.util.EnumSet;
import static java.lang.Math.max;
import static java.lang.Math.min;
public record EdgePageWordMetadata(int tfIdf,
int positions,
int quality,
int count,
EnumSet<EdgePageWordFlags> flags) {
// If flags are moved from the least significant end of
// this struct, then EntrySourceFromBTree will break.
public static final long COUNT_MASK = 0xFL;
public static final int COUNT_SHIFT = 8;
public static final long QUALITY_MASK = 0xFL;
public static final int QUALITY_SHIFT = 12;
public static final long TF_IDF_MASK = 0xFFFFL;
public static final int TF_IDF_SHIFT = 16;
public static final int POSITIONS_SHIFT = 32;
public EdgePageWordMetadata(long value) {
this(
(int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
(int)(value >>> POSITIONS_SHIFT),
(int)((value >>> QUALITY_SHIFT) & QUALITY_MASK),
(int)((value >>> COUNT_SHIFT) & COUNT_MASK),
EdgePageWordFlags.decode(value)
);
}
public static int decodeQuality(long encoded) {
return (int)((encoded >>> QUALITY_SHIFT) & QUALITY_MASK);
}
public static boolean hasFlags(long encoded, long metadataBitMask) {
return (encoded & metadataBitMask) == metadataBitMask; // all bits of the mask must be set in encoded
}
public String toString() {
StringBuilder sb = new StringBuilder(getClass().getSimpleName());
sb.append('[')
.append("tfidf=").append(tfIdf).append(", ")
.append("quality=").append(quality).append(", ")
.append("count=").append(count).append(", ")
.append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
sb.append(", flags=").append(flags).append(']');
return sb.toString();
}
/* Encoded in a 64 bit long as
0-8 flags
8-12 count,
12-16 quality,
16-32 tf-idf [0, 65536]
32-64 position mask
*/
public long encode() {
long ret = 0;
for (var flag : flags) {
ret |= flag.asBit();
}
ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT;
ret |= ((long)(positions)) << POSITIONS_SHIFT;
return ret;
}
public boolean isEmpty() {
return count == 0 && positions == 0 && flags.isEmpty() && tfIdf == 0;
}
public static long emptyValue() {
return 0L;
}
}
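A worked round trip through the 64-bit packing documented above, with sample values that all fit their masks:

// flags in bits 0-7, count in 8-11, quality in 12-15, tf-idf in 16-31,
// a 32-bit position mask in 32-63
var meta = new EdgePageWordMetadata(
120, // tfIdf, bits 16-31
0b1011, // positions mask, bits 32-63
3, // quality, bits 12-15
2, // count, bits 8-11
java.util.EnumSet.of(EdgePageWordFlags.Title)); // flags, bits 0-7
long packed = meta.encode();
var unpacked = new EdgePageWordMetadata(packed);
// unpacked equals meta field for field, e.g. unpacked.tfIdf() == 120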

View File

@ -1,20 +0,0 @@
package nu.marginalia.wmsa.edge.index.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
@AllArgsConstructor @Getter
@ToString
public class EdgePutWordsRequest {
public EdgeId<EdgeDomain> domainId;
public EdgeId<EdgeUrl> urlId;
public double quality;
public EdgePageWordSet wordSet;
private int index = 0;
}

View File

@@ -1,47 +1,35 @@
package nu.marginalia.wmsa.edge.index.model;
public enum IndexBlock {
-TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
-Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),
-Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),
-Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
-NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),
-Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
-Meta(IndexBlockType.PAGE_DATA, 6, 7),
-Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
-Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
-Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),
-Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
-Words_2(IndexBlockType.PAGE_DATA,11, 3.5),
-Words_4(IndexBlockType.PAGE_DATA,12, 4.0),
-Words_8(IndexBlockType.PAGE_DATA,13, 4.5),
-Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0),
-Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
+Title(IndexBlockType.PAGE_DATA),
+Meta(IndexBlockType.PAGE_DATA),
+Words_1(IndexBlockType.PAGE_DATA),
+Words_2(IndexBlockType.PAGE_DATA),
+Words_4(IndexBlockType.PAGE_DATA),
+Words_8(IndexBlockType.PAGE_DATA),
+Words_16Plus(IndexBlockType.PAGE_DATA),
+Link(IndexBlockType.QUALITY_SIGNAL),
+Site(IndexBlockType.QUALITY_SIGNAL),
+Artifacts(IndexBlockType.PAGE_DATA),
+Tfidf_High(IndexBlockType.TRANSIENT),
+Subjects(IndexBlockType.TRANSIENT)
;
public final IndexBlockType type;
-public final int id;
-public final double sortOrder;
-IndexBlock(IndexBlockType type, int id, double sortOrder) {
+IndexBlock(IndexBlockType type) {
this.type = type;
-this.sortOrder = sortOrder;
-this.id = id;
}
+// This is kind of a hot method, and Enum.values() allocates a new
+// array each call.
+private static final IndexBlock[] values = IndexBlock.values();
public static IndexBlock byId(int id) {
-for (IndexBlock block : values()) {
-if (id == block.id) {
-return block;
-}
-}
-throw new IllegalArgumentException("Bad block id");
+return values[id];
}
}
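Since the ordinal now doubles as the block id, lookups reduce to array indexing against the cached values array (illustrative usage):

IndexBlock first = IndexBlock.byId(0); // == IndexBlock.Title, the first constant
IndexBlock meta = IndexBlock.byId(IndexBlock.Meta.ordinal()); // round-trips by construction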

View File

@@ -1,7 +1,10 @@
package nu.marginalia.wmsa.edge.index.model;
public enum IndexBlockType {
+/** This block is only used for joins */
QUALITY_SIGNAL,
-TF_IDF,
-PAGE_DATA
+/** This block contains page keywords */
+PAGE_DATA,
+/** This block is only used for generation */
+TRANSIENT
}

View File

@@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.index.reader;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.btree.BTreeReader;
-import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -17,7 +16,6 @@ import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext;
public class IndexWordsTable implements AutoCloseable {
protected final MultimapFileLong words;
protected final BTreeReader reader;
-protected final BTreeHeader header;
protected final int HEADER_OFFSET = 1;
final Logger logger = LoggerFactory.getLogger(getClass());
@@ -26,8 +24,7 @@ public class IndexWordsTable implements AutoCloseable {
public IndexWordsTable(MultimapFileLong words) {
this.words = words;
-reader = new BTreeReader(words, wordsBTreeContext);
-header = reader.getHeader(HEADER_OFFSET);
+reader = new BTreeReader(words, wordsBTreeContext, HEADER_OFFSET);
madvise();
}
@@ -49,7 +46,7 @@ public class IndexWordsTable implements AutoCloseable {
}
public long positionForWord(int wordId) {
-long offset = reader.findEntry(header, wordId);
+long offset = reader.findEntry(wordId);
if (offset < 0) {
return -1L;
@@ -60,7 +57,7 @@ public class IndexWordsTable implements AutoCloseable {
public int wordLength(int wordId) {
-long offset = reader.findEntry(header, wordId);
+long offset = reader.findEntry(wordId);
if (offset < 0) {
return -1;
}
@@ -72,7 +69,7 @@ public class IndexWordsTable implements AutoCloseable {
words.advice(NativeIO.Advice.Random);
words.advice0(NativeIO.Advice.WillNeed);
-var h = reader.getHeader(HEADER_OFFSET);
+var h = reader.getHeader();
int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs());
words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
@@ -80,8 +77,8 @@ public class IndexWordsTable implements AutoCloseable {
}
public void forEachWordsOffset(LongConsumer offsetConsumer) {
-int n = header.numEntries();
-long offset = header.dataOffsetLongs();
+int n = reader.numEntries();
+long offset = reader.getHeader().dataOffsetLongs();
for (int i = 0; i < n; i++) {
try {

View File

@@ -5,21 +5,13 @@ import com.google.inject.name.Named;
import com.upserve.uppend.blobs.NativeIO;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.util.btree.BTreeReader;
-import nu.marginalia.util.btree.CachingBTreeReader;
-import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
-import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
-import java.util.Arrays;
-import java.util.stream.LongStream;
public class SearchIndex implements AutoCloseable {
@@ -27,8 +19,6 @@ public class SearchIndex implements AutoCloseable {
private final IndexWordsTable words;
public final String name;
private final RandomAccessFile wordsFile;
-private final BTreeReader bTreeReader;
-private final CachingBTreeReader cachingBTreeReader;
private final Logger logger;
@@ -49,16 +39,13 @@ public class SearchIndex implements AutoCloseable {
urls = MultimapFileLong.forReading(inUrls.toPath());
words = IndexWordsTable.ofFile(wordsFile);
-bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
-cachingBTreeReader = new CachingBTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
-Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader));
+Schedulers.io().scheduleDirect(() -> madvise(urls));
}
-private void madvise(MultimapFileLong urls, BTreeReader reader) {
+private void madvise(MultimapFileLong urls) {
words.forEachWordsOffset(offset -> {
-var h = reader.getHeader(offset);
+var h = BTreeReader.createHeader(urls, offset);
long length = h.dataOffsetLongs() - h.indexOffsetLongs();
urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512);
@@ -70,174 +57,16 @@ public class SearchIndex implements AutoCloseable {
}
-public long numUrls(IndexQueryCachePool pool, int wordId) {
+public long numUrls(int wordId) {
int length = words.wordLength(wordId);
if (length < 0) return 0;
if (length > 0) return length;
-return rangeForWord(pool, wordId).numEntries();
+return rangeForWord(wordId).numEntries();
}
public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
IndexBTreeRange range = pool.getRange(words, wordId);
if (range == null) {
range = new IndexBTreeRange(words.positionForWord(wordId));
pool.cacheRange(words, wordId, range);
}
return range;
}
public IndexBTreeRange rangeForWord(int wordId) {
return new IndexBTreeRange(words.positionForWord(wordId));
}
public class IndexBTreeRange {
public final long dataOffset;
private BTreeHeader header;
public IndexBTreeRange(long dataOffset) {
this.dataOffset = dataOffset;
}
public LongStream stream(int bufferSize) {
if (dataOffset < 0) {
return LongStream.empty();
}
if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}
long urlOffset = header.dataOffsetLongs();
long endOffset = header.dataOffsetLongs() + header.numEntries();
int stepSize = Math.min(bufferSize, header.numEntries());
long[] buffer = new long[stepSize];
return LongStream
.iterate(urlOffset, i -> i< endOffset, i->i+stepSize)
.flatMap(pos -> {
int sz = (int)(Math.min(pos+stepSize, endOffset) - pos);
urls.read(buffer, sz, pos);
return Arrays.stream(buffer, 0, sz);
});
}
public EntrySource asEntrySource() {
return new AsEntrySource();
}
public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
return new AsExcludeQueryFilterStep(pool);
}
public LongStream stream() {
return stream(1024);
}
public boolean isPresent() {
return dataOffset >= 0;
}
public long numEntries() {
if (header != null) {
return header.numEntries();
}
else if (dataOffset < 0) return 0L;
else {
header = bTreeReader.getHeader(dataOffset);
return header.numEntries();
}
}
public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
if (dataOffset < 0) return false;
return cachingBTreeReader.findEntry(cache, url) >= 0;
}
public boolean hasUrl(IndexQueryCachePool pool, long url) {
if (dataOffset < 0)
return false;
CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
return cachingBTreeReader.findEntry(cache, url) >= 0;
}
public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
if (dataOffset < 0)
return null;
if (header == null) {
header = cachingBTreeReader.getHeader(dataOffset);
}
return cachingBTreeReader.prepareCache(header);
}
class AsEntrySource implements EntrySource {
long pos;
final long endOffset;
public SearchIndex getIndex() {
return SearchIndex.this;
};
public AsEntrySource() {
if (dataOffset <= 0) {
pos = -1;
endOffset = -1;
return;
}
if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}
pos = header.dataOffsetLongs();
endOffset = header.dataOffsetLongs() + header.numEntries();
}
@Override
public int read(long[] buffer, int n) {
if (pos >= endOffset) {
return 0;
}
int rb = Math.min(n, (int)(endOffset - pos));
urls.read(buffer, rb, pos);
pos += rb;
return rb;
}
}
class AsExcludeQueryFilterStep implements QueryFilterStepIf {
private final CachingBTreeReader.BTreeCachedIndex cache;
public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
}
public SearchIndex getIndex() {
return SearchIndex.this;
};
public double cost() {
return cache.getIndexedDataSize();
}
@Override
public boolean test(long value) {
return !hasUrl(cache, value);
}
public String describe() {
return "Exclude["+name+"]";
}
}
+public SearchIndexURLRange rangeForWord(int wordId) {
+return new SearchIndexURLRange(urls, words.positionForWord(wordId));
+}
}
@Override

View File

@ -5,7 +5,6 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory; import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery; import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory; import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -22,31 +21,14 @@ public class SearchIndexReader implements AutoCloseable {
private final IndexDomainQueryFactory domainQueryFactory; private final IndexDomainQueryFactory domainQueryFactory;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
IndexBlock.Title,
IndexBlock.Tfidf_Top,
IndexBlock.Tfidf_Middle,
IndexBlock.Tfidf_Lower,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
-            IndexBlock.Words_16Plus,
-    };

     @Inject
     public SearchIndexReader(
             EnumMap<IndexBlock, SearchIndex> indices) {
         this.indices = indices;

-        var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
-        var midIndex = indices.get(IndexBlock.Tfidf_Middle);
-        var topIndex = indices.get(IndexBlock.Tfidf_Top);
         var linkIndex = indices.get(IndexBlock.Link);
         var titleIndex = indices.get(IndexBlock.Title);
-        var siteIndex = indices.get(IndexBlock.Site);
         var metaIndex = indices.get(IndexBlock.Meta);
-        var topicIndex = indices.get(IndexBlock.Subjects);
         var words1 = indices.get(IndexBlock.Words_1);
         var words2 = indices.get(IndexBlock.Words_2);
@@ -57,7 +39,7 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders = new EnumMap<>(IndexBlock.class);

-        List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
+        List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, words1, words2, words4, words8, words16);

         queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices));
         queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices));
@@ -66,7 +48,7 @@ public class SearchIndexReader implements AutoCloseable {
         queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices));
         queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices));

-        domainQueryFactory = new IndexDomainQueryFactory(siteIndex, listOfNonNulls(topicIndex));
+        domainQueryFactory = new IndexDomainQueryFactory(indices.get(IndexBlock.Words_1));
     }

     @SafeVarargs
@@ -75,17 +57,31 @@ public class SearchIndexReader implements AutoCloseable {
     }

-    public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
+    public IndexQueryFactory.IndexQueryBuilder findWord(IndexBlock block, Integer quality, int wordId) {
         var builder = queryBuilders.get(block);

         if (builder == null)
             return null;

-        return builder.buildQuery(cachePool, wordId);
+        if (quality == null) {
+            return builder.buildQuery(wordId);
+        }
+        else {
+            return builder.buildQuery(quality, wordId);
+        }
     }

-    public IndexQuery findDomain(IndexQueryCachePool cachePool, int wordId) {
-        return domainQueryFactory.buildQuery(cachePool, wordId);
+    public IndexQueryFactory.IndexQueryBuilder findWordForDomainList(IndexBlock block, List<Integer> domains, int wordId) {
+        var builder = queryBuilders.get(block);
+
+        if (builder == null)
+            return null;
+
+        return builder.buildQuery(domains, wordId);
+    }
+
+    public IndexQuery findDomain(int wordId) {
+        return domainQueryFactory.buildQuery(wordId);
     }

     @Override
@@ -96,7 +92,7 @@ public class SearchIndexReader implements AutoCloseable {
     }

     @SneakyThrows
-    public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
+    public long numHits(IndexBlock block, int word) {
         IndexQueryFactory builder = queryBuilders.get(block);

         if (builder == null)
@@ -104,31 +100,18 @@ public class SearchIndexReader implements AutoCloseable {
         long hits = 0;

         for (var index : builder.getIndicies()) {
-            hits += index.numUrls(pool, word);
+            hits += index.numUrls(word);
         }

         return hits;
     }

-    public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
-        for (var block : indicesBySearchOrder) {
-            var index = indices.get(block);
-
-            if (null == index) {
-                continue;
-            }
-
-            if (cachePool.isUrlPresent(index, searchTerm, urlId))
-                return block;
-        }
-
-        return IndexBlock.Words_16Plus;
-    }
-
-    public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
+    public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
         final var index = indices.get(block);

-        if (null == index) return false;
-
-        return cachePool.isUrlPresent(index, searchTerm, urlId);
+        if (null == index) {
+            return new long[ids.length];
+        }
+
+        return indices.get(block).rangeForWord(termId).getMetadata(ids);
     }
 }
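The getMetadata method introduced above replaces per-block membership probes with one batched lookup: a single metadata long per document id, with a zero-filled array standing in for a missing index. A minimal standalone sketch of that pattern follows; a plain Map stands in for the per-word B-tree range, and none of the names below come from the codebase.

import java.util.Map;

class MetadataLookupSketch {
    // index == null models a block with no data; 0L is the per-id "absent" sentinel.
    static long[] getMetadata(Map<Long, Long> index, long[] ids) {
        long[] out = new long[ids.length]; // Java zero-fills, matching the fallback above
        if (index == null)
            return out;
        for (int i = 0; i < ids.length; i++) {
            out[i] = index.getOrDefault(ids[i], 0L);
        }
        return out;
    }
}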

View File

@@ -0,0 +1,100 @@
package nu.marginalia.wmsa.edge.index.reader;

import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.btree.BTreeReader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromBTree;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromMapRange;

import javax.annotation.Nullable;

import static nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags.*;

public class SearchIndexURLRange {
    public final long dataOffset;
    private final MultimapFileLong urlsFile;

    @Nullable
    private final BTreeReader reader;

    public SearchIndexURLRange(MultimapFileLong urlsFile, long dataOffset) {
        this.dataOffset = dataOffset;
        this.urlsFile = urlsFile;

        if (dataOffset >= 0) {
            this.reader = new BTreeReader(urlsFile, SearchIndexConverter.urlsBTreeContext, dataOffset);
        } else {
            this.reader = null;
        }
    }

    public EntrySource asPrefixSource(long prefix, long prefixNext) {
        if (reader == null)
            return new EmptyEntrySource();

        LongLongImmutablePair startAndEnd = reader.getRangeForPrefix(prefix, prefixNext);

        if (startAndEnd.firstLong() == startAndEnd.secondLong()) {
            return new EmptyEntrySource();
        }

        return new EntrySourceFromMapRange(urlsFile, startAndEnd.firstLong(), startAndEnd.secondLong());
    }

    public EntrySource asEntrySource() {
        return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, null);
    }

    public EntrySource asQualityLimitingEntrySource(int limit) {
        return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, limit);
    }

    public EntrySource asDomainEntrySource() {
        return new EntrySourceFromBTree(reader, Subjects.asBit() | Site.asBit() | Title.asBit(), null);
    }

    public boolean isPresent() {
        return dataOffset >= 0;
    }

    public long numEntries() {
        if (reader == null)
            return 0L;

        return reader.numEntries();
    }

    public void retainUrls(BTreeQueryBuffer buffer) {
        if (reader != null)
            reader.retainEntries(buffer);
    }

    public void rejectUrls(BTreeQueryBuffer buffer) {
        if (reader != null)
            reader.rejectEntries(buffer);
    }

    public boolean hasUrl(long url) {
        if (reader == null)
            return false;

        return reader.findEntry(url) >= 0;
    }

    public long[] getMetadata(long[] urls) {
        if (reader == null) {
            return new long[urls.length];
        }

        return reader.queryData(urls, 1);
    }

    @Override
    public String toString() {
        return String.format("BTreeRange(@" + dataOffset + ", size = " + numEntries() + ")");
    }
}
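asPrefixSource above turns a (prefix, prefixNext) pair into a contiguous slice of the word's URL range, or an EmptyEntrySource when the slice is empty. Below is a minimal sketch of the underlying idea, assuming the entries behave like a sorted long array; the actual getRangeForPrefix walks a B-tree, so this is an illustration of the technique rather than the project's implementation.

import java.util.Arrays;

class PrefixRangeSketch {
    /** Returns {start, end} indices of entries in [prefix, prefixNext), or {0, 0} if none. */
    static long[] rangeForPrefix(long[] sorted, long prefix, long prefixNext) {
        int start = lowerBound(sorted, prefix);
        int end = lowerBound(sorted, prefixNext);
        return start == end ? new long[] {0, 0} : new long[] {start, end};
    }

    // First index whose value is >= key.
    static int lowerBound(long[] a, long key) {
        int idx = Arrays.binarySearch(a, key);
        if (idx >= 0) {
            while (idx > 0 && a[idx - 1] == key) idx--; // rewind over duplicates
            return idx;
        }
        return -(idx + 1); // insertion point for a missing key
    }
}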

View File

@@ -0,0 +1,111 @@
package nu.marginalia.wmsa.edge.index.svc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Histogram;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.util.OptionalInt;

import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static spark.Spark.halt;

@Singleton
public class EdgeIndexDomainQueryService {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

    private final Gson gson = GsonFactory.get();
    private final SearchIndexes indexes;

    @Inject
    public EdgeIndexDomainQueryService(SearchIndexes indexes) {
        this.indexes = indexes;
    }

    public Object searchDomain(Request request, Response response) {
        if (indexes.getLexiconReader() == null) {
            logger.warn("Dictionary reader not yet initialized");
            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
        }

        String json = request.body();
        EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

        try {
            return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
        }
        catch (HaltException ex) {
            logger.warn("Halt", ex);
            throw ex;
        }
        catch (Exception ex) {
            logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
            logger.info("Error", ex);
            Spark.halt(500, "Error");
            return null;
        }
    }

    public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {

        final OptionalInt wordId = lookUpWord(specsSet.keyword);

        final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();
        final IndexSearchBudget budget = new IndexSearchBudget(50);

        if (wordId.isEmpty()) {
            return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
        }

        BTreeQueryBuffer buffer = new BTreeQueryBuffer(512);

        for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {
            final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);

            var query = indexes.getBucket(bucket).getDomainQuery(wordId.getAsInt(), localFilter);

            while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
                query.getMoreResults(buffer);

                for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) {
                    long result = buffer.data[i];

                    if (localFilter.test(result)) {
                        urlIds.add((int) (result & 0xFFFF_FFFFL));
                    }
                }
            }
        }

        return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
    }

    private OptionalInt lookUpWord(String s) {
        int ret = indexes.getLexiconReader().get(s);
        if (ret == DictionaryHashMap.NO_VALUE) {
            return OptionalInt.empty();
        }
        return OptionalInt.of(ret);
    }
}
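queryDomain above masks each result with 0xFFFF_FFFFL before storing it as a URL id, which implies the 64-bit values pack a URL id in the low half. A small sketch of that unpacking follows; the claim that the high half carries the domain part is an assumption for illustration, not something this file states.

class CombinedIdSketch {
    static int urlId(long combined) {
        return (int) (combined & 0xFFFF_FFFFL); // low 32 bits, as in queryDomain()
    }

    static long highBits(long combined) {
        return combined >>> 32; // assumed here to hold the domain/ranking part
    }

    public static void main(String[] args) {
        long combined = (42L << 32) | 1337L; // hypothetical: high part 42, URL id 1337
        System.out.println(urlId(combined));    // 1337
        System.out.println(highBits(combined)); // 42
    }
}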

View File

@@ -5,6 +5,7 @@ import com.google.inject.Singleton;
 import com.google.protobuf.InvalidProtocolBufferException;
 import nu.marginalia.util.ListChunker;
 import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
 import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
@@ -21,7 +22,6 @@ import spark.Request;
 import spark.Response;

 import java.util.Arrays;
-import java.util.List;

 @Singleton
 public class EdgeIndexLexiconService {
@@ -35,6 +35,11 @@ public class EdgeIndexLexiconService {
         this.keywordLexicon = servicesFactory.getKeywordLexicon();
     }

+    public EdgeIndexLexiconService(SearchIndexes indexes, KeywordLexicon lexicon) {
+        this.indexes = indexes;
+        this.keywordLexicon = lexicon;
+    }
+
     public Object getWordId(Request request, Response response) {
         final String word = request.splat()[0];
@@ -73,31 +78,37 @@ public class EdgeIndexLexiconService {
     public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
                          IndexPutKeywordsReq.WordSet words, int idx
     ) {
         SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);

         IndexBlock block = IndexBlock.values()[words.getIndex()];

-        for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
-            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
+        var wordArray = words.getWordsList().toArray(String[]::new);
+        var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray();
+
+        DocumentKeywords documentKeywords = new DocumentKeywords(block, wordArray, metaArray);
+        for (var chunk : ListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) {
+            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
             var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

             indexWriter.put(header, entry);
         }
     }

-    private long[] getOrInsertWordIds(List<String> words) {
-        long[] ids = new long[words.size()];
+    private long[] getOrInsertWordIds(String[] words, long[] meta) {
+        long[] ids = new long[words.length*2];
         int putIdx = 0;

-        for (String word : words) {
+        for (int i = 0; i < words.length; i++) {
+            String word = words[i];
+
             long id = keywordLexicon.getOrInsert(word);
             if (id != DictionaryHashMap.NO_VALUE) {
                 ids[putIdx++] = id;
+                ids[putIdx++] = meta[i];
             }
         }

-        if (putIdx != words.size()) {
+        if (putIdx != words.length*2) {
             ids = Arrays.copyOf(ids, putIdx);
         }

         return ids;
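The rewritten getOrInsertWordIds above interleaves word ids with their metadata in a single long array: even slots hold ids, odd slots hold the matching metadata longs, words the lexicon rejects are skipped, and the array is trimmed if anything was skipped. A minimal standalone sketch of that packing, with a plain Map standing in for KeywordLexicon.getOrInsert():

import java.util.Arrays;
import java.util.Map;

class InterleavedIdsSketch {
    static final long NO_VALUE = -1; // stands in for DictionaryHashMap.NO_VALUE

    static long[] pack(Map<String, Long> lexicon, String[] words, long[] meta) {
        long[] ids = new long[words.length * 2];
        int putIdx = 0;
        for (int i = 0; i < words.length; i++) {
            long id = lexicon.getOrDefault(words[i], NO_VALUE);
            if (id != NO_VALUE) {
                ids[putIdx++] = id;      // even slots: word id
                ids[putIdx++] = meta[i]; // odd slots: its metadata
            }
        }
        // Shrink if any word was skipped, so consumers can rely on ids.length being exact.
        return putIdx == ids.length ? ids : Arrays.copyOf(ids, putIdx);
    }
}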

View File

@@ -7,22 +7,23 @@ import gnu.trove.set.hash.TIntHashSet;
 import io.prometheus.client.Counter;
 import io.prometheus.client.Gauge;
 import io.prometheus.client.Histogram;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.longs.LongAVLTreeSet;
+import nu.marginalia.util.btree.BTreeQueryBuffer;
 import nu.marginalia.util.dict.DictionaryHashMap;
 import nu.marginalia.wmsa.client.GsonFactory;
-import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
-import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
 import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
 import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
 import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
 import nu.marginalia.wmsa.edge.model.search.*;
-import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
-import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
 import org.apache.http.HttpStatus;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -36,7 +37,6 @@ import java.util.function.LongPredicate;
 import java.util.stream.Collectors;

 import static java.util.Comparator.comparing;
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
 import static spark.Spark.halt;

 @Singleton
@@ -50,7 +50,6 @@ public class EdgeIndexQueryService {

     private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register();
     private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();
-    private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

     private final Gson gson = GsonFactory.get();
@@ -61,30 +60,6 @@ public class EdgeIndexQueryService {
         this.indexes = indexes;
     }

-    public Object searchDomain(Request request, Response response) {
-        if (indexes.getLexiconReader() == null) {
-            logger.warn("Dictionary reader not yet initialized");
-            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
-        }
-
-        String json = request.body();
-        EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
-
-        try {
-            return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
-        }
-        catch (HaltException ex) {
-            logger.warn("Halt", ex);
-            throw ex;
-        }
-        catch (Exception ex) {
-            logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
-            logger.info("Error", ex);
-            Spark.halt(500, "Error");
-            return null;
-        }
-    }
-
     public Object search(Request request, Response response) {
         if (indexes.getLexiconReader() == null) {
             logger.warn("Dictionary reader not yet initialized");
@@ -94,6 +69,7 @@ public class EdgeIndexQueryService {
         String json = request.body();
         EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);

         try {
             return wmsa_edge_index_query_time.time(() -> query(specsSet));
         }
@@ -117,51 +93,20 @@ public class EdgeIndexQueryService {
         wmsa_edge_index_query_cost.set(searchQuery.getDataCost());

+        if (!searchQuery.hasTimeLeft()) {
+            wmsa_edge_index_query_timeouts.inc();
+        }
+
         return new EdgeSearchResultSet(results);
     }

-    public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
-        final OptionalInt wordId = lookUpWord(specsSet.keyword);
-
-        final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();
-        final IndexQueryCachePool pool = new IndexQueryCachePool();
-        final IndexSearchBudget budget = new IndexSearchBudget(50);
-
-        if (wordId.isEmpty()) {
-            return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
-        }
-
-        for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {
-            final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);
-            var query = indexes.getBucket(bucket).getDomainQuery(pool, wordId.getAsInt(), localFilter);
-
-            long[] buffer = new long[512];
-            while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
-                int cnt = query.getMoreResults(buffer, budget);
-
-                for (int i = 0; i < cnt && urlIds.size() < specsSet.maxResults; i++) {
-                    long result = buffer[i];
-
-                    if (localFilter.test(result)) {
-                        urlIds.add((int) (result & 0xFFFF_FFFFL));
-                    }
-                }
-            }
-        }
-
-        return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
-    }
     private class SearchQuery {
         private final int fetchSize;
         private final TIntHashSet seenResults;
         private final EdgeSearchSpecification specsSet;
         private final IndexSearchBudget budget;
-        private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
+        private final Integer qualityLimit;
+        private final Integer rankLimit;
         private long dataCost = 0;

         public SearchQuery(EdgeSearchSpecification specsSet) {
@@ -169,6 +114,8 @@ public class EdgeIndexQueryService {
             this.budget = new IndexSearchBudget(specsSet.timeoutMs);
             this.fetchSize = specsSet.fetchSize;
             this.seenResults = new TIntHashSet(fetchSize, 0.5f);
+            this.qualityLimit = specsSet.quality;
+            this.rankLimit = specsSet.rank;
         }

         private List<EdgeSearchResultItem> execute() {
@@ -178,22 +125,18 @@ public class EdgeIndexQueryService {
                 results.addAll(performSearch(sq));
             }

+            final SearchTermEvaluator evaluator = new SearchTermEvaluator(specsSet, results);
             for (var result : results) {
-                addResultScores(result);
+                evaluator.addResultScores(result);
             }

-            if (!budget.hasTimeLeft()) {
-                wmsa_edge_index_query_timeouts.inc();
+            return createResultList(results);
         }

+        private List<EdgeSearchResultItem> createResultList(Set<EdgeSearchResultItem> results) {
             var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);

-            if (WmsaHome.isDebug()) {
-                cachePool.printSummary(logger);
-            }
-            cachePool.clear();
-
             List<EdgeSearchResultItem> resultList = results.stream()
                     .sorted(
                             comparing(EdgeSearchResultItem::getScore)
@@ -204,6 +147,9 @@ public class EdgeIndexQueryService {
                     .collect(Collectors.toList());

             if (resultList.size() > specsSet.getLimitTotal()) {
+                // This can't be made a stream limit() operation because we need domainCountFilter
+                // to run over the entire list to provide accurate statistics
                 resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
             }
@@ -219,16 +165,20 @@ public class EdgeIndexQueryService {
         {
             final List<EdgeSearchResultItem> results = new ArrayList<>(fetchSize);

-            final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
+            final SearchTerms searchTerms = getSearchTerms(sq);

-            if (searchTerms.isEmpty())
+            if (searchTerms.isEmpty()) {
                 return Collections.emptyList();
+            }
+
+            final BTreeQueryBuffer buffer = new BTreeQueryBuffer(fetchSize);

             for (int indexBucket : specsSet.buckets) {
                 final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);

                 if (!budget.hasTimeLeft()) {
-                    logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
+                    logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}",
+                            indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
                     continue;
                 }
@@ -237,20 +187,22 @@ public class EdgeIndexQueryService {
                     break;
                 }

-                IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
-                long[] buf = new long[fetchSize];
+                IndexQueryParams queryParams = new IndexQueryParams(sq.block, searchTerms, qualityLimit, rankLimit, specsSet.domains);
+                IndexQuery query = getQuery(indexBucket, localFilter::filterRawValue, queryParams);

                 while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
-                    int cnt = query.getMoreResults(buf, budget);
+                    buffer.reset();
+                    query.getMoreResults(buffer);

-                    for (int i = 0; i < cnt && results.size() < fetchSize; i++) {
-                        final long id = buf[i];
+                    for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) {
+                        final long id = buffer.data[i];

                         if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
                             continue;
                         }

-                        results.add(new EdgeSearchResultItem(indexBucket, id));
+                        results.add(new EdgeSearchResultItem(indexBucket, sq.block, id));
                     }
                 }
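The loop above drains the query in buffer-sized batches, deduplicating on the low 32 bits of each id the way seenResults does. A simplified standalone version with plain collections in place of IndexQuery and BTreeQueryBuffer follows; all names here are stand-ins, not project classes.

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

class FetchLoopSketch {
    static List<Long> fetch(Iterator<long[]> batches, int fetchSize) {
        List<Long> results = new ArrayList<>(fetchSize);
        Set<Integer> seen = new HashSet<>(); // stands in for the TIntHashSet of url ids
        while (batches.hasNext() && results.size() < fetchSize) {
            long[] batch = batches.next(); // stands in for buffer.reset() + getMoreResults(buffer)
            for (int i = 0; i < batch.length && results.size() < fetchSize; i++) {
                long id = batch[i];
                if (seen.add((int) (id & 0xFFFF_FFFFL))) { // dedup on the url-id half
                    results.add(id);
                }
            }
        }
        return results;
    }
}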
@@ -261,40 +213,127 @@ public class EdgeIndexQueryService {
             return results;
         }

-        private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
-                                    LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
+        private IndexQuery getQuery(int bucket, LongPredicate filter, IndexQueryParams params) {
             if (!indexes.isValidBucket(bucket)) {
                 logger.warn("Invalid bucket {}", bucket);
                 return new IndexQuery(Collections.emptyList());
             }

-            return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
+            return indexes.getBucket(bucket).getQuery(filter, params);
         }

-        private void addResultScores(EdgeSearchResultItem searchResult) {
+        public boolean hasTimeLeft() {
+            return budget.hasTimeLeft();
+        }
+
+        private record IndexAndBucket(IndexBlock block, int bucket) {}
+
+        public long getDataCost() {
+            return dataCost;
+        }
+
+        record ResultTerm (int bucket, int termId, long combinedUrlId) {}
+    }
+
+    public class SearchTermEvaluator {
+        private static final EdgePageWordMetadata blankMetadata = new EdgePageWordMetadata(EdgePageWordMetadata.emptyValue());
+
+        private final Map<SearchQuery.ResultTerm, EdgePageWordMetadata> termData = new HashMap<>(16);
+
+        private final List<List<String>> searchTermVariants;
+
+        public SearchTermEvaluator(EdgeSearchSpecification specsSet, Set<EdgeSearchResultItem> results) {
+            this.searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+
+            final int[] termIdsAll = getIncludeTermIds(specsSet);
+
+            Map<SearchQuery.IndexAndBucket, LongAVLTreeSet> resultIdsByBucket = new HashMap<>(7);
+
+            for (int termId : termIdsAll) {
+                for (var result: results) {
+                    resultIdsByBucket
+                            .computeIfAbsent(new SearchQuery.IndexAndBucket(result.block, result.bucketId),
+                                    id -> new LongAVLTreeSet())
+                            .add(result.combinedId);
+                }
+
+                resultIdsByBucket.forEach((indexAndBucket, resultIds) ->
+                        loadMetadata(termId, indexAndBucket.bucket, indexAndBucket.block, resultIds));
+
+                resultIdsByBucket.clear();
+            }
+        }
+
+        private int[] getIncludeTermIds(EdgeSearchSpecification specsSet) {
             final var reader = Objects.requireNonNull(indexes.getLexiconReader());

-            List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+            final List<String> terms = specsSet.allIncludeSearchTerms();
+            final IntList ret = new IntArrayList(terms.size());

-            // Memoize calls to getTermData, as they're somewhat expensive and highly redundant
-            Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
+            for (var term : terms) {
+                int id = reader.get(term);
+
+                if (id >= 0)
+                    ret.add(id);
+            }
+
+            return ret.toIntArray();
+        }
+
+        private void loadMetadata(int termId, int bucket, IndexBlock indexBlock,
+                                  LongAVLTreeSet docIdsMissingMetadata)
+        {
+            EdgeIndexBucket index = indexes.getBucket(bucket);
+
+            if (docIdsMissingMetadata.isEmpty())
+                return;
+
+            long[] ids = docIdsMissingMetadata.toLongArray();
+            long[] metadata = index.getMetadata(indexBlock, termId, ids);
+
+            for (int i = 0; i < metadata.length; i++) {
+                if (metadata[i] == 0L)
+                    continue;
+
+                termData.put(
+                        new SearchQuery.ResultTerm(bucket, termId, ids[i]),
+                        new EdgePageWordMetadata(metadata[i])
+                );
+
+                docIdsMissingMetadata.remove(ids[i]);
+            }
+        }
+
+        public void addResultScores(EdgeSearchResultItem searchResult) {
+            final var reader = Objects.requireNonNull(indexes.getLexiconReader());

             double bestScore = 0;

             for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
                 double setScore = 0;
                 int setSize = 0;
-                for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
+
+                var termList = searchTermVariants.get(searchTermListIdx);
+                for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
+                    String searchTerm = termList.get(termIdx);
+
                     final int termId = reader.get(searchTerm);

-                    ResultTermData data = termMetadata.computeIfAbsent(
-                            new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
+                    var key = new SearchQuery.ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId());
+                    var metadata = termData.getOrDefault(key, blankMetadata);
+
+                    EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(searchTermListIdx, searchTerm, metadata);

-                    var score = data.asScore(searchTermListIdx, searchTerm);
                     searchResult.scores.add(score);
-                    setScore += score.value();
+
+                    setScore += score.termValue();
+
+                    if (termIdx == 0) {
+                        setScore += score.documentValue();
+                    }
+
                     setSize++;
                 }
                 bestScore = Math.min(bestScore, setScore/setSize);
@@ -303,64 +342,27 @@ public class EdgeIndexQueryService {
             searchResult.setScore(bestScore);
         }

-        private ResultTermData getTermData(ResultTerm resultTerm) {
-            final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
-
-            final int termId = resultTerm.termId;
-            final long combinedUrlId = resultTerm.combinedUrlId;
-
-            return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
-                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
-            );
     }

-        public long getDataCost() {
-            return dataCost;
-        }
-
-        record ResultTerm (int bucket, int termId, long combinedUrlId) {}
-
-        record ResultTermData (IndexBlock index,
-                               boolean title,
-                               boolean link,
-                               boolean site,
-                               boolean subject,
-                               boolean name,
-                               boolean high,
-                               boolean mid,
-                               boolean low
-        ) {
-            public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
-                return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
-            }
-        }
-    }
-
-    private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
-        final List<Integer> excludes = new ArrayList<>();
-        final List<Integer> includes = new ArrayList<>();
+    private SearchTerms getSearchTerms(EdgeSearchSubquery request) {
+        final IntList excludes = new IntArrayList();
+        final IntList includes = new IntArrayList();

         for (var include : request.searchTermsInclude) {
             var word = lookUpWord(include);
             if (word.isEmpty()) {
                 logger.debug("Unknown search term: " + include);
-                return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
+                return new SearchTerms();
             }
             includes.add(word.getAsInt());
         }

         for (var advice : request.searchTermsAdvice) {
             var word = lookUpWord(advice);
             if (word.isEmpty()) {
                 logger.debug("Unknown search term: " + advice);
-                return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
+                return new SearchTerms();
             }
             includes.add(word.getAsInt());
         }
@@ -369,7 +371,26 @@ public class EdgeIndexQueryService {
             lookUpWord(exclude).ifPresent(excludes::add);
         }

-        return new EdgeIndexSearchTerms(includes, excludes);
+        return new SearchTerms(includes, excludes);
+    }
+
+    public record SearchTerms(IntList includes, IntList excludes) {
+        public SearchTerms() {
+            this(IntList.of(), IntList.of());
+        }
+
+        public boolean isEmpty() {
+            return includes.isEmpty();
+        }
+
+        public int[] sortedDistinctIncludes(IntComparator comparator) {
+            if (includes.isEmpty())
+                return includes.toIntArray();
+
+            IntList list = new IntArrayList(new IntOpenHashSet(includes));
+            list.sort(comparator);
+            return list.toIntArray();
+        }
     }
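For reference, the scoring fold in addResultScores reduces to: average the per-term values within each search-term variant set, count the document-level component once per set (the termIdx == 0 branch), and keep the minimum across sets. A toy version follows, assuming lower scores rank better, as the ascending sort in createResultList suggests.

class ScoreFoldSketch {
    static double bestScore(double[][] termValuesPerSet, double documentValue) {
        double best = 0;
        for (double[] set : termValuesPerSet) {
            double setScore = documentValue; // counted once per set, like termIdx == 0
            for (double v : set) {
                setScore += v;
            }
            best = Math.min(best, setScore / set.length); // keep the best (lowest) average
        }
        return best;
    }

    public static void main(String[] args) {
        double[][] sets = { { -2.0, -1.0 }, { -0.5 } };
        System.out.println(bestScore(sets, -0.25)); // -1.625: the first set wins
    }
}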

Some files were not shown because too many files have changed in this diff.