October Release (#118)

Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/118

parent 9a7d052c43
commit df49ccbe59
@@ -175,7 +175,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage"));
     }
@@ -249,7 +249,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=browse:wikipedia.local");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
     }
@@ -259,7 +259,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=define:adiabatic");
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
     }
@@ -269,7 +269,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         driver.get("http://proxyNginx/search?query=3%2B3");
        System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("eval"));
     }
marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java (new file, 313 lines)
@@ -0,0 +1,313 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import nu.marginalia.util.AndCardIntSet;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
public class BitSetTest {
|
||||
@org.openjdk.jmh.annotations.State(Scope.Benchmark)
|
||||
public static class State {
|
||||
List<RoaringBitmap> roar = new ArrayList<>();
|
||||
List<AndCardIntSet> acbs = new ArrayList<>();
|
||||
|
||||
List<RoaringBitmap> roarLow = new ArrayList<>();
|
||||
List<RoaringBitmap> roarHigh = new ArrayList<>();
|
||||
|
||||
List<AndCardIntSet> acbsLow = new ArrayList<>();
|
||||
List<AndCardIntSet> acbsHigh = new ArrayList<>();
|
||||
|
||||
@Setup(Level.Trial)
|
||||
public void setUp() {
|
||||
var rand = new Random();
|
||||
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int card = 1 + rand.nextInt(10);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbsLow.add(cbs);
|
||||
roarLow.add(rb);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
int card = 1 + rand.nextInt(10000, 20000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
}
|
||||
acbsHigh.add(AndCardIntSet.of(rb));
|
||||
roarHigh.add(rb);
|
||||
}
|
||||
|
||||
|
||||
|
||||
for (int i = 0; i < 100000; i++) {
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
int card = 1 + rand.nextInt(10);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
int card = 1 + rand.nextInt(100);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int card = 1 + rand.nextInt(1000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
for (int i = 0; i < 100; i++) {
|
||||
int card = 1 + rand.nextInt(10000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
int card = 1 + rand.nextInt(100000);
|
||||
|
||||
var rb = new RoaringBitmap();
|
||||
var cbs = new AndCardIntSet();
|
||||
|
||||
for (int j = 0; j < card; j++) {
|
||||
int val = rand.nextInt(1_000_000);
|
||||
rb.add(val);
|
||||
cbs.add(val);
|
||||
}
|
||||
acbs.add(cbs);
|
||||
roar.add(rb);
|
||||
}
|
||||
Collections.shuffle(acbs);
|
||||
Collections.shuffle(roar);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 5, warmups = 5)
|
||||
// public Object roaringCard(State state) {
|
||||
// long val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size(); i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
// val += RoaringBitmap.andCardinality(state.roar.get(i), state.roar.get(j));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 2, warmups = 2)
|
||||
// public Object roaringCardNorm(State state) {
|
||||
// long val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size()/1000; i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
//
|
||||
// var a = state.roar.get(i);
|
||||
// var b = state.roar.get(j);
|
||||
// val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 5, warmups = 5)
|
||||
// public Object cbsCard(State state) {
|
||||
// long val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size(); i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
// val += AndCardIntSet.andCardinality(state.acbs.get(i), state.acbs.get(j));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
//
|
||||
// @Benchmark
|
||||
// @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// public Object cbsCardNorm(State state) {
|
||||
// double val = 0;
|
||||
//
|
||||
// for (int i = 0; i < state.roar.size()/1000; i++) {
|
||||
// for (int j = i+1; j < state.roar.size(); j++) {
|
||||
// var a = state.acbs.get(i);
|
||||
// var b = state.acbs.get(j);
|
||||
// val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.cardinality()*b.cardinality()));
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// return val;
|
||||
// }
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object cbsLowLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.acbsLow.size(); i++) {
|
||||
for (int j = 0; j < state.acbsLow.size(); j++) {
|
||||
var a = state.acbsLow.get(i);
|
||||
var b = state.acbsLow.get(j);
|
||||
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object cbsHighHigh(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.acbsHigh.size(); i++) {
|
||||
for (int j = 0; j < state.acbsHigh.size(); j++) {
|
||||
var a = state.acbsHigh.get(i);
|
||||
var b = state.acbsHigh.get(j);
|
||||
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object cbsHighLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.acbsHigh.size(); i++) {
|
||||
for (int j = 0; j < state.acbsLow.size(); j++) {
|
||||
var a = state.acbsHigh.get(i);
|
||||
var b = state.acbsLow.get(j);
|
||||
val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object roarLowLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.roarLow.size(); i++) {
|
||||
for (int j = 0; j < state.roarLow.size(); j++) {
|
||||
var a = state.roarLow.get(i);
|
||||
var b = state.roarLow.get(j);
|
||||
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object roarHighLow(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.roarHigh.size(); i++) {
|
||||
for (int j = 0; j < state.roarLow.size(); j++) {
|
||||
var a = state.roarHigh.get(i);
|
||||
var b = state.roarLow.get(j);
|
||||
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
@Benchmark
|
||||
@BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
public Object roarHighHigh(State state) {
|
||||
double val = 0;
|
||||
|
||||
for (int i = 0; i < state.roarHigh.size(); i++) {
|
||||
for (int j = 0; j < state.roarHigh.size(); j++) {
|
||||
var a = state.roarHigh.get(i);
|
||||
var b = state.roarHigh.get(j);
|
||||
val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
|
||||
}
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
}
|
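The class above is a JMH harness comparing RoaringBitmap and AndCardIntSet intersection costs across low- and high-cardinality sets. A hedged sketch of launching it through the standard JMH runner API follows; the launcher class and option values are assumptions, not part of the commit or the repository's build setup.

    import org.openjdk.jmh.runner.Runner;
    import org.openjdk.jmh.runner.options.OptionsBuilder;

    public class BitSetBenchmarkMain {                    // hypothetical launcher, not in the commit
        public static void main(String[] args) throws Exception {
            new Runner(new OptionsBuilder()
                    .include("nu.marginalia.BitSetTest")  // regex matching the benchmark class above
                    .forks(1)                             // fewer forks than the annotations request, for a quick local run
                    .build())
                    .run();
        }
    }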
@ -1,85 +0,0 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import org.openjdk.jmh.annotations.*;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class ByteBufferBlockReadVsIndividualRead {
|
||||
|
||||
@State(Scope.Benchmark)
|
||||
public static class ByteBufferState {
|
||||
private MultimapFileLong mmf;
|
||||
private Path file;
|
||||
private static final int size = 800*1024*1024;
|
||||
@Setup(Level.Iteration)
|
||||
@SneakyThrows
|
||||
public void setUp() {
|
||||
file = Files.createTempFile("jmh", ".dat");
|
||||
mmf = MultimapFileLong.forOutput(file, size);
|
||||
for (int i = 0; i < size; i++) {
|
||||
mmf.put(i, i);
|
||||
}
|
||||
}
|
||||
|
||||
@TearDown(Level.Iteration)
|
||||
@SneakyThrows
|
||||
public void tearDown() {
|
||||
mmf.close();
|
||||
Files.delete(file);
|
||||
}
|
||||
|
||||
LongStream basicStream() {
|
||||
return IntStream.range(0, size).mapToLong(mmf::get);
|
||||
}
|
||||
|
||||
LongStream blockStream(int blockSize) {
|
||||
long urlOffset = 0;
|
||||
long endOffset = size;
|
||||
|
||||
long[] arry = new long[blockSize];
|
||||
|
||||
return LongStream
|
||||
.iterate(urlOffset, i -> i< endOffset, i->i+blockSize)
|
||||
.flatMap(pos -> {
|
||||
int sz = (int)(Math.min(pos+blockSize, endOffset) - pos);
|
||||
mmf.read(arry, sz, pos);
|
||||
return Arrays.stream(arry, 0, sz);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// @Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
// @Fork(value = 1, warmups = 1)
|
||||
// @Warmup(iterations = 1)
|
||||
public long testBasic(ByteBufferState state) {
|
||||
return state.basicStream().sum();
|
||||
}
|
||||
|
||||
|
||||
@Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
@Warmup(iterations = 0)
|
||||
public long testBlock128(ByteBufferState state) {
|
||||
return state.blockStream(128).sum();
|
||||
}
|
||||
@Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
@Warmup(iterations = 0)
|
||||
public long testBlock1024(ByteBufferState state) {
|
||||
return state.blockStream(1024).sum();
|
||||
}
|
||||
@Benchmark @BenchmarkMode(Mode.Throughput)
|
||||
@Fork(value = 1, warmups = 1)
|
||||
@Warmup(iterations = 0)
|
||||
public long testBlock8192(ByteBufferState state) {
|
||||
return state.blockStream(8192).sum();
|
||||
}
|
||||
}
|
@@ -0,0 +1,205 @@
package nu.marginalia.util;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.set.hash.TIntHashSet;
import org.roaringbitmap.RoaringBitmap;


public class AndCardIntSet {
    final TIntArrayList backingList;
    long hash;

    public AndCardIntSet() {
        backingList = new TIntArrayList(16);
        backingList.sort();
    }

    public static AndCardIntSet of(int... list) {
        var set = new TIntHashSet(list);
        TIntArrayList lst = new TIntArrayList(set);
        lst.sort();

        return new AndCardIntSet(lst);
    }

    public static AndCardIntSet of(RoaringBitmap bmap) {

        TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
        lst.addAll(bmap.toArray());

        return new AndCardIntSet(lst);
    }


    private AndCardIntSet(TIntArrayList list) {
        backingList = list;
        hash = 0;

        if (list.size() < 128) {
            for (int v : list.toArray()) {
                int bit = hasher.hashInt(v).asInt() % 64;
                hash |= (1L << bit);
            }
        }
        else {
            hash = ~0L;
        }

    }

    private static final HashFunction hasher = Hashing.murmur3_128(0);

    public boolean add(int val) {
        if (!contains(val)) {
            return false;
        }

        if (backingList.size() < 128) {
            int bit = hasher.hashInt(val).asInt() % 64;
            hash |= (1L << bit);
        }
        else {
            hash = ~0L;
        }
        backingList.add(val);
        backingList.sort();
        return true;
    }

    public boolean contains(int val) {
        return backingList.binarySearch(val) >= 0;
    }

    public int getCardinality() {
        return backingList.size();
    }

    public static int andCardinality(AndCardIntSet a, AndCardIntSet b) {

        if (!testHash(a,b)) {
            return 0;
        }

        if (a.getCardinality() + b.getCardinality() < 10) {
            return andLinearSmall(a, b);
        }

        return andLinear(a,b);
    }

    private static int andLinearSmall(AndCardIntSet a, AndCardIntSet b) {
        int sum = 0;
        for (int i = 0; i < a.getCardinality(); i++) {
            for (int j = 0; j < b.getCardinality(); j++) {
                if (a.backingList.getQuick(i) == b.backingList.getQuick(j))
                    sum++;
            }
        }
        return sum;
    }

    private static int andLinear(AndCardIntSet a, AndCardIntSet b) {

        int i = 0, j = 0;
        int card = 0;

        do {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);

            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                i++;
                j++;
                card++;
            }
        } while (i < a.getCardinality() && j < b.getCardinality());

        return card;

    }

    private static boolean testHash(AndCardIntSet a, AndCardIntSet b) {
        return (a.hash & b.hash) != 0;
    }

    public boolean cardinalityExceeds(int val) {
        return getCardinality() >= val;
    }

    public static AndCardIntSet and(AndCardIntSet a, AndCardIntSet b) {
        int i = 0;
        int j = 0;

        TIntArrayList andVals = new TIntArrayList(1 + (int)Math.sqrt(a.getCardinality()));

        while (i < a.getCardinality() && j < b.getCardinality()) {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                andVals.add(a.backingList.getQuick(i));
                i++;
                j++;
            }
        }

        return new AndCardIntSet(andVals);
    }

    public static double weightedProduct(float[] weights, AndCardIntSet a, AndCardIntSet b) {
        int i = 0;
        int j = 0;

        double sum = 0;

        if (a.getCardinality() + b.getCardinality() < 10) {
            return weightedProductSmall(weights, a, b);
        }

        do {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                sum += weights[a.backingList.getQuick(i)];
                i++;
                j++;
            }
        } while (i < a.getCardinality() && j < b.getCardinality());

        return sum;
    }


    private static double weightedProductSmall(float[] weights, AndCardIntSet a, AndCardIntSet b) {
        double sum = 0;

        for (int i = 0; i < a.getCardinality(); i++) {
            for (int j = 0; j < b.getCardinality(); j++) {
                int av = a.backingList.getQuick(i);
                int bv = b.backingList.getQuick(j);
                if (av == bv)
                    sum+=weights[av];
            }
        }

        return sum;
    }

    public double mulAndSum(float[] weights) {
        double sum = 0;
        for (int i = 0; i < backingList.size(); i++) {
            sum += weights[backingList.getQuick(i)];
        }
        return sum;
    }

    public int[] toArray() {
        return backingList.toArray();
    }

    public TIntArrayList values() {
        return backingList;
    }
}
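For illustration, a minimal sketch of how the class above can be used; only methods that appear in the diff are exercised, and the element values and weights are invented.

    import nu.marginalia.util.AndCardIntSet;
    import org.roaringbitmap.RoaringBitmap;

    class AndCardIntSetExample {                        // hypothetical, not part of the commit
        public static void main(String[] args) {
            AndCardIntSet a = AndCardIntSet.of(RoaringBitmap.bitmapOf(1, 5, 9, 42));
            AndCardIntSet b = AndCardIntSet.of(5, 9, 100);

            // Intersection cardinality; the 64-bit hash filter short-circuits disjoint sets.
            System.out.println(AndCardIntSet.andCardinality(a, b)); // 2 (both contain 5 and 9)

            // Weighted overlap, indexing the weights array by element value.
            float[] weights = new float[101];
            weights[5] = 0.5f;
            weights[9] = 0.25f;
            System.out.println(AndCardIntSet.weightedProduct(weights, a, b)); // 0.75
        }
    }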
@@ -0,0 +1,52 @@
package nu.marginalia.util;

public class BrailleBlockPunchCards {

    public static String printBits(int val, int bits) {
        StringBuilder builder = new StringBuilder();

        for (int b = 0; b < bits; b+=8, val>>>=8) {
            builder.append((char)('\u2800'+bin2brail(val)));
        }

        return builder.toString();
    }

    /* The braille block in unicode U2800 is neat because it contains
     * 8 "bits", but for historical reasons, they're addressed in a bit
     * of an awkward way. Braille used to be a 2x6 grid, but it was extended
     * to 2x8.
     *
     * It's addressed as follows
     *
     *   0 3
     *   1 4
     *   2 5
     *   6 7   <-- extended braille
     *
     * We want to use it as a dot matrix to represent bits. To do that we need
     * to do this transformation:
     *
     *   0 1 2 3 4 5 6 7   native order bits
     *   | | |  \ _\__\/   |
     *   | | | /  \  \ \   |
     *   0 1 2 6 3 4 5 7   braille order bits
     *
     *   01 02 04 08 10 20 40 80
     *   01+02+04       +80      : &0x87
     *   <<       10+20+40       : &0x70, <<1
     *         08 >> >> >>       : &0x08, >>3
     *
     * Or in other words we do
     *   (v & 0x87)
     *   | ((v & 0x70) >> 1)
     *   | ((v & 0x08) << 3)
     *
     * Thanks for coming to my TED talk.
     */

    private static char bin2brail(int v) {
        return (char)((v & 0x87) | ((v & 0x70) >> 1) | ((v & 0x08) << 3));
    }
}
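As a worked example of the transformation described in the comment (the input value is made up): printBits emits one braille cell per 8 bits, least-significant byte first, so 0xFF00 renders as a blank cell followed by a fully dotted one.

    // Hypothetical call; 0xFF00 = low byte 0x00, high byte 0xFF.
    String dots = BrailleBlockPunchCards.printBits(0xFF00, 16);
    System.out.println(dots); // "⠀⣿" — U+2800 (empty cell) then U+28FF (all eight dots set)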
@@ -1,5 +1,7 @@
 package nu.marginalia.util;
 
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -14,13 +16,13 @@ public class ListChunker {
      *
      * @see List#subList
      */
-    public static <T> List<List<T>> chopList(List<T> data, int size) {
+    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
         if (data.isEmpty())
             return Collections.emptyList();
         else if (data.size() < size)
             return List.of(data);
 
-        final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
+        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
 
         for (int i = 0; i < data.size(); i+=size) {
             ret.add(data.subList(i, Math.min(data.size(), i+size)));
@@ -0,0 +1,33 @@
package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;

/*
 * End-of-page mark that's used as a sentinel to verify that
 * the BTreeWriter's caller actually writes as much as they say
 * they want to. (Failing to do so will corrupt the tree)
 *
 */
public class BTreeDogEar {

    private MultimapFileLongSlice sentinelSlice;

    public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
        if (header.numEntries() > 3) {
            sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
            sentinelSlice.put(0, 4L);
            sentinelSlice.put(1, 5L);
            sentinelSlice.put(2, 1L);
        }
    }

    public boolean verify() {
        if (sentinelSlice == null)
            return true;

        return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2);
    }

}
@@ -0,0 +1,146 @@
package nu.marginalia.util.btree;

import java.util.Arrays;

public class BTreeQueryBuffer {
    public final long[] data;
    public int end;

    private int read = 0;
    private int write = 0;

    public BTreeQueryBuffer(int size) {
        this.data = new long[size];
        this.end = size;
    }

    public BTreeQueryBuffer(long [] data, int size) {
        this.data = data;
        this.end = size;
    }

    private BTreeQueryBuffer(long [] data) {
        this.data = data;
        this.end = data.length;
    }

    public BTreeQueryBuffer[] split(int... splitPoints) {
        BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];

        ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
        for (int i = 1; i < splitPoints.length; i++) {
            ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
        }
        ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));

        return ret;
    }

    public void gather(BTreeQueryBuffer... buffers) {
        int start = 0;

        for (var buffer : buffers) {
            System.arraycopy(buffer.data, 0, data, start, buffer.end);
            start += buffer.end;
        }

        this.read = 0;
        this.write = 0;
        this.end = start;
    }

    public long[] copyData() {
        return Arrays.copyOf(data, end);
    }

    public void retainAll() {
        read = write = end;
    }

    public boolean isEmpty() {
        return end == 0;
    }

    public int size() {
        return end;
    }

    public long currentValue() {
        return data[read];
    }

    public boolean rejectAndAdvance() {
        return ++read < end;
    }

    public boolean retainAndAdvance() {
        if (read != write) {
            long tmp = data[write];
            data[write] = data[read];
            data[read] = tmp;
        }

        write++;

        return ++read < end;
    }

    public boolean hasMore() {
        return read < end;
    }

    public void finalizeFiltering() {
        end = write;
        read = 0;
        write = 0;
    }

    public void startFilterForRange(int pos, int end) {
        read = write = pos;
        this.end = end;
    }

    public void reset() {
        end = data.length;
        read = 0;
        write = 0;
    }

    public void zero() {
        end = 0;
        read = 0;
        write = 0;
        Arrays.fill(data, 0);
    }

    public void uniq() {
        if (end <= 1) return;

        long prev = currentValue();
        retainAndAdvance();

        while (hasMore()) {

            long val = currentValue();

            if (prev == val) {
                rejectAndAdvance();
            } else {
                retainAndAdvance();
                prev = val;
            }

        }

        finalizeFiltering();
    }

    public String toString() {
        return getClass().getSimpleName() + "[" +
            "read = " + read +
            ",write = " + write +
            ",end = " + end +
            ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
    }

}
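A minimal sketch of the retain/reject cursor protocol above, with invented values: retained entries are swapped toward the front of the array and finalizeFiltering() shrinks the buffer to just those.

    import nu.marginalia.util.btree.BTreeQueryBuffer;
    import java.util.Arrays;

    class BufferFilterExample {                          // hypothetical, not part of the commit
        public static void main(String[] args) {
            BTreeQueryBuffer buffer = new BTreeQueryBuffer(new long[] {1, 3, 3, 7, 9}, 5);

            buffer.uniq();                               // collapse the duplicate 3

            while (buffer.hasMore()) {
                if (buffer.currentValue() > 2) {
                    buffer.retainAndAdvance();           // keep values above 2, compacted to the front
                } else {
                    buffer.rejectAndAdvance();
                }
            }
            buffer.finalizeFiltering();                  // shrink the buffer to the retained entries

            System.out.println(Arrays.toString(buffer.copyData())); // [3, 7, 9]
        }
    }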
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
@ -14,70 +16,275 @@ public class BTreeReader {
|
||||
|
||||
private final MultimapSearcher indexSearcher;
|
||||
private final MultimapSearcher dataSearcher;
|
||||
private final BTreeHeader header;
|
||||
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
|
||||
this.file = file;
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
this.header = header;
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader(long fileOffset) {
|
||||
public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
|
||||
this.file = file;
|
||||
this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
this.header = createHeader(file, offset);
|
||||
}
|
||||
|
||||
public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
|
||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
public int numEntries() {
|
||||
return header.numEntries();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void retainEntries(BTreeQueryBuffer buffer) {
|
||||
if (header.layers() == 0) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
pointer.retainData(buffer);
|
||||
}
|
||||
retainSingle(buffer);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void rejectEntries(BTreeQueryBuffer buffer) {
|
||||
if (header.layers() == 0) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
pointer.rejectData(buffer);
|
||||
}
|
||||
rejectSingle(buffer);
|
||||
}
|
||||
|
||||
private void retainSingle(BTreeQueryBuffer buffer) {
|
||||
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
for (; buffer.hasMore(); pointer.resetToRoot()) {
|
||||
|
||||
long val = buffer.currentValue() & ctx.equalityMask();
|
||||
|
||||
if (!pointer.walkToData(val)) {
|
||||
buffer.rejectAndAdvance();
|
||||
continue;
|
||||
}
|
||||
|
||||
pointer.retainData(buffer);
|
||||
}
|
||||
}
|
||||
|
||||
private void rejectSingle(BTreeQueryBuffer buffer) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
for (; buffer.hasMore(); pointer.resetToRoot()) {
|
||||
|
||||
long val = buffer.currentValue() & ctx.equalityMask();
|
||||
|
||||
if (pointer.walkToData(val) && pointer.containsData(val)) {
|
||||
buffer.rejectAndAdvance();
|
||||
}
|
||||
else {
|
||||
buffer.retainAndAdvance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @return file offset of entry matching keyRaw, negative if absent
|
||||
*/
|
||||
public long findEntry(BTreeHeader header, final long keyRaw) {
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
public long findEntry(final long keyRaw) {
|
||||
final long key = keyRaw & ctx.equalityMask();
|
||||
final long dataAddress = header.dataOffsetLongs();
|
||||
|
||||
final long searchStart;
|
||||
final long numEntries;
|
||||
BTreePointer ip = new BTreePointer(header);
|
||||
|
||||
if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
|
||||
searchStart = dataAddress;
|
||||
numEntries = header.numEntries();
|
||||
}
|
||||
else {
|
||||
long dataLayerOffset = searchIndex(header, key);
|
||||
if (dataLayerOffset < 0) {
|
||||
return dataLayerOffset;
|
||||
while (!ip.isDataLayer())
|
||||
ip.walkToChild(key);
|
||||
|
||||
return ip.findData(key);
|
||||
}
|
||||
|
||||
searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
|
||||
numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
|
||||
public void readData(long[] data, int n, long pos) {
|
||||
file.read(data, n, header.dataOffsetLongs() + pos);
|
||||
}
|
||||
|
||||
public long[] queryData(long[] urls, int offset) {
|
||||
BTreePointer pointer = new BTreePointer(header);
|
||||
|
||||
long[] ret = new long[urls.length];
|
||||
|
||||
for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
|
||||
if (pointer.walkToData(urls[i])) {
|
||||
long dataAddress = pointer.findData(urls[i]);
|
||||
if (dataAddress >= 0) {
|
||||
ret[i] = file.get(dataAddress + offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/** Find the range of values so that prefixStart <= n < prefixNext */
|
||||
public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
|
||||
long lowerBoundStart = lowerBound(prefixStart);
|
||||
long lowerBoundEnd = lowerBound(prefixNext);
|
||||
|
||||
return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
|
||||
}
|
||||
|
||||
private long lowerBound(long key) {
|
||||
key &= ctx.equalityMask();
|
||||
|
||||
BTreePointer ip = new BTreePointer(header);
|
||||
|
||||
while (!ip.isDataLayer())
|
||||
ip.walkToChild(key);
|
||||
|
||||
return ip.findDataLower(key);
|
||||
}
|
||||
|
||||
private class BTreePointer {
|
||||
private final long[] layerOffsets;
|
||||
|
||||
private int layer;
|
||||
private long offset;
|
||||
private long boundary;
|
||||
|
||||
public String toString() {
|
||||
return getClass().getSimpleName() + "[" +
|
||||
"layer = " + layer + " ," +
|
||||
"offset = " + offset + "]";
|
||||
}
|
||||
|
||||
public BTreePointer(BTreeHeader header) {
|
||||
layer = header.layers() - 1;
|
||||
offset = 0;
|
||||
layerOffsets = header.getRelativeLayerOffsets(ctx);
|
||||
boundary = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
public void resetToRoot() {
|
||||
this.layer = header.layers() - 1;
|
||||
this.offset = 0;
|
||||
this.boundary = Long.MAX_VALUE;
|
||||
}
|
||||
|
||||
public int layer() {
|
||||
return layer;
|
||||
}
|
||||
|
||||
public boolean walkToChild(long key) {
|
||||
final long indexAddress = header.indexOffsetLongs();
|
||||
|
||||
final long indexLayerBlockOffset = layerOffsets[layer] + offset;
|
||||
|
||||
final long searchStart = indexAddress + indexLayerBlockOffset;
|
||||
final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
|
||||
|
||||
if (nextLayerOffset < 0)
|
||||
return false;
|
||||
|
||||
layer --;
|
||||
boundary = file.get(searchStart + offset);
|
||||
offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean walkToData(long key) {
|
||||
while (!isDataLayer()) {
|
||||
if (!walkToChild(key)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean isDataLayer() {
|
||||
return layer < 0;
|
||||
}
|
||||
|
||||
public boolean containsData(long key) {
|
||||
return findData(key) >= 0;
|
||||
}
|
||||
|
||||
public long findData(long key) {
|
||||
if (layer > 0) {
|
||||
throw new IllegalStateException("Looking for data in an index layer");
|
||||
}
|
||||
|
||||
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
|
||||
|
||||
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
private long searchIndex(BTreeHeader header, long key) {
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
final long indexAddress = header.indexOffsetLongs();
|
||||
|
||||
long layerOffset = 0;
|
||||
|
||||
for (int i = header.layers() - 1; i >= 0; --i) {
|
||||
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
|
||||
|
||||
final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
|
||||
if (nextLayerOffset < 0)
|
||||
return nextLayerOffset;
|
||||
|
||||
layerOffset = blockSize * (nextLayerOffset + layerOffset);
|
||||
public long findDataLower(long key) {
|
||||
if (layer > 0) {
|
||||
throw new IllegalStateException("Looking for data in an index layer");
|
||||
}
|
||||
|
||||
return layerOffset;
|
||||
long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
|
||||
|
||||
return dataSearcher.binarySearchLower(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
private long relativePositionInIndex(long key, long start, long n) {
|
||||
return indexSearcher.binarySearchUpper(key, start, n) - start;
|
||||
public void retainData(BTreeQueryBuffer buffer) {
|
||||
|
||||
long dataOffset = findData(buffer.currentValue());
|
||||
if (dataOffset >= 0) {
|
||||
buffer.retainAndAdvance();
|
||||
|
||||
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
|
||||
int numEntries =
|
||||
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
|
||||
|
||||
if (buffer.currentValue() <= boundary) {
|
||||
file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
|
||||
}
|
||||
}
|
||||
else {
|
||||
buffer.rejectAndAdvance();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void rejectData(BTreeQueryBuffer buffer) {
|
||||
|
||||
long dataOffset = findData(buffer.currentValue());
|
||||
if (dataOffset >= 0) {
|
||||
buffer.rejectAndAdvance();
|
||||
|
||||
long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
|
||||
long relOffset = dataOffset - blockBase;
|
||||
|
||||
int numEntries =
|
||||
min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
|
||||
|
||||
if (buffer.currentValue() <= boundary) {
|
||||
file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
|
||||
}
|
||||
}
|
||||
else {
|
||||
buffer.retainAndAdvance();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@@ -3,6 +3,8 @@ package nu.marginalia.util.btree;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
@@ -10,6 +12,7 @@ import java.io.IOException;
 public class BTreeWriter {
     private final BTreeContext ctx;
     private final MultimapFileLongSlice map;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
 
     public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
         this.map = map;
@@ -39,7 +42,16 @@ public class BTreeWriter {
 
         header.write(map, offset);
 
-        writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
+        var slice = map.atOffset(header.dataOffsetLongs());
+
+        BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);
+
+        writeIndexCallback.write(slice);
+
+        if (!dogEar.verify()) {
+            logger.error("Dog ear was not overwritten: {}", header);
+        }
+
         if (header.layers() < 1) { // The data is too small to benefit from indexing
             return ctx.calculateSize(numEntries);
@ -1,136 +0,0 @@
|
||||
package nu.marginalia.util.btree;
|
||||
|
||||
import nu.marginalia.util.btree.model.BTreeContext;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.util.multimap.MultimapSearcher;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class CachingBTreeReader {
|
||||
|
||||
private final MultimapFileLong file;
|
||||
public final BTreeContext ctx;
|
||||
|
||||
private final MultimapSearcher dataSearcher;
|
||||
|
||||
public CachingBTreeReader(MultimapFileLong file, BTreeContext ctx) {
|
||||
this.file = file;
|
||||
this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
|
||||
|
||||
this.ctx = ctx;
|
||||
}
|
||||
|
||||
public BTreeHeader getHeader(long fileOffset) {
|
||||
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
|
||||
}
|
||||
|
||||
public BTreeCachedIndex prepareCache(BTreeHeader header) {
|
||||
return new BTreeCachedIndex(header);
|
||||
}
|
||||
/**
|
||||
*
|
||||
* @return file offset of entry matching keyRaw, negative if absent
|
||||
*/
|
||||
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
|
||||
BTreeHeader header = cache.header;
|
||||
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
|
||||
final long key = keyRaw & ctx.equalityMask();
|
||||
final long dataAddress = header.dataOffsetLongs();
|
||||
|
||||
final long searchStart;
|
||||
final long numEntries;
|
||||
|
||||
if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
|
||||
searchStart = dataAddress;
|
||||
numEntries = header.numEntries();
|
||||
}
|
||||
else {
|
||||
cache.load();
|
||||
|
||||
long dataLayerOffset = searchIndex(header, cache, key);
|
||||
if (dataLayerOffset < 0) {
|
||||
return dataLayerOffset;
|
||||
}
|
||||
|
||||
searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
|
||||
numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
|
||||
}
|
||||
|
||||
return dataSearcher.binarySearch(key, searchStart, numEntries);
|
||||
}
|
||||
|
||||
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
|
||||
final int blockSize = ctx.BLOCK_SIZE_WORDS();
|
||||
long layerOffset = 0;
|
||||
|
||||
for (int i = header.layers() - 1; i >= 0; --i) {
|
||||
final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
|
||||
|
||||
final long nextLayerOffset = cache.relativePositionInIndex(key, (int) indexLayerBlockOffset, blockSize);
|
||||
if (nextLayerOffset < 0)
|
||||
return nextLayerOffset;
|
||||
|
||||
layerOffset = blockSize * (nextLayerOffset + layerOffset);
|
||||
}
|
||||
|
||||
return layerOffset;
|
||||
}
|
||||
|
||||
/** A cache for the BTree index data that will drastically reduce the number of disk reads
|
||||
* for repeated queries against the same tree. The memory consumption is typically very low
|
||||
* and the disk access pattern for reading the entire index relatively cheap.
|
||||
*/
|
||||
public class BTreeCachedIndex {
|
||||
long[] indexData;
|
||||
final BTreeHeader header;
|
||||
|
||||
final int indexedDataSize;
|
||||
|
||||
public BTreeCachedIndex(BTreeHeader header) {
|
||||
this.header = header;
|
||||
indexedDataSize = header.numEntries();
|
||||
}
|
||||
|
||||
public void load() {
|
||||
if (indexData != null)
|
||||
return;
|
||||
|
||||
int size = (int)(header.dataOffsetLongs() - header.indexOffsetLongs());
|
||||
indexData = new long[size];
|
||||
file.read(indexData, header.indexOffsetLongs());
|
||||
}
|
||||
|
||||
long relativePositionInIndex(long key, int fromIndex, int n) {
|
||||
int low = 0;
|
||||
int high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
int mid = (low + high) >>> 1;
|
||||
long midVal = indexData[fromIndex + mid];
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return mid;
|
||||
}
|
||||
return low;
|
||||
}
|
||||
|
||||
public long sizeBytes() {
|
||||
return isLoaded() ? 8L*indexData.length : 0;
|
||||
}
|
||||
|
||||
public int getIndexedDataSize() {
|
||||
return indexedDataSize;
|
||||
}
|
||||
|
||||
public boolean isLoaded() {
|
||||
return indexData != null;
|
||||
}
|
||||
}
|
||||
}
|
@@ -19,7 +19,7 @@ public record BTreeContext(int MAX_LAYERS,
     }
 
     public int numIndexLayers(int numEntries) {
-        if (numEntries <= BLOCK_SIZE_WORDS*2) {
+        if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) {
             return 0;
         }
         for (int i = 1; i < MAX_LAYERS; i++) {
@@ -26,7 +26,6 @@ public class DictionaryData
 
         if (rb == -1) {
             int end = activeBank.getEnd();
-            logger.debug("Switching bank @ {}", end);
             var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
             rb = newBank.add(key);
 
@@ -16,7 +16,7 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
  * Spiritually influenced by GNU Trove's hash maps
  * LGPL 2.1
  */
-public class DictionaryHashMap {
+public class DictionaryHashMap implements DictionaryMap {
     private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
     private static final Gauge probe_count_metrics
             = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
@@ -81,6 +81,7 @@ public class DictionaryHashMap {
         }
     }
 
+    @Override
     public int size() {
         return sz.get();
     }
@@ -97,6 +98,7 @@ public class DictionaryHashMap {
         buffers[buffer].put(bufferIdx, val);
     }
 
+    @Override
     public int put(long key) {
 
         long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
@@ -143,6 +145,7 @@ public class DictionaryHashMap {
         return di;
     }
 
+    @Override
    public int get(long key) {
        final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
        final long cell = hash % hashTableSize;
@@ -0,0 +1,9 @@
package nu.marginalia.util.dict;

public interface DictionaryMap {
    int size();

    int put(long key);

    int get(long key);
}
@@ -72,7 +72,7 @@ public enum UnicodeRanges {
         int count = 0;
         int max = sensitive ? 15 : 100;
 
-        for (int i = 0; i < text.length(); i++) {
+        for (int i = 0; i < Math.min(2000, text.length()); i++) {
             char c = text.charAt(i);
             if (c >= min && c <= max) {
                 if (count++ > max) {
@@ -88,6 +88,9 @@ public class WordPatterns {
     }
 
     public static boolean hasWordQualities(String s) {
+        if (s.isBlank())
+            return false;
+
         int start = 0;
         int end = s.length();
         if (s.charAt(0) == '#') start++;
@@ -95,13 +98,14 @@ public class WordPatterns {
 
         for (int i = start; i < end; i++) {
             char c = s.charAt(i);
-            if (!("_@.'+-".indexOf(c) >= 0)
+            if (("_@.'+-".indexOf(c) < 0)
                     && !(c >= 'a' && c <= 'z')
                     && !(c >= 'A' && c <= 'Z')
                     && !(c >= '0' && c <= '9')
                     && !(c >= '\u00C0' && c <= '\u00D6')
                     && !(c >= '\u00D8' && c <= '\u00f6')
-                    && !(c >= '\u00f8' && c <= '\u00ff')) {
+                    && !(c >= '\u00f8' && c <= '\u00ff'))
+            {
                 return false;
             }
         }
@@ -119,10 +123,14 @@ public class WordPatterns {
         if (!filter(s)) {
             return true;
         }
-        if (topWords.contains(s.toLowerCase())) {
+        if (isTopWord(s)) {
             return true;
         }
         return false;
     }
 
+    public static boolean isTopWord(String s) {
+        return topWords.contains(s.toLowerCase());
+    }
+
 }
@ -2,8 +2,10 @@ package nu.marginalia.util.language.processing;
|
||||
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
@ -20,14 +22,9 @@ public class DocumentKeywordExtractor {
|
||||
private final NameCounter nameCounter;
|
||||
private final SubjectCounter subjectCounter;
|
||||
|
||||
private final TermFrequencyDict dict;
|
||||
private final double docCount;
|
||||
|
||||
@Inject
|
||||
public DocumentKeywordExtractor(TermFrequencyDict dict) {
|
||||
this.dict = dict;
|
||||
docCount = dict.docCount();
|
||||
|
||||
keywordExtractor = new KeywordExtractor();
|
||||
|
||||
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
|
||||
@ -36,69 +33,105 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
|
||||
|
||||
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
|
||||
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
|
||||
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
|
||||
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
|
||||
tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||
|
||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
||||
|
||||
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple);
|
||||
|
||||
return new EdgePageWordSet(
|
||||
createWords(IndexBlock.Subjects, subjects),
|
||||
createWords(IndexBlock.Title, titleWords),
|
||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
||||
createWords(IndexBlock.Tfidf_Top, topKeywords),
|
||||
createWords(IndexBlock.Tfidf_Middle, midKeywords),
|
||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
||||
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
||||
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
|
||||
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
|
||||
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
|
||||
getWordPositions(keywordMetadata, documentLanguageData);
|
||||
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
|
||||
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
|
||||
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
|
||||
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||
|
||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
||||
|
||||
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||
|
||||
var wordSet = new EdgePageWordSet(
|
||||
createWords(IndexBlock.Subjects, subjects),
|
||||
createWords(IndexBlock.Title, titleWords),
|
||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
||||
createWords(IndexBlock.Tfidf_Top, topKeywords),
|
||||
createWords(IndexBlock.Tfidf_Middle, midKeywords),
|
||||
createWords(IndexBlock.Tfidf_Lower, lowKeywords),
|
||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
||||
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||
createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf),
|
||||
createWords(keywordMetadata, IndexBlock.Subjects, subjects),
|
||||
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||
);
|
||||
|
||||
getSimpleWords(wordSet, documentLanguageData,
|
||||
getSimpleWords(keywordMetadata, wordSet, documentLanguageData,
|
||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
|
||||
|
||||
return wordSet;
|
||||
}
|
||||
|
||||
private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
||||
|
||||
public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||
Map<String, Integer> ret = keywordMetadata.positionMask();
|
||||
|
||||
int posCtr = 0;
|
||||
for (var sent : dld.titleSentences) {
|
||||
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||
|
||||
for (var word : sent) {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getNames(sent)) {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
}
|
||||
posCtr+=4;
|
||||
for (var sent : dld.sentences) {
|
||||
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||
|
||||
for (var word : sent) {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getNames(sent)) {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
posCtr++;
|
||||
}
|
||||
}
|
||||
|
||||
private int bitwiseOr(int a, int b) {
|
||||
return a | b;
|
||||
}
|
||||
|
||||
|
||||
private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
||||
|
||||
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
|
||||
|
||||
int start = 0;
|
||||
int lengthGoal = 32;
|
||||
|
||||
for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
|
||||
for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) {
|
||||
IndexBlock block = blocks[blockIdx];
|
||||
Set<String> words = new HashSet<>(lengthGoal+100);
|
||||
Set<EdgePageWords.Entry> words = new HashSet<>(lengthGoal+100);
|
||||
|
||||
int pos;
|
||||
int length = 0;
|
||||
@ -110,55 +143,26 @@ public class DocumentKeywordExtractor {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||
words.add(w);
|
||||
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed())));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (var names : keywordExtractor.getNames(sent)) {
|
||||
var rep = new WordRep(sent, names);
|
||||
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||
|
||||
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed)));
|
||||
}
|
||||
}
|
||||
wordSet.append(block, words);
|
||||
start = pos;
|
||||
lengthGoal+=32;
|
||||
}
|
||||
|
||||
if (start < documentLanguageData.sentences.length) {
|
||||
|
||||
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
|
||||
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
|
||||
var sent = documentLanguageData.sentences[pos];
|
||||
for (var word : sent) {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
|
||||
counts.merge(w, 1, Integer::sum);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> lastSet;
|
||||
if (counts.size() < 1024) {
|
||||
lastSet = counts.keySet();
|
||||
}
|
||||
else {
|
||||
lastSet = counts.entrySet().stream()
|
||||
.sorted(Comparator.comparing(e -> {
|
||||
double N = docCount; // Number of documents in term freq dictionary
|
||||
|
||||
// Caveat: This is actually the *negated* term score, because the second logarithm has
|
||||
// its parameter inverted (log(a^b) = b log(a); here b = -1)
|
||||
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
|
||||
}))
|
||||
.map(Map.Entry::getKey)
|
||||
.limit(1024)
|
||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
||||
}
|
||||
|
||||
wordSet.append(blocks[blocks.length - 1], lastSet);
|
||||
}
|
||||
}
|
||||
|
||||
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
||||
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
||||
private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
||||
Set<String> reps = new HashSet<>();
|
||||
|
||||
for (var sent : documentLanguageData.sentences) {
|
||||
@ -183,7 +187,7 @@ public class DocumentKeywordExtractor {
|
||||
}
|
||||
}
|
||||
}
|
||||
return reps;
|
||||
return new ArrayList<>(reps);
|
||||
}
|
||||
|
||||
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
|
||||
@ -193,7 +197,21 @@ public class DocumentKeywordExtractor {
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
|
||||
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
|
||||
public EdgePageWords createWords(KeywordMetadata metadata,
|
||||
IndexBlock block,
|
||||
Collection<WordRep> words) {
|
||||
|
||||
Set<EdgePageWords.Entry> entries = new HashSet<>(words.size());
|
||||
for (var word : words) {
|
||||
|
||||
String flatWord = AsciiFlattener.flattenUnicode(word.word);
|
||||
if (!WordPatterns.hasWordQualities(flatWord)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed)));
|
||||
}
|
||||
|
||||
return new EdgePageWords(block, entries);
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,19 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
|
||||
public class KeywordCounter {
|
||||
private final KeywordExtractor keywordExtractor;
|
||||
@ -19,72 +23,78 @@ public class KeywordCounter {
|
||||
public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
|
||||
this.dict = dict;
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
this.docCount = (double) dict.docCount();
|
||||
this.docCount = dict.docCount();
|
||||
}
|
||||
|
||||
public WordHistogram countHisto(DocumentLanguageData dld) {
|
||||
HashMap<String, Integer> counts = new HashMap<>(15000);
|
||||
public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||
TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f);
|
||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
|
||||
|
||||
|
||||
for (var sent : dld.sentences) {
|
||||
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
|
||||
for (var span : keywords) {
|
||||
if (span.size() == 1 &&
|
||||
WordPatterns.isStopWord(sent.words[span.start]))
|
||||
|
||||
if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String stemmed = sent.constructStemmedWordFromSpan(span);
|
||||
var rep = new WordRep(sent, span);
|
||||
|
||||
counts.merge(stemmed, 1, Integer::sum);
|
||||
instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
|
||||
counts.adjustOrPutValue(rep.stemmed, 1, 1);
|
||||
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
|
||||
if (instanceSet.size() < 250) {
|
||||
instanceSet.add(rep);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
|
||||
HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
|
||||
List<WordRep> tfIdfHigh = new ArrayList<>();
|
||||
|
||||
Set<WordRep> h5 = new HashSet<>(2500);
|
||||
Set<WordRep> h10 = new HashSet<>(500);
|
||||
Set<WordRep> h15 = new HashSet<>(500);
|
||||
int maxVal = maxValue(counts);
|
||||
|
||||
int doubleWordCount = 0;
|
||||
counts.forEachEntry((key, cnt) -> {
|
||||
int value = getTermValue(key, cnt, maxVal);
|
||||
|
||||
for (var entry : counts.entrySet()) {
|
||||
double value = getTermValue(entry, maxC);
|
||||
tfIdf.put(key, new WordFrequencyData(cnt, value));
|
||||
|
||||
double avgCnt = entry.getValue();
|
||||
String wordStemmed = entry.getKey();
|
||||
|
||||
Set<WordRep> histogram;
|
||||
if (value < -3 && avgCnt>1) histogram = h15;
|
||||
else if (value < -1.75 && avgCnt>1) histogram = h10;
|
||||
else if (value < -1 &&
|
||||
(!wordStemmed.contains("_") || doubleWordCount++ < 50))
|
||||
histogram = h5;
|
||||
else continue;
|
||||
|
||||
histogram.addAll(instances.get(wordStemmed));
|
||||
}
|
||||
return new WordHistogram(h5, h10, h15);
|
||||
if (cnt > 1 && value > 100) {
|
||||
tfIdfHigh.addAll(instances.get(key));
|
||||
}
|
||||
|
||||
private static final Pattern separator = Pattern.compile("_");
|
||||
return true;
|
||||
});
|
||||
|
||||
public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
|
||||
String key = e.getKey();
|
||||
if (key.contains("_")) {
|
||||
String[] parts = separator.split(e.getKey());
|
||||
return tfIdfHigh;
|
||||
}
|
||||
|
||||
private int maxValue(TObjectIntHashMap<?> map) {
|
||||
int maxC = 0;
|
||||
for (int c : map.values()) {
|
||||
maxC = max(c, maxC);
|
||||
}
|
||||
return maxC;
|
||||
}
|
||||
|
||||
public int getTermValue(String key, int count, double maxValue) {
|
||||
if (key.indexOf('_') >= 0) {
|
||||
String[] parts = StringUtils.split(key, '_');
|
||||
double totalValue = 0.;
|
||||
for (String part : parts) {
|
||||
totalValue += value(part, e.getValue(), maxValue);
|
||||
totalValue += value(part, count, maxValue);
|
||||
}
|
||||
return totalValue / parts.length;
|
||||
return normalizeValue(totalValue / parts.length);
|
||||
}
|
||||
else {
|
||||
return value(key, e.getValue(), maxValue);
|
||||
return normalizeValue(value(key, count, maxValue));
|
||||
}
|
||||
}
|
||||
|
||||
int normalizeValue(double v) {
|
||||
return (int)(-v*75);
|
||||
}
|
||||
|
||||
double value(String key, double value, double maxValue) {
|
||||
double freq = dict.getTermFreqStemmed(key);
|
||||
if (freq < 1) {
|
||||
@ -93,5 +103,5 @@ public class KeywordCounter {
|
||||
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
|
||||
}
|
||||
|
||||
public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
|
||||
public record WordFrequencyData(int count, int tfIdfNormalized) { }
|
||||
}
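
To make the scoring above easier to follow, here is a small, self-contained sketch of the same shape of tf-idf weighting. The DOC_COUNT constant, the docFreq map and the class and method names are illustrative stand-ins rather than the project's API; the real counts come from TermFrequencyDict, while the negated-score convention and the *75 normalization mirror the code visible in the diffs above.

import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;

class TfIdfSketch {
    // Hypothetical corpus size; the real value comes from TermFrequencyDict.docCount().
    static final double DOC_COUNT = 1_000_000;

    /** Negated tf-idf, matching the sign convention in the diffs above:
     *  the most relevant terms get the most negative scores, so a plain
     *  ascending sort puts them first. */
    static double negatedTermScore(int countInDocument, long documentsWithTerm) {
        return (1 + Math.log(countInDocument))
                * Math.log((1. + documentsWithTerm) / DOC_COUNT);
    }

    /** Mirror of KeywordCounter.normalizeValue(): flip the sign and scale to an int. */
    static int normalize(double negatedScore) {
        return (int) (-negatedScore * 75);
    }

    /** Keep the 1024 best terms, like the tail-block selection in DocumentKeywordExtractor. */
    static Set<String> selectTop(Map<String, Integer> counts, Map<String, Long> docFreq) {
        return counts.entrySet().stream()
                .sorted(Comparator.comparingDouble((Map.Entry<String, Integer> e) ->
                        negatedTermScore(e.getValue(), docFreq.getOrDefault(e.getKey(), 0L))))
                .map(Map.Entry::getKey)
                .limit(1024)
                .collect(Collectors.toCollection(LinkedHashSet::new));
    }

    public static void main(String[] args) {
        var counts = Map.of("marginalia", 3, "the", 40, "crawler", 5);
        var docFreq = Map.of("marginalia", 120L, "the", 999_000L, "crawler", 8_000L);
        System.out.println(selectTop(counts, docFreq));       // rare terms come first
        System.out.println(normalize(negatedTermScore(3, 120L)));
    }
}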
|
||||
|
@ -1,64 +0,0 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class LongNameCounter {
|
||||
private final KeywordExtractor keywordExtractor;
|
||||
private final TermFrequencyDict dict;
|
||||
private final double docCount;
|
||||
public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
|
||||
this.dict = dict;
|
||||
docCount = (double) dict.docCount();
|
||||
this.keywordExtractor = keywordExtractor;
|
||||
}
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
HashMap<String, Double> counts = new HashMap<>(1000);
|
||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
|
||||
|
||||
for (int i = 0; i < dld.sentences.length; i++) {
|
||||
DocumentSentence sent = dld.sentences[i];
|
||||
var keywords = keywordExtractor.getNamesStrict(sent);
|
||||
for (var span : keywords) {
|
||||
var stemmed = sent.constructStemmedWordFromSpan(span);
|
||||
counts.merge(stemmed, 1., Double::sum);
|
||||
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
|
||||
}
|
||||
}
|
||||
|
||||
return counts.entrySet().stream().filter(e -> termSize(e.getKey()) > 1)
|
||||
.sorted(Comparator.comparing(this::getTermValue))
|
||||
.limit(Math.min(50, counts.size()/3))
|
||||
.map(Map.Entry::getKey)
|
||||
.flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
int termSize(String word) {
|
||||
return 1 + (int) word.chars().filter(c -> c == '_').count();
|
||||
}
|
||||
|
||||
|
||||
final Pattern separator = Pattern.compile("_");
|
||||
|
||||
public double getTermValue(Map.Entry<String, Double> e) {
|
||||
String[] parts = separator.split(e.getKey());
|
||||
double totalValue = 0.;
|
||||
for (String part : parts) {
|
||||
totalValue += value(part, e.getValue());
|
||||
}
|
||||
return totalValue / Math.sqrt(parts.length);
|
||||
}
|
||||
|
||||
double value(String key, double value) {
|
||||
return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -37,7 +37,8 @@ public class NameCounter {
|
||||
.sorted(Comparator.comparing(e -> -e.getValue()))
|
||||
.limit(150)
|
||||
.map(Map.Entry::getKey)
|
||||
.flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
|
||||
.flatMap(w -> instances.get(w).stream())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import lombok.AllArgsConstructor;
|
||||
@ -125,11 +126,45 @@ public class SentenceExtractor {
|
||||
return counts;
|
||||
}
|
||||
|
||||
private static final Pattern dotPattern = Pattern.compile("\\.+$");
|
||||
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
||||
private static final Pattern spacesPattern = Pattern.compile("\\s+");
|
||||
|
||||
private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
||||
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
||||
|
||||
private boolean isBadChar(char c) {
|
||||
if (c >= 'a' && c <= 'z') return false;
|
||||
if (c >= 'A' && c <= 'Z') return false;
|
||||
if (c >= '0' && c <= '9') return false;
|
||||
if ("_#@.".indexOf(c) >= 0) return false;
|
||||
if (c >= '\u00C0' && c <= '\u00D6') return false;
|
||||
if (c >= '\u00D8' && c <= '\u00F6') return false;
|
||||
if (c >= '\u00F8' && c <= '\u00FF') return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
private String sanitizeString(String s) {
|
||||
char[] newChars = new char[s.length()];
|
||||
int pi = 0;
|
||||
|
||||
for (int i = 0; i < newChars.length; i++) {
|
||||
char c = s.charAt(i);
|
||||
if (!isBadChar(c)) {
|
||||
newChars[pi++] = c;
|
||||
}
|
||||
else {
|
||||
newChars[pi++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
s = new String(newChars, 0, pi);
|
||||
|
||||
if (s.startsWith(".")) {
|
||||
s = s.substring(1);
|
||||
if (s.isBlank())
|
||||
return "";
|
||||
}
|
||||
return s;
|
||||
|
||||
}
|
||||
|
||||
public DocumentSentence extractSentence(String text) {
|
||||
var wordsAndSeps = splitSegment(text);
|
||||
@ -139,7 +174,7 @@ public class SentenceExtractor {
|
||||
var lc = toLc(wordsAndSeps.words);
|
||||
|
||||
return new DocumentSentence(
|
||||
badCharPattern.matcher(text).replaceAll(" "), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||
);
|
||||
}
|
||||
|
||||
@ -161,7 +196,7 @@ public class SentenceExtractor {
|
||||
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
sentences = textNormalizedSpaces.split("[.]");
|
||||
sentences = StringUtils.split(textNormalizedSpaces, '.');
|
||||
}
|
||||
|
||||
if (sentences.length > 250) {
|
||||
@ -196,8 +231,8 @@ public class SentenceExtractor {
|
||||
separators[i] = Arrays.copyOf(separators[i], 250);
|
||||
}
|
||||
for (int j = 0; j < tokens[i].length; j++) {
|
||||
if (tokens[i][j].endsWith(".")) {
|
||||
tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
|
||||
while (tokens[i][j].endsWith(".")) {
|
||||
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -216,7 +251,7 @@ public class SentenceExtractor {
|
||||
|
||||
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
ret[i] = new DocumentSentence(badCharPattern.matcher(sentences[i]).replaceAll(" "), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -5,9 +5,7 @@ import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.util.language.processing.model.WordSpan;
|
||||
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class SubjectCounter {
|
||||
@ -27,7 +25,9 @@ public class SubjectCounter {
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
|
||||
Map<WordRep, Integer> counts = new HashMap<>();
|
||||
Map<String, Integer> counts = new HashMap<>();
|
||||
Map<String, Set<WordRep>> instances = new HashMap<>();
|
||||
|
||||
for (var sentence : dld.sentences) {
|
||||
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
|
||||
if (kw.end + 2 >= sentence.length()) {
|
||||
@ -41,7 +41,13 @@ public class SubjectCounter {
|
||||
String nextNextTag = sentence.posTags[kw.end+1];
|
||||
|
||||
if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
|
||||
counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
|
||||
var span = new WordSpan(kw.start, kw.end);
|
||||
var rep = new WordRep(sentence, span);
|
||||
|
||||
String stemmed = rep.stemmed;
|
||||
|
||||
counts.merge(stemmed, -1, Integer::sum);
|
||||
instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -49,8 +55,8 @@ public class SubjectCounter {
|
||||
int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
|
||||
|
||||
return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
|
||||
.filter(e -> e.getValue()<-2 && e.getValue()<best*0.75)
|
||||
.map(Map.Entry::getKey)
|
||||
.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
|
||||
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,58 @@
|
||||
package nu.marginalia.util.language.processing.model;
|
||||
|
||||
import nu.marginalia.util.language.processing.KeywordCounter;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
public record KeywordMetadata(HashSet<String> titleKeywords,
|
||||
HashSet<String> subjectKeywords,
|
||||
HashSet<String> namesKeywords,
|
||||
HashMap<String, KeywordCounter.WordFrequencyData> wordsTfIdf,
|
||||
HashMap<String, Integer> positionMask,
|
||||
EnumSet<EdgePageWordFlags> flagsTemplate,
|
||||
int quality
|
||||
)
|
||||
{
|
||||
|
||||
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
||||
|
||||
public KeywordMetadata(double quality, EnumSet<EdgePageWordFlags> flags) {
|
||||
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
|
||||
new HashMap<>(15_000),
|
||||
new HashMap<>(10_000),
|
||||
flags,
|
||||
(int)(-quality));
|
||||
}
|
||||
|
||||
public KeywordMetadata(double quality) {
|
||||
this(quality, EnumSet.noneOf(EdgePageWordFlags.class));
|
||||
}
|
||||
|
||||
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
|
||||
|
||||
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
|
||||
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
|
||||
|
||||
if (subjectKeywords.contains(stemmed))
|
||||
flags.add(EdgePageWordFlags.Subjects);
|
||||
|
||||
if (namesKeywords.contains(stemmed))
|
||||
flags.add(EdgePageWordFlags.NamesWords);
|
||||
|
||||
if (titleKeywords.contains(stemmed))
|
||||
flags.add(EdgePageWordFlags.Title);
|
||||
|
||||
int positions = positionMask.getOrDefault(stemmed, 0);
|
||||
|
||||
return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode();
|
||||
}
|
||||
|
||||
public int quality() {
|
||||
return -quality;
|
||||
}
|
||||
|
||||
}
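
The encode() call above packs a word's statistics and flags into a single long so the index only needs one stored value per word. The actual field widths live in EdgePageWordMetadata and are not part of this diff; the sketch below only illustrates the general bit-packing idea with made-up widths and flag names.

import java.util.EnumSet;

class WordMetadataPackingSketch {
    enum Flag { TITLE, SUBJECT, NAMES }

    /** Illustrative layout only: low bits for flags, then 8 bits count,
     *  16 bits position mask, 8 bits tf-idf. Not the real EdgePageWordMetadata layout. */
    static long pack(int tfIdf, int positions, int count, EnumSet<Flag> flags) {
        long packed = 0;
        for (Flag f : flags) {
            packed |= 1L << f.ordinal();
        }
        packed |= (long) (count & 0xFF) << 8;
        packed |= (long) (positions & 0xFFFF) << 16;
        packed |= (long) (tfIdf & 0xFF) << 32;
        return packed;
    }

    static boolean hasFlag(long packed, Flag f) {
        return (packed & (1L << f.ordinal())) != 0;
    }

    public static void main(String[] args) {
        long m = pack(42, 0b1011, 3, EnumSet.of(Flag.TITLE));
        System.out.println(Long.toBinaryString(m) + " title=" + hasFlag(m, Flag.TITLE));
    }
}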
|
@ -1,21 +1,22 @@
|
||||
package nu.marginalia.util.language.processing.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
@AllArgsConstructor @EqualsAndHashCode @Getter
|
||||
@AllArgsConstructor @Getter
|
||||
public class WordRep implements Comparable<WordRep> {
|
||||
|
||||
public WordRep(DocumentSentence sent, WordSpan span) {
|
||||
word = sent.constructWordFromSpan(span);
|
||||
stemmed = sent.constructStemmedWordFromSpan(span);
|
||||
length = span.end - span.start;
|
||||
|
||||
hashCode = Objects.hash(word);
|
||||
}
|
||||
|
||||
public final int length;
|
||||
public final String word;
|
||||
public final String stemmed;
|
||||
@ -34,4 +35,12 @@ public class WordRep implements Comparable<WordRep> {
|
||||
public int hashCode() {
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == this) return true;
|
||||
if (other instanceof WordRep wr) {
|
||||
return Objects.equals(wr.word, word);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.util.multimap;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.btree.BTreeQueryBuffer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -100,8 +101,8 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
public MultimapSearcherBase createSearcher() {
|
||||
return new MultimapSearcherBase(this);
|
||||
}
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit);
|
||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) {
|
||||
return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@ -340,6 +341,49 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, int n, long idx) {
|
||||
if (idx+n >= mappedSize) {
|
||||
grow(idx+n);
|
||||
}
|
||||
int iN = (int)((idx + n) / bufferSize);
|
||||
|
||||
for (int i = 0; i < n; ) {
|
||||
int i0 = (int)((idx + i) / bufferSize);
|
||||
|
||||
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||
var buffer = buffers.get(i0);
|
||||
|
||||
final int l;
|
||||
|
||||
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||
|
||||
buffer.put(bufferOffset, vals, vals.position() + i, l);
|
||||
i+=l;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void swapn(int n, long idx1, long idx2) {
|
||||
for (int i = 0; i < n; i++)
|
||||
swap(idx1+i, idx2+i);
|
||||
}
|
||||
|
||||
private void swap(long idx1, long idx2) {
|
||||
LongBuffer buff1 = buffers.get((int)(idx1) / bufferSize);
|
||||
final int o1 = (int) (idx1) % bufferSize;
|
||||
|
||||
LongBuffer buff2 = buffers.get((int)(idx2) / bufferSize);
|
||||
final int o2 = (int) (idx2) % bufferSize;
|
||||
|
||||
long tmp = buff1.get(o1);
|
||||
buff1.put(o1, buff2.get(o2));
|
||||
buff2.put(o2, tmp);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setRange(long idx, int n, long val) {
|
||||
if (n == 0) return;
|
||||
@ -410,6 +454,387 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||
if (fromIndex + n*step >= mappedSize)
|
||||
grow(fromIndex + n*step);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+step*n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid*step;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid*step;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high*step);
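// A negative return value encodes the offset of the last element below the key (taken from 'high' at loop exit), so callers can tell a miss from a hit while still recovering the nearby position.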
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return -1L-(fromIndex + high);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||
if (fromIndex + n >= mappedSize)
|
||||
grow(fromIndex + n);
|
||||
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||
int idx = (int)(fromIndex / bufferSize);
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get(idx).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long off = fromIndex + mid;
|
||||
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
}
|
||||
|
||||
return fromIndex + low;
|
||||
}
|
||||
|
||||
private boolean isSameBuffer(long a, long b) {
|
||||
return a / bufferSize == b/bufferSize;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long quickSortPartition(int wordSize, long low, long high) {
|
||||
if (high >= mappedSize)
|
||||
grow(high + wordSize - 1);
|
||||
|
||||
if (isSameBuffer(low, high + wordSize - 1)) {
|
||||
// Specialization that circumvents the need for expensive calls to
|
||||
// MultimapFileLong.get() in the most common scenario
|
||||
|
||||
return quickSortPartitionSameBuffer(wordSize, low, high);
|
||||
}
|
||||
else {
|
||||
return quickSortPartitionDifferentBuffers(wordSize, low, high);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSort(int wordSize, long start, int n) {
|
||||
if (start + n + wordSize - 1 >= mappedSize)
|
||||
grow(start + n + wordSize - 1);
|
||||
|
||||
if (n == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (isSameBuffer(start, start + (long)n*wordSize-1L)) {
|
||||
final var buffer = buffers.get((int) (start / bufferSize));
|
||||
int off = (int) (start % bufferSize);
|
||||
|
||||
for (int i = 1; i < n; i++) {
|
||||
for (int j = i; j > 0; j--) {
|
||||
int a = off + wordSize*(j-1);
|
||||
int b = off + wordSize*j;
|
||||
|
||||
if (buffer.get(a) > buffer.get(b)) {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(a+w);
|
||||
buffer.put(a+w, buffer.get(b+w));
|
||||
buffer.put(b+w, tmp);
|
||||
}
|
||||
}
|
||||
else break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else for (int i = 1; i < n; i++) {
|
||||
for (int j = i; j > 0; j--) {
|
||||
long a = start + (long)wordSize*(j-1);
|
||||
long b = start + (long)wordSize*j;
|
||||
|
||||
if (get(a) > get(b)) {
|
||||
swap(a, b);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) {
|
||||
|
||||
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
|
||||
long pivot = get(pivotPoint);
|
||||
|
||||
long i = low - wordSize;
|
||||
long j = high + wordSize;
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=wordSize;
|
||||
} while (get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=wordSize;
|
||||
}
|
||||
while (get(j) > pivot);
|
||||
|
||||
if (i >= j) return j;
|
||||
else swapn(wordSize, i, j);
|
||||
}
|
||||
}
|
||||
|
||||
private long quickSortPartitionSameBuffer(int wordSize, long low, long high) {
|
||||
|
||||
final var buffer = buffers.get((int) (low / bufferSize));
|
||||
|
||||
int pivotPoint = (int) ((low + high) / (2L*wordSize)) * wordSize % bufferSize;
|
||||
long pivot = buffer.get(pivotPoint);
|
||||
|
||||
int j = (int) (high) % bufferSize + wordSize;
|
||||
int i = (int) (low) % bufferSize - wordSize;
|
||||
|
||||
long j0 = high + wordSize - j;
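// i and j above are buffer-local offsets; j0 records the global offset of this buffer's first element, so the partition point can be handed back below as j0 + j, a file-global position.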
|
||||
|
||||
for (;;) {
|
||||
do {
|
||||
i+=wordSize;
|
||||
} while (buffer.get(i) < pivot);
|
||||
|
||||
do {
|
||||
j-=wordSize;
|
||||
}
|
||||
while (buffer.get(j) > pivot);
|
||||
|
||||
if (i >= j) return j0 + j;
|
||||
else {
|
||||
for (int w = 0; w < wordSize; w++) {
|
||||
long tmp = buffer.get(i+w);
|
||||
buffer.put(i+w, buffer.get(j+w));
|
||||
buffer.put(j+w, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||
|
||||
final long end = searchStart + stepSize * numEntries;
|
||||
if (end < mappedSize) {
|
||||
grow(end);
|
||||
}
|
||||
|
||||
long bv = buffer.currentValue() & mask;
|
||||
long av = get(searchStart) & mask;
|
||||
long pos = searchStart;
|
||||
|
||||
int bi = (int)(searchStart / bufferSize);
|
||||
int bo = (int)(searchStart % bufferSize);
|
||||
|
||||
LongBuffer data = buffers.get(bi);
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += stepSize;
|
||||
if (pos < end) {
|
||||
bo += stepSize;
|
||||
if (bo >= bufferSize) {
|
||||
data = buffers.get(++bi);
|
||||
bo = 0;
|
||||
}
|
||||
av = data.get(bo) & mask;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||
|
||||
final long end = searchStart + stepSize * numEntries;
|
||||
if (end < mappedSize) {
|
||||
grow(end);
|
||||
}
|
||||
|
||||
long bv = buffer.currentValue() & mask;
|
||||
long av = get(searchStart) & mask;
|
||||
long pos = searchStart;
|
||||
|
||||
int bi = (int)(searchStart / bufferSize);
|
||||
int bo = (int)(searchStart % bufferSize);
|
||||
|
||||
LongBuffer data = buffers.get(bi);
|
||||
|
||||
while (bv <= boundary && buffer.hasMore()) {
|
||||
if (bv < av) {
|
||||
if (!buffer.retainAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
else if (bv == av) {
|
||||
if (!buffer.rejectAndAdvance()) break;
|
||||
bv = buffer.currentValue() & mask;
|
||||
continue;
|
||||
}
|
||||
|
||||
pos += stepSize;
|
||||
if (pos < end) {
|
||||
bo += stepSize;
|
||||
if (bo >= bufferSize) {
|
||||
data = buffers.get(++bi);
|
||||
bo = 0;
|
||||
}
|
||||
av = data.get(bo) & mask;
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
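
The retain() and reject() methods above walk the caller's sorted query buffer and a sorted run of index entries in lock step, keeping or dropping query values depending on whether they occur in the run. Stripped of the buffer-window bookkeeping, the control flow is the classic two-pointer merge sketched below; RetainSketch and its plain arrays are illustrative only, not the BTreeQueryBuffer API.

import java.util.ArrayList;
import java.util.List;

class RetainSketch {
    /** Keep only the query values that also occur in the sorted data run. */
    static List<Long> retain(long[] sortedQuery, long[] sortedData) {
        List<Long> kept = new ArrayList<>();
        int qi = 0, di = 0;
        while (qi < sortedQuery.length && di < sortedData.length) {
            long qv = sortedQuery[qi];
            long dv = sortedData[di];
            if (qv < dv) {
                qi++;              // query value not present in the data: drop it
            } else if (qv == dv) {
                kept.add(qv);      // match: retain and advance the query side
                qi++;
            } else {
                di++;              // step the data cursor forward
            }
        }
        return kept;
    }

    public static void main(String[] args) {
        System.out.println(retain(new long[] {1, 3, 5, 9}, new long[] {2, 3, 4, 5, 8})); // [3, 5]
    }
}

reject() is the same scan with the retain/drop decisions swapped.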
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
@ -424,6 +849,4 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -61,6 +61,17 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
||||
map.write(vals, idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(LongBuffer vals, int n, long idx) {
|
||||
map.write(vals, n,idx+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void swapn(int n, long idx1, long idx2) {
|
||||
map.swapn(n, idx1+off, idx2+off);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
|
||||
throws IOException {
|
||||
@ -75,4 +86,35 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
||||
|
||||
return new MultimapFileLongOffsetSlice(map, this.off + off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||
throw new UnsupportedOperationException();
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public long quickSortPartition(int wordSize, long low, long highInclusive) {
|
||||
return map.quickSortPartition(wordSize, low+off, highInclusive+off);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void insertionSort(int wordSize, long start, int n) {
|
||||
map.insertionSort(wordSize, start+off, n);
|
||||
}
|
||||
}
|
||||
|
@ -25,9 +25,23 @@ public interface MultimapFileLongSlice {
|
||||
|
||||
void write(LongBuffer vals, long idx);
|
||||
|
||||
void write(LongBuffer vals, int n, long idx);
|
||||
|
||||
void swapn(int n, long idx1, long idx2);
|
||||
|
||||
void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
|
||||
|
||||
default MultimapFileLongSlice atOffset(long off) {
|
||||
return new MultimapFileLongOffsetSlice(this, off);
|
||||
}
|
||||
long binarySearchInternal(long key, long fromIndex, int step, long n, long mask);
|
||||
long binarySearchInternal(long key, long fromIndex, long n, long mask);
|
||||
|
||||
long binarySearchInternal(long key, long fromIndex, long n);
|
||||
|
||||
long binarySearchUpperInternal(long key, long fromIndex, long n);
|
||||
|
||||
long quickSortPartition(int wordSize, long low, long highInclusive);
|
||||
|
||||
void insertionSort(int wordSize, long start, int n);
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
public interface MultimapSearcher {
|
||||
long binarySearchUpper(long key, long fromIndex, long n);
|
||||
long binarySearchLower(long key, long fromIndex, long n);
|
||||
long binarySearch(long key, long fromIndex, long n);
|
||||
|
||||
static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
|
||||
@ -25,8 +25,8 @@ class SimpleMultimapSearcher implements MultimapSearcher {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, n);
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return base.binarySearchLower(key, fromIndex, n);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -46,8 +46,8 @@ class MaskedMultimapSearcher implements MultimapSearcher {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, n, mask);
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return base.binarySearchLower(key, fromIndex, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -69,8 +69,8 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher {
|
||||
}
|
||||
|
||||
@Override
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
return base.binarySearchUpper(key, fromIndex, step, n, mask);
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return base.binarySearchLower(key, fromIndex, step, n, mask);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -29,26 +29,12 @@ public class MultimapSearcherBase {
|
||||
return false;
|
||||
}
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return fromIndex + low;
|
||||
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||
return mmf.binarySearchUpperInternal(key, fromIndex, n);
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
|
||||
public long binarySearchLower(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
@ -67,7 +53,7 @@ public class MultimapSearcherBase {
|
||||
}
|
||||
|
||||
|
||||
public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
|
||||
public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
@ -82,62 +68,19 @@ public class MultimapSearcherBase {
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return fromIndex + low;
|
||||
return fromIndex + low*step;
|
||||
}
|
||||
|
||||
public long binarySearch(long key, long fromIndex, long n) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid);
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
}
|
||||
return -1;
|
||||
return mmf.binarySearchInternal(key, fromIndex, n);
|
||||
}
|
||||
|
||||
|
||||
public long binarySearch(long key, long fromIndex, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid;
|
||||
return mmf.binarySearchInternal(key, fromIndex, n, mask);
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
||||
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
|
||||
long low = 0;
|
||||
long high = n - 1;
|
||||
|
||||
while (low <= high) {
|
||||
long mid = (low + high) >>> 1;
|
||||
long midVal = get(fromIndex + mid*step) & mask;
|
||||
|
||||
if (midVal < key)
|
||||
low = mid + 1;
|
||||
else if (midVal > key)
|
||||
high = mid - 1;
|
||||
else
|
||||
return fromIndex + mid*step;
|
||||
}
|
||||
return -1;
|
||||
return mmf.binarySearchInternal(key, fromIndex, step, n, mask);
|
||||
}
|
||||
}
|
||||
|
@ -1,56 +1,85 @@
|
||||
package nu.marginalia.util.multimap;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.LongBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
|
||||
import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
|
||||
|
||||
public class MultimapSorter {
|
||||
private final Path tmpFileDir;
|
||||
private final int internalSortLimit;
|
||||
private final MultimapFileLongSlice multimapFileLong;
|
||||
private final long[] buffer;
|
||||
private final LongBuffer buffer;
|
||||
private final int internalSortLimit;
|
||||
private final int wordSize;
|
||||
|
||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
|
||||
private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class);
|
||||
|
||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) {
|
||||
this.multimapFileLong = multimapFileLong;
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.internalSortLimit = internalSortLimit;
|
||||
buffer = new long[internalSortLimit];
|
||||
this.wordSize = wordSize;
|
||||
buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer();
|
||||
}
|
||||
|
||||
public void sort(long start, int length) throws IOException {
|
||||
if (length <= internalSortLimit) {
|
||||
multimapFileLong.read(buffer, length, start);
|
||||
Arrays.sort(buffer, 0, length);
|
||||
multimapFileLong.write(buffer, length, start);
|
||||
public void sortRange(long start, long end) throws IOException {
|
||||
if (end - start < internalSortLimit) {
|
||||
quickSortLH(start, end - wordSize);
|
||||
}
|
||||
else {
|
||||
externalSort(start, length);
|
||||
mergeSort(start, (int) (end - start));
|
||||
}
|
||||
|
||||
for (long lp = start + wordSize; lp < end; lp += wordSize) {
|
||||
if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) {
|
||||
|
||||
logger.error("Sort contract breached [{}:{} ({}), ws={}, <isl={}, bc={}]",
|
||||
start, end,
|
||||
end - start,
|
||||
wordSize, end - start < internalSortLimit,
|
||||
buffer.capacity());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void mergeSort(long start, int lengthLongs) throws IOException {
|
||||
if (lengthLongs == 1)
|
||||
return;
|
||||
|
||||
private void externalSort(long start, int length) throws IOException {
|
||||
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+length), ".dat");
|
||||
|
||||
if (lengthLongs < buffer.capacity()) {
|
||||
mergeSort(start, lengthLongs, buffer);
|
||||
}
|
||||
else {
|
||||
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+lengthLongs), ".dat");
|
||||
try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
|
||||
var workBuffer =
|
||||
channel.map(FileChannel.MapMode.READ_WRITE, 0, length * WORD_SIZE)
|
||||
channel.map(FileChannel.MapMode.READ_WRITE, 0, wordSize * lengthLongs * WORD_SIZE)
|
||||
.asLongBuffer();
|
||||
|
||||
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(internalSortLimit));
|
||||
mergeSort(start, lengthLongs, workBuffer);
|
||||
}
|
||||
finally {
|
||||
tmpFile.toFile().delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
private void mergeSort(long start, int length, LongBuffer workBuffer) throws IOException {
|
||||
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(buffer.capacity()));
|
||||
|
||||
// Do in-memory sorting up until internalSortLimit first
|
||||
for (int i = 0; i < length; i += width) {
|
||||
sort(start + i, Math.min(width, length-i));
|
||||
quickSort(start + i, Math.min(width, length-i));
|
||||
}
|
||||
|
||||
// Then merge sort on disk for the rest
|
||||
// Then finish with merge sort
|
||||
for (; width < length; width*=2) {
|
||||
|
||||
for (int i = 0; i < length; i += 2*width) {
|
||||
@ -58,30 +87,61 @@ public class MultimapSorter {
|
||||
}
|
||||
|
||||
workBuffer.clear();
|
||||
multimapFileLong.write(workBuffer, start);
|
||||
multimapFileLong.write(workBuffer, length, start);
|
||||
}
|
||||
|
||||
}
|
||||
finally {
|
||||
tmpFile.toFile().delete();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
|
||||
int i = left;
|
||||
int j = right;
|
||||
long idxL = left;
|
||||
long idxR = right;
|
||||
|
||||
for (int k = left; k < end; k++) {
|
||||
final long bufferI = multimapFileLong.get(offset+i);
|
||||
final long bufferJ = multimapFileLong.get(offset+j);
|
||||
for (int putPos = left; putPos < end; putPos+= wordSize) {
|
||||
final long bufferL = multimapFileLong.get(offset+idxL);
|
||||
final long bufferR = multimapFileLong.get(offset+idxR);
|
||||
|
||||
if (i < right && (j >= end || bufferI < bufferJ)) {
|
||||
workBuffer.put(k, bufferI);
|
||||
i++;
|
||||
if (idxL < right && (idxR >= end || bufferL < bufferR)) {
|
||||
workBuffer.put(putPos, bufferL);
|
||||
for (int s = 1; s < wordSize; s++) {
|
||||
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s));
|
||||
}
|
||||
idxL+= wordSize;
|
||||
}
|
||||
else {
|
||||
workBuffer.put(k, bufferJ);
|
||||
j++;
|
||||
workBuffer.put(putPos, bufferR);
|
||||
for (int s = 1; s < wordSize; s++) {
|
||||
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s));
|
||||
}
|
||||
idxR+= wordSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void insertionSort(long start, int n) {
|
||||
multimapFileLong.insertionSort(wordSize, start, n);
|
||||
}
|
||||
|
||||
private void swap(long a, long b) {
|
||||
multimapFileLong.swapn(wordSize, a, b);
|
||||
}
|
||||
|
||||
public void quickSort(long start, long length) {
|
||||
quickSortLH(start, start + length - wordSize);
|
||||
|
||||
}
|
||||
public void quickSortLH(long low, long highInclusive) {
|
||||
|
||||
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
||||
|
||||
if (highInclusive - low < 32) {
|
||||
multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize));
|
||||
}
|
||||
else {
|
||||
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
||||
|
||||
quickSortLH(low, p);
|
||||
quickSortLH(p + wordSize, highInclusive);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -11,27 +11,16 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public class UpdateDomainRanksTool2 {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
|
||||
|
||||
public Set<String> originDomains = new HashSet<>();
|
||||
public Set<Integer> originDomainIds = new HashSet<>();
|
||||
public final long domainIdMax = -1;
|
||||
public int domainCount;
|
||||
private volatile static int rankMax;
|
||||
|
||||
public int maxId() {
|
||||
return (int) domainIdMax;
|
||||
}
|
||||
public int domainCount() {
|
||||
return domainCount;
|
||||
}
|
||||
|
||||
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
volatile static boolean running = true;
|
||||
|
||||
@ -44,23 +33,14 @@ public class UpdateDomainRanksTool2 {
|
||||
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
|
||||
|
||||
logger.info("Ranking");
|
||||
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
|
||||
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
|
||||
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");
|
||||
|
||||
var rankVector = rpr.pageRankVector();
|
||||
var norm = rankVector.norm();
|
||||
rankMax = rpr.size();
|
||||
uploader.start();
|
||||
|
||||
|
||||
rankMax = rpr.size();
|
||||
|
||||
|
||||
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
|
@ -0,0 +1,298 @@
|
||||
package nu.marginalia.util.tool;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.AndCardIntSet;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
import static nu.marginalia.util.AndCardIntSet.*;
|
||||
|
||||
public class EdgeDomainLinkConsineSimilarityMain {
|
||||
ArrayList<Integer> idsList = new ArrayList<>(100_000);
|
||||
ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000);
|
||||
TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000);
|
||||
TIntIntHashMap aliasMap = new TIntIntHashMap(100_000, 0.75f, -1, -1);
|
||||
TIntHashSet indexed = new TIntHashSet(100_000);
|
||||
|
||||
float weights[];
|
||||
|
||||
private HikariDataSource dataSource;
|
||||
|
||||
public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000);
|
||||
try (
|
||||
var conn = dataSource.getConnection();
|
||||
var aliasStmt = conn.prepareStatement("SELECT ID, DOMAIN_ALIAS FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NOT NULL");
|
||||
var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0");
|
||||
var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||
ResultSet rsp;
|
||||
|
||||
aliasStmt.setFetchSize(10_000);
|
||||
rsp = aliasStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
aliasMap.put(rsp.getInt(1), rsp.getInt(2));
|
||||
}
|
||||
|
||||
indexedStmt.setFetchSize(10_000);
|
||||
rsp = indexedStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
indexed.add(rsp.getInt(1));
|
||||
}
|
||||
|
||||
|
||||
linksStmt.setFetchSize(10_000);
|
||||
rsp = linksStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int source = deAlias(rsp.getInt(1));
|
||||
int dest = deAlias(rsp.getInt(2));
|
||||
|
||||
tmpMap.computeIfAbsent(dest, this::createBitmapWithSelf).add(source);
|
||||
}
|
||||
}
|
||||
|
||||
tmpMap.entrySet().stream()
|
||||
.filter(e -> isEligible(e.getValue()))
|
||||
.forEach(e -> {
|
||||
var val = of(e.getValue());
|
||||
idsList.add(e.getKey());
|
||||
itemsList.add(val);
|
||||
dToSMap.put(e.getKey(), val);
|
||||
});
|
||||
weights = new float[1 + idsList.stream().mapToInt(i -> i).max().orElse(0)];
|
||||
for (int i = 0; i < idsList.size(); i++) {
|
||||
weights[idsList.get(i)] = getWeight(idsList.get(i));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isEligible(RoaringBitmap value) {
|
||||
int cardinality = value.getCardinality();
|
||||
|
||||
return cardinality < 10000;
|
||||
}
|
||||
|
||||
private int deAlias(int id) {
|
||||
int val = aliasMap.get(id);
|
||||
if (val < 0)
|
||||
return id;
|
||||
return val;
|
||||
}
|
||||
|
||||
LinkedBlockingDeque<DomainSimilarities> similaritiesLinkedBlockingDeque = new LinkedBlockingDeque<>(10);
|
||||
volatile boolean running;
|
||||
|
||||
@SneakyThrows
|
||||
public void tryDomains(String... domainName) {
|
||||
var dataStoreDao = new EdgeDataStoreDaoImpl(dataSource);
|
||||
|
||||
System.out.println(Arrays.toString(domainName));
|
||||
|
||||
int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new)
|
||||
.map(dataStoreDao::getDomainId)
|
||||
.mapToInt(EdgeId::id)
|
||||
.map(this::deAlias)
|
||||
.toArray();
|
||||
|
||||
for (int domainId : domainIds) {
|
||||
findAdjacentDtoS(domainId, similarities -> {
|
||||
for (var similarity : similarities.similarities()) {
|
||||
if (indexed.contains(similarity.domainId)) System.out.print("*");
|
||||
System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
private String prettyPercent(double val) {
|
||||
return String.format("%2.2f%%", 100. * val);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadAll() {
|
||||
running = true;
|
||||
var thread = new Thread(this::insertThreadRun);
|
||||
thread.start();
|
||||
idsList.parallelStream()
|
||||
.filter(id -> !aliasMap.containsKey(id))
|
||||
.forEach(id -> findAdjacent(id, this::addToQueue));
|
||||
running = false;
|
||||
thread.join();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
void addToQueue(DomainSimilarities similarities) {
|
||||
similaritiesLinkedBlockingDeque.putLast(similarities);
|
||||
}
|
||||
|
||||
public void insertThreadRun() {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement(
|
||||
"""
|
||||
INSERT INTO EC_DOMAIN_NEIGHBORS_2
|
||||
(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
|
||||
VALUES (?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS))
|
||||
""")
|
||||
) {
|
||||
while (running || !similaritiesLinkedBlockingDeque.isEmpty()) {
|
||||
var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS);
|
||||
if (item == null) continue;
|
||||
|
||||
for (var similarity : item.similarities) {
|
||||
stmt.setInt(1, item.domainId);
|
||||
stmt.setInt(2, similarity.domainId);
|
||||
stmt.setDouble(3, similarity.value);
|
||||
stmt.addBatch();
|
||||
}
|
||||
stmt.executeBatch();
|
||||
}
|
||||
} catch (SQLException | InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public RoaringBitmap createBitmapWithSelf(int val) {
|
||||
var bm = new RoaringBitmap();
|
||||
bm.add(val);
|
||||
return bm;
|
||||
}
|
||||
|
||||
public void findAdjacent(int domainId, Consumer<DomainSimilarities> andThen) {
|
||||
findAdjacentDtoS(domainId, andThen);
|
||||
}
|
||||
|
||||
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
double andCardinality = andCardinality(a, b);
andCardinality /= Math.sqrt(a.getCardinality());
andCardinality /= Math.sqrt(b.getCardinality());
return andCardinality;
}

double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
}
|
||||
|
||||
float getWeight(int i) {
|
||||
var vector = dToSMap.get(i);
|
||||
|
||||
if (vector == null) return 1.0f;
|
||||
return 1.0f / (float) Math.log(2+vector.getCardinality());
|
||||
}
|
||||
|
||||
record DomainSimilarities(int domainId, List<DomainSimilarity> similarities) {};
|
||||
record DomainSimilarity(int domainId, double value) {};
|
||||
|
||||
@SneakyThrows
|
||||
private void findAdjacentDtoS(int domainId, Consumer<DomainSimilarities> andThen) {
|
||||
var vector = dToSMap.get(domainId);
|
||||
if (vector == null || !vector.cardinalityExceeds(10)) {
|
||||
return;
|
||||
}
|
||||
|
||||
System.out.println("DtoS " + domainId);
|
||||
|
||||
List<DomainSimilarity> similarities = new ArrayList<>(1000);
|
||||
|
||||
/** The minimum cardinality a vector can have so that
 *
 *   a (x) b
 *   -------  < k     is given by k^2
 *   |a||b|
 *
 */
int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality()));
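// In other words (with k = 0.1, the similarity cutoff used further down): the intersection
// a (x) b is at most |b|^2, so the similarity is bounded by |b|/|a|, and reaching the cutoff
// requires card(b) >= k^2 * card(a) = 0.01 * card(a); smaller candidate vectors are skipped.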
|
||||
|
||||
for (int i = 0; i < itemsList.size(); i++) {
|
||||
|
||||
int id = idsList.get(i);
|
||||
if (id == domainId)
|
||||
continue;
|
||||
|
||||
var otherVec = itemsList.get(i);
|
||||
if (otherVec.getCardinality() < cardMin)
|
||||
continue;
|
||||
|
||||
double similarity = cosineSimilarity(vector, otherVec);
|
||||
if (similarity > 0.1) {
|
||||
var recalculated = expensiveCosineSimilarity(vector, otherVec);
|
||||
if (recalculated > 0.1) {
|
||||
similarities.add(new DomainSimilarity(id, recalculated));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (similarities.size() > 128) {
|
||||
similarities.sort(Comparator.comparing(DomainSimilarity::value));
|
||||
similarities.subList(0, similarities.size() - 128).clear();
|
||||
}
|
||||
|
||||
|
||||
andThen.accept(new DomainSimilarities(domainId, similarities));
|
||||
}
|
||||
|
||||
|
||||
// @SneakyThrows
|
||||
// private void findAdjacentDtoS(Consumer<DomainSimilarities> andThen, int... domainIds) {
|
||||
// var vectors = Arrays.stream(domainIds).mapToObj(dToSMap::get)
|
||||
// .filter(Objects::nonNull)
|
||||
// .filter(vec -> vec.cardinalityExceeds(10))
|
||||
// .toArray(AndCardIntSet[]::new);
|
||||
// Set<Integer> domainIdsSet = new HashSet<>(Arrays.stream(domainIds).boxed().toList());
|
||||
//
|
||||
// if (vectors.length != domainIds.length)
|
||||
// return;
|
||||
//
|
||||
// List<DomainSimilarity> similarities = dToSMap.entrySet().parallelStream()
|
||||
// .filter(e -> !domainIdsSet.contains(e.getKey()) && indexed.contains(e.getKey()))
|
||||
// .flatMap(entry -> {
|
||||
//
|
||||
// double similarity = 0.;
|
||||
// for (var vector : vectors) {
|
||||
// similarity += cosineSimilarity(vector, entry.getValue());
|
||||
// }
|
||||
//
|
||||
// if (similarity > 0.1 * vectors.length) {
|
||||
// double recalculated = 0;
|
||||
// for (var vector : vectors) {
|
||||
// recalculated += expensiveCosineSimilarity(vector, entry.getValue());
|
||||
// }
|
||||
// if (recalculated > 0.1 * vectors.length) {
|
||||
// return Stream.of(new DomainSimilarity(entry.getKey(), recalculated));
|
||||
// }
|
||||
// }
|
||||
// return Stream.empty();
|
||||
// }).sorted(Comparator.comparing(DomainSimilarity::value))
|
||||
// .toList();
|
||||
//
|
||||
// andThen.accept(new DomainSimilarities(domainIds[0], similarities));
|
||||
// }
|
||||
|
||||
|
||||
    public static void main(String[] args) throws SQLException {
        DatabaseModule dm = new DatabaseModule();

        var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection());
        if (args.length == 0) {
            main.loadAll();
        }
        else {
            main.tryDomains(args);
        }
    }

}
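The similarity pass above is a prefilter-then-confirm scheme: a cheap unweighted cosine estimate gates the more expensive weighted recomputation. Below is a minimal stand-alone sketch of that flow using plain java.util sets; the class and method names are illustrative, and the weight function stands in for getWeight()'s 1/log(2 + cardinality) weighting, so this is not the project's AndCardIntSet API.

import java.util.Set;
import java.util.function.IntToDoubleFunction;

class SimilaritySketch {
    // Cheap, unweighted cosine similarity over integer sets.
    static double cosine(Set<Integer> a, Set<Integer> b) {
        long shared = a.stream().filter(b::contains).count();
        return shared / Math.sqrt((double) a.size() * b.size());
    }

    // Weighted variant in the spirit of expensiveCosineSimilarity(): rarer items weigh more.
    static double weightedCosine(Set<Integer> a, Set<Integer> b, IntToDoubleFunction w) {
        double shared = a.stream().filter(b::contains).mapToDouble(w::applyAsDouble).sum();
        double na = a.stream().mapToDouble(w::applyAsDouble).sum();
        double nb = b.stream().mapToDouble(w::applyAsDouble).sum();
        return shared / Math.sqrt(na * nb);
    }

    // Only pay for the weighted computation when the cheap estimate clears the bar,
    // mirroring the similarity > 0.1 / recalculated > 0.1 checks in findAdjacentDtoS().
    static boolean similar(Set<Integer> a, Set<Integer> b, IntToDoubleFunction w) {
        return cosine(a, b) > 0.1 && weightedCosine(a, b, w) > 0.1;
    }
}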
|
@ -2,8 +2,14 @@ package nu.marginalia.wmsa.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
public class ApiSearchResult {
|
||||
public String url;
|
||||
@ -11,10 +17,30 @@ public class ApiSearchResult {
|
||||
public String description;
|
||||
public double quality;
|
||||
|
||||
public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();
|
||||
|
||||
public ApiSearchResult(EdgeUrlDetails url) {
|
||||
this.url = url.url.toString();
|
||||
this.title = url.getTitle();
|
||||
this.description = url.getDescription();
|
||||
this.quality = url.getTermScore();
|
||||
|
||||
if (url.resultItem != null) {
|
||||
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));
|
||||
|
||||
outer:
|
||||
for (var entries : bySet.values()) {
|
||||
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
|
||||
for (var entry : entries) {
|
||||
var metadata = entry.metadata();
|
||||
if (metadata.isEmpty())
|
||||
continue outer;
|
||||
|
||||
Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
|
||||
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(),metadata.count(), flags));
|
||||
}
|
||||
details.add(lst);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,16 @@
|
||||
package nu.marginalia.wmsa.api.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
public class ApiSearchResultQueryDetails {
|
||||
|
||||
String keyword;
|
||||
int tfIdf;
|
||||
int count;
|
||||
|
||||
Set<String> flagsUnstableAPI;
|
||||
}
|
@ -5,6 +5,7 @@ import nu.marginalia.wmsa.auth.AuthMain;
|
||||
import nu.marginalia.wmsa.configuration.command.*;
|
||||
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
|
||||
import nu.marginalia.wmsa.edge.dating.DatingMain;
|
||||
import nu.marginalia.wmsa.edge.explorer.ExplorerMain;
|
||||
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
|
||||
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
|
||||
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
|
||||
@ -37,6 +38,7 @@ public enum ServiceDescriptor {
|
||||
ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),
|
||||
|
||||
DATING("dating", 5070, DatingMain.class),
|
||||
EXPLORER("explorer", 5071, ExplorerMain.class),
|
||||
|
||||
TEST_1("test-1", 0, null),
|
||||
TEST_2("test-2", 0, null);
|
||||
@ -77,7 +79,8 @@ public enum ServiceDescriptor {
|
||||
|
||||
public static void main(String... args) {
|
||||
MainMapLookup.setMainArguments(args);
|
||||
Map<String, Command> functions = Stream.of(new ListCommand(),
|
||||
Map<String, Command> functions = Stream.of(
|
||||
new ListCommand(),
|
||||
new StartCommand(),
|
||||
new ConvertCommand(),
|
||||
new CrawlCommand(),
|
||||
|
@ -12,6 +12,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
@ -85,6 +86,12 @@ public class ScreenshotService {
|
||||
}
|
||||
|
||||
private Object serveSvgPlaceholder(Response response, int id) {
|
||||
|
||||
var domainName = edgeDataStoreDao.getDomain(new EdgeId<>(id)).map(Object::toString);
|
||||
if (domainName.isEmpty()) {
|
||||
Spark.halt(404);
|
||||
}
|
||||
|
||||
response.type("image/svg+xml");
|
||||
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
|
||||
"<svg\n" +
|
||||
@ -111,6 +118,6 @@ public class ScreenshotService {
|
||||
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
|
||||
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
|
||||
" </g>\n" +
|
||||
"</svg>\n", edgeDataStoreDao.getDomain(new EdgeId<>(id)));
|
||||
"</svg>\n", domainName.get());
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import com.github.luben.zstd.ZstdOutputStream;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.PrintWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
|
||||
public class ConversionLog implements AutoCloseable, Interpreter {
|
||||
|
||||
|
||||
|
||||
private final PrintWriter writer;
|
||||
|
||||
public ConversionLog(Path rootDir) throws IOException {
|
||||
String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC));
|
||||
Path logFile = rootDir.resolve(fileName);
|
||||
|
||||
writer = new PrintWriter(new ZstdOutputStream(
|
||||
new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE))));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadUrl(EdgeUrl[] url) {}
|
||||
|
||||
@Override
|
||||
public void loadDomain(EdgeDomain[] domain) {}
|
||||
|
||||
@Override
|
||||
public void loadRssFeed(EdgeUrl[] rssFeed) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainLink(DomainLink[] links) {}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
|
||||
|
||||
@Override
|
||||
public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
|
||||
writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainRedirect(DomainLink link) {}
|
||||
}
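As the ConverterMain and LoadInstructionWriter changes further down wire it up, the log is opened once per conversion run and every compiled instruction is applied to it; since all the other Interpreter methods above are no-ops, only documents that failed conversion produce output. A hedged usage sketch (the directory and instruction list are placeholders):

void logRun(Path processDir, List<Instruction> instructions) throws Exception {
    try (ConversionLog log = new ConversionLog(processDir)) {
        for (Instruction instruction : instructions) {
            instruction.apply(log);  // only loadProcessedDocumentWithError() writes a line
        }
    }
}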
|
@ -54,5 +54,4 @@ public class ConvertedDomainReader {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -5,9 +5,9 @@ import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.util.ParallelPipe;
|
||||
import nu.marginalia.wmsa.edge.converting.compiler.InstructionsCompiler;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
|
||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
@ -47,11 +47,15 @@ public class ConverterMain {
|
||||
Gson gson
|
||||
) throws Exception {
|
||||
|
||||
instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);
|
||||
;
|
||||
|
||||
|
||||
|
||||
logger.info("Starting pipe");
|
||||
|
||||
try (WorkLog processLog = plan.createProcessWorkLog()) {
|
||||
try (WorkLog processLog = plan.createProcessWorkLog();
|
||||
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
||||
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
||||
|
||||
@Override
|
||||
|
@ -24,10 +24,13 @@ import java.util.List;
|
||||
|
||||
public class LoadInstructionWriter {
|
||||
|
||||
private ConversionLog log;
|
||||
private final Path outputDir;
|
||||
private final Gson gson;
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
|
||||
public LoadInstructionWriter(Path outputDir, Gson gson) {
|
||||
|
||||
public LoadInstructionWriter(ConversionLog log, Path outputDir, Gson gson) {
|
||||
this.log = log;
|
||||
this.outputDir = outputDir;
|
||||
this.gson = gson;
|
||||
|
||||
@ -35,6 +38,7 @@ public class LoadInstructionWriter {
|
||||
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
||||
}
|
||||
}
|
||||
|
||||
public String accept(String id, List<Instruction> instructionList) throws IOException {
|
||||
Path outputFile = getOutputFile(id);
|
||||
|
||||
@ -48,6 +52,8 @@ public class LoadInstructionWriter {
|
||||
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
|
||||
|
||||
for (var instr : instructionList) {
|
||||
instr.apply(log);
|
||||
|
||||
outputStream.append(instr.tag().name());
|
||||
outputStream.append(' ');
|
||||
gson.toJson(instr, outputStream);
|
||||
@ -66,6 +72,7 @@ public class LoadInstructionWriter {
|
||||
if (!Files.exists(destDir)) {
|
||||
Files.createDirectories(destDir);
|
||||
}
|
||||
|
||||
return destDir.resolve(id + ".pzstd");
|
||||
}
|
||||
|
||||
|
@ -70,7 +70,11 @@ public class ReindexTriggerMain {
|
||||
};
|
||||
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
|
||||
|
||||
if (!Boolean.getBoolean("no-preconvert")) {
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
|
||||
}
|
||||
|
||||
for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
|
||||
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
|
||||
}
|
||||
|
@ -0,0 +1,58 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class DocumentsCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
for (var doc : documents) {
|
||||
compileDocumentDetails(ret, doc);
|
||||
}
|
||||
|
||||
for (var doc : documents) {
|
||||
compileWords(ret, doc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var details = doc.details;
|
||||
|
||||
if (details != null) {
|
||||
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
|
||||
}
|
||||
else {
|
||||
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason));
|
||||
}
|
||||
}
|
||||
|
||||
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var words = doc.words;
|
||||
|
||||
if (words != null) {
|
||||
|
||||
var wordsArray = words.values().stream()
|
||||
.filter(this::filterNonTransients)
|
||||
.map(DocumentKeywords::new)
|
||||
.toArray(DocumentKeywords[]::new);
|
||||
|
||||
ret.add(new LoadKeywords(doc.url, wordsArray));
|
||||
}
|
||||
}
|
||||
|
||||
private boolean filterNonTransients(EdgePageWords words) {
|
||||
return words.block.type != IndexBlockType.TRANSIENT;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,23 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadRssFeed;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class FeedsCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.feedLinks.stream())
|
||||
.distinct()
|
||||
.toArray(EdgeUrl[]::new);
|
||||
|
||||
ret.add(new LoadRssFeed(feeds));
|
||||
}
|
||||
}
|
@ -0,0 +1,57 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class InstructionsCompiler {
|
||||
private final UrlsCompiler urlsCompiler;
|
||||
private final DocumentsCompiler documentsCompiler;
|
||||
private final FeedsCompiler feedsCompiler;
|
||||
private final LinksCompiler linksCompiler;
|
||||
private final RedirectCompiler redirectCompiler;
|
||||
|
||||
@Inject
|
||||
public InstructionsCompiler(UrlsCompiler urlsCompiler,
|
||||
DocumentsCompiler documentsCompiler,
|
||||
FeedsCompiler feedsCompiler,
|
||||
LinksCompiler linksCompiler,
|
||||
RedirectCompiler redirectCompiler)
|
||||
{
|
||||
this.urlsCompiler = urlsCompiler;
|
||||
this.documentsCompiler = documentsCompiler;
|
||||
this.feedsCompiler = feedsCompiler;
|
||||
this.linksCompiler = linksCompiler;
|
||||
this.redirectCompiler = redirectCompiler;
|
||||
}
|
||||
|
||||
public List<Instruction> compile(ProcessedDomain domain) {
|
||||
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||
|
||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||
|
||||
if (domain.documents != null) {
|
||||
urlsCompiler.compile(ret, domain.documents);
|
||||
documentsCompiler.compile(ret, domain.documents);
|
||||
|
||||
feedsCompiler.compile(ret, domain.documents);
|
||||
|
||||
linksCompiler.compile(ret, domain.domain, domain.documents);
|
||||
}
|
||||
if (domain.redirect != null) {
|
||||
redirectCompiler.compile(ret, domain.domain, domain.redirect);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
}
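Seen together with ConverterMain and LoadInstructionWriter elsewhere in this change, the compiler's output is simply replayed against any Interpreter. A rough sketch of that hand-off (the id argument and the injected instances are placeholders, not values from this changeset):

String writeDomain(InstructionsCompiler compiler, LoadInstructionWriter writer,
                   ProcessedDomain domain) throws IOException {
    List<Instruction> instructions = compiler.compile(domain);
    return writer.accept(domain.domain.toString(), instructions);  // serializes the instructions and applies each to the conversion log
}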
|
@ -0,0 +1,26 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
public class LinksCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
|
||||
|
||||
DomainLink[] links = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.linksExternal.stream())
|
||||
.map(link -> link.domain)
|
||||
.distinct()
|
||||
.map(domain -> new DomainLink(from, domain))
|
||||
.toArray(DomainLink[]::new);
|
||||
|
||||
ret.add(new LoadDomainLink(links));
|
||||
}
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainRedirect;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class RedirectCompiler {
|
||||
|
||||
public void compile(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
|
||||
ret.add(new LoadDomain(to));
|
||||
ret.add(new LoadDomainLink(new DomainLink(from, to)));
|
||||
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
|
||||
}
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadUrl;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class UrlsCompiler {
|
||||
|
||||
private static final int MAX_INTERNAL_LINKS = 25;
|
||||
|
||||
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||
|
||||
for (var doc : documents) {
|
||||
seenUrls.add(doc.url);
|
||||
|
||||
if (doc.details != null) {
|
||||
|
||||
for (var url : doc.details.linksExternal) {
|
||||
if (seenDomains.add(url.domain)) {
|
||||
seenUrls.add(url);
|
||||
}
|
||||
}
|
||||
|
||||
if (doc.isOk()) {
|
||||
// Don't load more than a few from linksInternal; it grows too big for no reason
|
||||
var linksToAdd = new ArrayList<>(doc.details.linksInternal);
|
||||
if (linksToAdd.size() > MAX_INTERNAL_LINKS) {
|
||||
linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear();
|
||||
}
|
||||
seenUrls.addAll(linksToAdd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
|
||||
}
|
||||
|
||||
}
|
@ -1,17 +1,47 @@
|
||||
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public record DocumentKeywords(IndexBlock block, String... keywords) {
|
||||
public record DocumentKeywords(IndexBlock block,
|
||||
String[] keywords,
|
||||
long[] metadata) {
|
||||
|
||||
public DocumentKeywords(EdgePageWords words) {
|
||||
this(words.block, words.words.toArray(String[]::new));
|
||||
this(words.block,
|
||||
words.words.toArray(String[]::new),
|
||||
words.metadata.toArray());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return getClass().getSimpleName()+"["+block +", "+Arrays.toString(keywords)+"]";
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append(getClass().getSimpleName());
|
||||
sb.append('[').append(block).append(", ");
|
||||
for (int i = 0; i < keywords.length; i++) {
|
||||
sb.append("\n\t ");
|
||||
if (metadata[i] != 0) {
|
||||
sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i]));
|
||||
}
|
||||
else {
|
||||
sb.append(keywords[i]);
|
||||
}
|
||||
}
|
||||
return sb.append("\n]").toString();
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return keywords.length == 0;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return keywords.length;
|
||||
}
|
||||
|
||||
public DocumentKeywords subList(int start, int end) {
|
||||
return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
|
||||
}
|
||||
}
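DocumentKeywords now carries keywords and their packed metadata as two parallel arrays that are expected to be the same length; subList() slices both in lockstep. A small illustration with made-up values (IndexBlock.Title is just an example constant):

DocumentKeywords kw = new DocumentKeywords(IndexBlock.Title,
        new String[] { "marginalia", "search" },
        new long[]   { 0L, 0L });

assert kw.keywords().length == kw.metadata().length;

DocumentKeywords head = kw.subList(0, 1);  // keeps "marginalia" and its metadata entry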
|
||||
|
@ -8,7 +8,8 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||
|
||||
|
||||
public record LoadProcessedDocumentWithError(EdgeUrl url,
|
||||
EdgeUrlState state) implements Instruction
|
||||
EdgeUrlState state,
|
||||
String reason) implements Instruction
|
||||
{
|
||||
@Override
|
||||
public void apply(Interpreter interpreter) {
|
||||
|
@ -25,34 +25,13 @@ public class SqlLoadUrls {
|
||||
@Inject
|
||||
public SqlLoadUrls(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
try (var stmt = conn.createStatement()) {
|
||||
stmt.execute("DROP PROCEDURE IF EXISTS INSERT_URL");
|
||||
stmt.execute("""
|
||||
CREATE PROCEDURE INSERT_URL (
|
||||
IN PROTO VARCHAR(255),
|
||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
||||
IN PORT INT,
|
||||
IN PATH VARCHAR(255),
|
||||
IN PARAM VARCHAR(255),
|
||||
IN PATH_HASH BIGINT
|
||||
)
|
||||
BEGIN
|
||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
||||
END
|
||||
""");
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException("Failed to set up loader", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||
Set<EdgeDomain> affectedDomains = new HashSet<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
|
||||
var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
)
|
||||
{
|
||||
@ -67,7 +46,7 @@ public class SqlLoadUrls {
|
||||
affectedDomains.add(url.domain);
|
||||
|
||||
insertCall.setString(1, url.proto);
|
||||
insertCall.setString(2, url.domain.toString());
|
||||
insertCall.setInt(2, data.getDomainId(url.domain));
|
||||
if (url.port != null) {
|
||||
insertCall.setInt(3, url.port);
|
||||
}
|
||||
@ -79,7 +58,7 @@ public class SqlLoadUrls {
|
||||
insertCall.setLong(6, hashPath(url.path, url.param));
|
||||
insertCall.addBatch();
|
||||
|
||||
if (cnt++ == 250) {
|
||||
if (cnt++ == 1000) {
|
||||
var ret = insertCall.executeBatch();
|
||||
conn.commit();
|
||||
|
||||
|
@ -1,11 +1,18 @@
|
||||
package nu.marginalia.wmsa.edge.converting.model;
|
||||
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||
|
||||
public class DisqualifiedException extends Exception {
|
||||
public final DisqualificationReason reason;
|
||||
|
||||
public DisqualifiedException(DisqualificationReason reason) {
|
||||
this.reason = reason;
|
||||
}
|
||||
|
||||
public DisqualifiedException(CrawlerDocumentStatus crawlerStatus) {
|
||||
this.reason = DisqualificationReason.fromCrawlerStatus(crawlerStatus);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Throwable fillInStackTrace() {
|
||||
return this;
|
||||
@ -18,6 +25,22 @@ public class DisqualifiedException extends Exception {
|
||||
STATUS,
|
||||
QUALITY,
|
||||
ACCEPTABLE_ADS,
|
||||
FORBIDDEN
|
||||
FORBIDDEN,
|
||||
SHORT_CIRCUIT,
|
||||
|
||||
PROCESSING_EXCEPTION,
|
||||
|
||||
BAD_CONTENT_TYPE,
|
||||
BAD_CHARSET,
|
||||
REDIRECT,
|
||||
ROBOTS_TXT,
|
||||
ERROR,
|
||||
Timeout, // Don't you dare
|
||||
BAD_CANONICAL
|
||||
;
|
||||
|
||||
public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {
|
||||
return DisqualificationReason.valueOf(crawlerStatus.name());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -17,6 +17,10 @@ public class ProcessedDocument {
|
||||
public EdgeUrlState state;
|
||||
public String stateReason;
|
||||
|
||||
public boolean isOk() {
|
||||
return EdgeUrlState.OK == state;
|
||||
}
|
||||
|
||||
public OptionalDouble quality() {
|
||||
if (details != null) {
|
||||
return OptionalDouble.of(details.quality);
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
@ -81,32 +82,12 @@ public class DocumentProcessor {
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
|
||||
ProcessedDocument ret = new ProcessedDocument();
|
||||
|
||||
try {
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
if (ret.state == EdgeUrlState.OK) {
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (isAcceptedContentType(crawledDocument)) {
|
||||
var detailsWords = createDetails(crawledDomain, crawledDocument);
|
||||
|
||||
ret.details = detailsWords.details();
|
||||
ret.words = detailsWords.words();
|
||||
}
|
||||
else {
|
||||
throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
|
||||
}
|
||||
}
|
||||
else {
|
||||
throw new DisqualifiedException(DisqualificationReason.STATUS);
|
||||
}
|
||||
processDocument(crawledDocument, crawledDomain, ret);
|
||||
}
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
@ -115,6 +96,7 @@ public class DocumentProcessor {
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
ret.stateReason = DisqualificationReason.PROCESSING_EXCEPTION.toString();
|
||||
logger.info("Failed to convert " + crawledDocument.url, ex);
|
||||
ex.printStackTrace();
|
||||
}
|
||||
@ -122,6 +104,32 @@ public class DocumentProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
|
||||
|
||||
var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
|
||||
if (crawlerStatus != CrawlerDocumentStatus.OK) {
|
||||
throw new DisqualifiedException(crawlerStatus);
|
||||
}
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
|
||||
if (!isAcceptedContentType(crawledDocument)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
|
||||
}
|
||||
|
||||
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
var detailsWithWordsLinks = createDetails(crawledDomain, crawledDocument);
|
||||
|
||||
ret.details = detailsWithWordsLinks.details();
|
||||
ret.words = detailsWithWordsLinks.words();
|
||||
}
|
||||
|
||||
|
||||
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
|
||||
throws URISyntaxException
|
||||
{
|
||||
@ -193,9 +201,11 @@ public class DocumentProcessor {
|
||||
ret.standard = getHtmlStandard(doc);
|
||||
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
||||
|
||||
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
|
||||
ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
|
||||
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
|
||||
|
||||
KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality);
|
||||
|
||||
EdgePageWordSet words;
|
||||
if (shouldDoSimpleProcessing(url, ret)) {
|
||||
/* Some documents we'll index, but only superficially. This is a compromise
|
||||
@ -203,12 +213,12 @@ public class DocumentProcessor {
|
||||
queries. This also saves a lot of processing power.
|
||||
*/
|
||||
ret.features = Set.of(HtmlFeature.UNKNOWN);
|
||||
words = keywordExtractor.extractKeywordsMinimal(dld);
|
||||
words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata);
|
||||
ret.description = "";
|
||||
}
|
||||
else {
|
||||
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
|
||||
words = keywordExtractor.extractKeywords(dld);
|
||||
words = keywordExtractor.extractKeywords(dld, keywordMetadata);
|
||||
ret.description = getDescription(doc);
|
||||
}
|
||||
|
||||
@ -239,6 +249,10 @@ public class DocumentProcessor {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Annoying wordpress crap
|
||||
if (url.path.startsWith("/tag/") && url.path.endsWith("/")) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -262,7 +276,7 @@ public class DocumentProcessor {
|
||||
|
||||
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
|
||||
|
||||
words.append(IndexBlock.Meta, tagWords);
|
||||
words.appendWithNoMeta(IndexBlock.Meta, tagWords);
|
||||
}
|
||||
|
||||
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
||||
@ -296,14 +310,21 @@ public class DocumentProcessor {
|
||||
.ifPresent(lp::acceptFeed);
|
||||
}
|
||||
|
||||
createLinkKeywords(words, lp);
|
||||
createFileLinkKeywords(words, lp, domain);
|
||||
}
|
||||
|
||||
private void createLinkKeywords(EdgePageWordSet words, LinkProcessor lp) {
|
||||
final Set<String> linkTerms = new HashSet<>();
|
||||
|
||||
for (var fd : lp.getForeignDomains()) {
|
||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||
}
|
||||
words.append(IndexBlock.Meta, linkTerms);
|
||||
words.appendWithNoMeta(IndexBlock.Meta, linkTerms);
|
||||
}
|
||||
|
||||
private void createFileLinkKeywords(EdgePageWordSet words, LinkProcessor lp, EdgeDomain domain) {
|
||||
Set<String> fileKeywords = new HashSet<>(100);
|
||||
for (var link : lp.getNonIndexableUrls()) {
|
||||
|
||||
@ -314,8 +335,8 @@ public class DocumentProcessor {
|
||||
synthesizeFilenameKeyword(fileKeywords, link);
|
||||
|
||||
}
|
||||
words.append(IndexBlock.Artifacts, fileKeywords);
|
||||
|
||||
words.appendWithNoMeta(IndexBlock.Artifacts, fileKeywords);
|
||||
}
|
||||
|
||||
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
||||
@ -364,5 +385,7 @@ public class DocumentProcessor {
|
||||
return doc.text().length();
|
||||
}
|
||||
|
||||
private record DetailsWithWords(ProcessedDocumentDetails details, EdgePageWordSet words) {}
|
||||
private record DetailsWithWords(ProcessedDocumentDetails details,
|
||||
EdgePageWordSet words) {}
|
||||
|
||||
}
|
||||
|
@ -3,17 +3,22 @@ package nu.marginalia.wmsa.edge.converting.processor;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;
|
||||
|
||||
@ -47,6 +52,8 @@ public class DomainProcessor {
|
||||
|
||||
fixBadCanonicalTags(crawledDomain.doc);
|
||||
|
||||
InternalLinkGraph internalLinkGraph = new InternalLinkGraph();
|
||||
|
||||
DocumentDisqualifier disqualifier = new DocumentDisqualifier();
|
||||
for (var doc : crawledDomain.doc) {
|
||||
if (disqualifier.isQualified()) {
|
||||
@ -54,6 +61,9 @@ public class DomainProcessor {
|
||||
|
||||
if (processedDoc.url != null) {
|
||||
ret.documents.add(processedDoc);
|
||||
|
||||
internalLinkGraph.accept(processedDoc);
|
||||
|
||||
processedDoc.quality().ifPresent(disqualifier::offer);
|
||||
}
|
||||
else if ("LANGUAGE".equals(processedDoc.stateReason)) {
|
||||
@ -62,24 +72,16 @@ public class DomainProcessor {
|
||||
}
|
||||
else { // Short-circuit processing if quality is too low
|
||||
var stub = documentProcessor.makeDisqualifiedStub(doc);
|
||||
stub.stateReason = DisqualifiedException.DisqualificationReason.SHORT_CIRCUIT.toString();
|
||||
if (stub.url != null) {
|
||||
ret.documents.add(stub);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> commonSiteWords = new HashSet<>(10);
|
||||
flagCommonSiteWords(ret);
|
||||
flagAdjacentSiteWords(internalLinkGraph, ret);
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
|
||||
|
||||
if (!commonSiteWords.isEmpty()) {
|
||||
for (var doc : ret.documents) {
|
||||
if (doc.words != null) {
|
||||
doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
ret.documents = Collections.emptyList();
|
||||
@ -90,6 +92,70 @@ public class DomainProcessor {
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void flagCommonSiteWords(ProcessedDomain processedDomain) {
|
||||
Set<String> commonSiteWords = new HashSet<>(10);
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Tfidf_High, IndexBlock.Subjects));
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Title));
|
||||
|
||||
if (commonSiteWords.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (var doc : processedDomain.documents) {
|
||||
if (doc.words != null) {
|
||||
for (var block : IndexBlock.values()) {
|
||||
if (block.type == IndexBlockType.PAGE_DATA) {
|
||||
doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.Site, commonSiteWords);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void flagAdjacentSiteWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) {
|
||||
var invertedGraph = internalLinkGraph.trimAndInvert();
|
||||
|
||||
Map<EdgeUrl, Set<String>> linkedKeywords = new HashMap<>(100);
|
||||
|
||||
invertedGraph.forEach((url, linkingUrls) -> {
|
||||
Map<String, Integer> keywords = new HashMap<>(100);
|
||||
|
||||
for (var linkingUrl : linkingUrls) {
|
||||
for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) {
|
||||
keywords.merge(keyword, 1, Integer::sum);
|
||||
}
|
||||
}
|
||||
|
||||
var words = keywords.entrySet().stream()
|
||||
.filter(e -> e.getValue() > 3)
|
||||
.map(Map.Entry::getKey)
|
||||
.filter(internalLinkGraph.getCandidateKeywords(url)::contains)
|
||||
.collect(Collectors.toSet());
|
||||
if (!words.isEmpty()) {
|
||||
linkedKeywords.put(url, words);
|
||||
}
|
||||
});
|
||||
|
||||
for (var doc : processedDomain.documents) {
|
||||
if (doc.words == null)
|
||||
continue;
|
||||
|
||||
final Set<String> keywords = linkedKeywords.get(doc.url);
|
||||
if (keywords == null)
|
||||
continue;
|
||||
|
||||
for (var block : IndexBlock.values()) {
|
||||
if (block.type == IndexBlockType.PAGE_DATA) {
|
||||
doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.SiteAdjacent, keywords);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
private void fixBadCanonicalTags(List<CrawledDocument> docs) {
|
||||
Map<String, Set<String>> seenCanonicals = new HashMap<>();
|
||||
Set<String> seenUrls = new HashSet<>();
|
||||
@ -162,7 +228,8 @@ public class DomainProcessor {
|
||||
}
|
||||
|
||||
boolean isQualified() {
|
||||
return count < 25 || goodCount*10 >= count;
|
||||
return true;
|
||||
// return count < 25 || goodCount*10 >= count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,116 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.*;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class InstructionsCompiler {
|
||||
|
||||
public List<Instruction> compile(ProcessedDomain domain) {
|
||||
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||
|
||||
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||
|
||||
if (domain.documents != null) {
|
||||
compileUrls(ret, domain.documents);
|
||||
compileDocuments(ret, domain.documents);
|
||||
compileFeeds(ret, domain.documents);
|
||||
|
||||
compileLinks(ret, domain.domain, domain.documents);
|
||||
}
|
||||
if (domain.redirect != null) {
|
||||
compileRedirect(ret, domain.domain, domain.redirect);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void compileRedirect(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
|
||||
ret.add(new LoadDomain(to));
|
||||
ret.add(new LoadDomainLink(new DomainLink(from, to)));
|
||||
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
|
||||
}
|
||||
|
||||
private void compileUrls(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||
|
||||
for (var doc : documents) {
|
||||
seenUrls.add(doc.url);
|
||||
|
||||
if (doc.details != null) {
|
||||
for (var url : doc.details.linksExternal) {
|
||||
seenDomains.add(url.domain);
|
||||
}
|
||||
seenUrls.addAll(doc.details.linksExternal);
|
||||
seenUrls.addAll(doc.details.linksInternal);
|
||||
}
|
||||
}
|
||||
|
||||
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
|
||||
}
|
||||
|
||||
private void compileLinks(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
|
||||
DomainLink[] links = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.linksExternal.stream())
|
||||
.map(link -> link.domain)
|
||||
.distinct()
|
||||
.map(domain -> new DomainLink(from, domain))
|
||||
.toArray(DomainLink[]::new);
|
||||
|
||||
ret.add(new LoadDomainLink(links));
|
||||
}
|
||||
|
||||
private void compileFeeds(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
|
||||
.filter(Objects::nonNull)
|
||||
.flatMap(dets -> dets.feedLinks.stream())
|
||||
.distinct()
|
||||
.toArray(EdgeUrl[]::new);
|
||||
|
||||
ret.add(new LoadRssFeed(feeds));
|
||||
}
|
||||
|
||||
private void compileDocuments(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||
|
||||
for (var doc : documents) {
|
||||
compileDocumentDetails(ret, doc);
|
||||
}
|
||||
|
||||
for (var doc : documents) {
|
||||
compileWords(ret, doc);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var details = doc.details;
|
||||
|
||||
if (details != null) {
|
||||
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
|
||||
}
|
||||
else {
|
||||
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state));
|
||||
}
|
||||
}
|
||||
|
||||
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
|
||||
var words = doc.words;
|
||||
if (words != null) {
|
||||
var wordsArray = words.values().stream()
|
||||
.map(DocumentKeywords::new)
|
||||
.toArray(DocumentKeywords[]::new);
|
||||
|
||||
ret.add(new LoadKeywords(doc.url, wordsArray));
|
||||
}
|
||||
}
|
||||
}
|
@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
import crawlercommons.utils.Strings;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
@ -23,13 +24,12 @@ public class DocumentValuator {
|
||||
|
||||
);
|
||||
|
||||
public double getQuality(EdgeHtmlStandard htmlStandard, Document doc, DocumentLanguageData dld) throws DisqualifiedException {
|
||||
public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
|
||||
double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
|
||||
double scriptPenalty = getScriptPenalty(doc);
|
||||
double scriptPenalty = getScriptPenalty(parsedDocument);
|
||||
|
||||
|
||||
int textBodyLength = doc.text().length();
|
||||
int rawLength = doc.html().length();
|
||||
int textBodyLength = parsedDocument.text().length();
|
||||
int rawLength = crawledDocument.documentBody.length();
|
||||
|
||||
if (textBodyLength == 0) {
|
||||
throw new DisqualifiedException(LENGTH);
|
||||
|
@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@ -43,13 +40,15 @@ public class FeatureExtractor {
|
||||
private final RecipeDetector recipeDetector;
|
||||
private final TextileCraftDetector textileCraftDetector;
|
||||
private final WoodworkingDetector woodworkingDetector;
|
||||
private final GoogleAnwersSpamDetector googleAnwersSpamDetector;
|
||||
|
||||
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
this.recipeDetector = recipeDetector;
|
||||
this.textileCraftDetector = textileCraftDetector;
|
||||
this.woodworkingDetector = woodworkingDetector;
|
||||
this.googleAnwersSpamDetector = googleAnwersSpamDetector;
|
||||
}
|
||||
|
||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
|
||||
@ -57,6 +56,10 @@ public class FeatureExtractor {
|
||||
|
||||
final Elements scriptTags = doc.getElementsByTag("script");
|
||||
|
||||
if (googleAnwersSpamDetector.testP(doc) > 0.5) {
|
||||
features.add(HtmlFeature.GA_SPAM);
|
||||
}
|
||||
|
||||
for (var scriptTag : scriptTags) {
|
||||
if (isJavascriptTag(scriptTag)) {
|
||||
features.add(HtmlFeature.JS);
|
||||
|
@ -7,14 +7,14 @@ public enum HtmlFeature {
|
||||
JS("special:scripts"),
|
||||
AFFILIATE_LINK( "special:affiliate"),
|
||||
TRACKING("special:tracking"),
|
||||
|
||||
COOKIES("special:cookies"),
|
||||
|
||||
CATEGORY_FOOD("category:food"),
|
||||
|
||||
ADVERTISEMENT("special:ads"),
|
||||
|
||||
CATEGORY_CRAFTS("category:crafts"),
|
||||
|
||||
GA_SPAM("special:gaspam"),
|
||||
|
||||
UNKNOWN("special:uncategorized")
|
||||
;
|
||||
|
||||
|
@ -0,0 +1,54 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class InternalLinkGraph {
|
||||
private final Map<EdgeUrl, Set<EdgeUrl>> internalLinkGraph = new HashMap<>(1000);
|
||||
private final Set<EdgeUrl> goodUrls = new HashSet<>(1000);
|
||||
private final Map<EdgeUrl, Set<String>> topKeywordsByUrl = new HashMap<>(1000);
|
||||
private final Map<EdgeUrl, Set<String>> candidateKeywordsByUrl = new HashMap<>(1000);
|
||||
|
||||
public void accept(ProcessedDocument doc) {
|
||||
if (doc.details == null || doc.details.linksInternal == null)
|
||||
return;
|
||||
|
||||
goodUrls.add(doc.url);
|
||||
internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal));
|
||||
|
||||
Set<String> topKeywords = new HashSet<>(doc.words.get(IndexBlock.Tfidf_High).words);
|
||||
topKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
|
||||
topKeywordsByUrl.put(doc.url, topKeywords);
|
||||
|
||||
Set<String> candidateKeywords = new HashSet<>(topKeywords);
|
||||
candidateKeywords.addAll(doc.words.get(IndexBlock.Tfidf_High).words);
|
||||
candidateKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
|
||||
candidateKeywordsByUrl.put(doc.url, candidateKeywords);
|
||||
}
|
||||
|
||||
public Map<EdgeUrl, Set<EdgeUrl>> trimAndInvert() {
|
||||
internalLinkGraph.values().forEach(dest -> dest.retainAll(goodUrls));
|
||||
|
||||
Map<EdgeUrl, Set<EdgeUrl>> inverted = new HashMap<>(goodUrls.size());
|
||||
|
||||
internalLinkGraph.forEach((source, dests) -> {
|
||||
dests.forEach(dest -> inverted.computeIfAbsent(dest,
|
||||
d->new HashSet<>(25))
|
||||
.add(source));
|
||||
});
|
||||
|
||||
internalLinkGraph.clear();
|
||||
|
||||
return inverted;
|
||||
}
|
||||
|
||||
public Set<String> getKeywords(EdgeUrl url) {
|
||||
return topKeywordsByUrl.getOrDefault(url, Collections.emptySet());
|
||||
}
|
||||
public Set<String> getCandidateKeywords(EdgeUrl url) {
|
||||
return candidateKeywordsByUrl.getOrDefault(url, Collections.emptySet());
|
||||
}
|
||||
}
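trimAndInvert() first restricts the graph to pages that were actually processed, then flips every edge so each page can be looked up by the pages that link to it. The inversion step in isolation, as a generic sketch:

import java.util.*;

class GraphInvertSketch {
    // For every edge source -> dest in the input, record dest -> source in the output.
    static <T> Map<T, Set<T>> invert(Map<T, Set<T>> graph) {
        Map<T, Set<T>> inverted = new HashMap<>();
        graph.forEach((source, dests) ->
                dests.forEach(dest ->
                        inverted.computeIfAbsent(dest, d -> new HashSet<>()).add(source)));
        return inverted;
    }
}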
|
@ -5,7 +5,6 @@ import com.google.common.base.Strings;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.jetbrains.annotations.Contract;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.slf4j.Logger;
|
||||
@ -202,7 +201,6 @@ public class LinkParser {
|
||||
return binarySuffixList.stream().anyMatch(str::endsWith);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
||||
var baseTags = parsed.getElementsByTag("base");
|
||||
|
||||
|
@ -1,9 +1,13 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.StringJoiner;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class QueryParams {
|
||||
|
||||
@ -15,10 +19,28 @@ public class QueryParams {
|
||||
return null;
|
||||
}
|
||||
|
||||
var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
|
||||
.filter(param -> QueryParams.isPermittedParam(path, param))
|
||||
.sorted()
|
||||
.collect(Collectors.joining("&"));
|
||||
String ret;
|
||||
if (queryParams.indexOf('&') >= 0) {
|
||||
|
||||
List<String> parts = new ArrayList<>();
|
||||
for (var part : StringUtils.split(queryParams, '&')) {
|
||||
if (QueryParams.isPermittedParam(path, part)) {
|
||||
parts.add(part);
|
||||
}
|
||||
}
|
||||
if (parts.size() > 1) {
|
||||
parts.sort(Comparator.naturalOrder());
|
||||
}
|
||||
StringJoiner retJoiner = new StringJoiner("&");
|
||||
parts.forEach(retJoiner::add);
|
||||
ret = retJoiner.toString();
|
||||
}
|
||||
else if (isPermittedParam(path, queryParams)) {
|
||||
ret = queryParams;
|
||||
}
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (ret.isBlank())
|
||||
return null;
|
||||
|
@ -0,0 +1,36 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;

import org.jsoup.nodes.Document;

import java.util.List;

public class GoogleAnwersSpamDetector {

    private final List<String> prefixes = List.of("What", "Why", "How", "When", "Is");

    public double testP(Document doc) {
        if (trialTag(doc, "h1")) return 1;
        if (trialTag(doc, "h2")) return 1;
        if (trialTag(doc, "h3")) return 1;

        return 0;
    }

    private boolean trialTag(Document doc, String tagName) {
        int positive = 0;
        int total = 0;

        for (var elem : doc.getElementsByTag(tagName)) {
            String text = elem.text();
            for (var prefix : prefixes) {
                if (text.startsWith(prefix)) {
                    positive++;
                    break;
                }
            }
            total++;
        }

        return positive > 4 && positive / (double) total > 0.5;
    }
}
|
@ -29,7 +29,7 @@ public class CrawlJobExtractorMain {
|
||||
"""
|
||||
SELECT ID
|
||||
FROM EC_DOMAIN
|
||||
WHERE URL_PART=?
|
||||
WHERE DOMAIN_NAME=?
|
||||
""";
|
||||
|
||||
private static final String domainsSql =
|
||||
|
@ -11,6 +11,17 @@ import java.util.regex.Pattern;
|
||||
public class UrlBlocklist {
|
||||
private final List<Predicate<String>> patterns = new ArrayList<>();
|
||||
|
||||
    private record UrlPatternContains(String contains, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.contains(contains) && pattern.matcher(s).find();
        }
    }
    private record UrlPatternMinLength(int minLength, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.length() >= minLength && pattern.matcher(s).find();
        }
    }
|
||||
|
||||
// domains that have a lot of links but we know we don't want to crawl
|
||||
private final Set<String> badDomains = Set.of("t.co", "facebook.com",
|
||||
"instagram.com", "youtube.com",
|
||||
@ -18,18 +29,24 @@ public class UrlBlocklist {
|
||||
|
||||
public UrlBlocklist() {
|
||||
// Don't deep-crawl git repos
|
||||
patterns.add(Pattern.compile("\\.git/.+").asPredicate());
|
||||
patterns.add(Pattern.compile("wp-content/upload").asPredicate());
|
||||
patterns.add(s -> s.contains(".git/"));
|
||||
|
||||
patterns.add(s -> s.contains("wp-content/upload"));
|
||||
patterns.add(s -> s.contains("-download-free"));
|
||||
|
||||
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
||||
patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
|
||||
patterns.add(new UrlPatternMinLength(48, Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)")));
|
||||
|
||||
// link farms &c
|
||||
patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
|
||||
patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
|
||||
patterns.add(Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$").asPredicate());
|
||||
patterns.add(Pattern.compile(".*-download-free$").asPredicate());
|
||||
patterns.add(new UrlPatternContains("/download", Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$")));
|
||||
patterns.add(new UrlPatternContains("/permalink", Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$")));
|
||||
patterns.add(new UrlPatternContains("/webrx", Pattern.compile("webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/lib", Pattern.compile("lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/pdf", Pattern.compile("pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/book", Pattern.compile("book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/720p", Pattern.compile("720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
|
||||
patterns.add(new UrlPatternContains("/node", Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$")));
|
||||
|
||||
}
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
|
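The blocklist rewrite above pairs each regex with a cheap guard (a substring or minimum-length check) so Pattern matching only runs on URLs that could plausibly match. A stand-alone sketch of the same guarded-predicate idea; the example pattern is simplified and not one of the production expressions:

import java.util.List;
import java.util.function.Predicate;
import java.util.regex.Pattern;

class GuardedPatternSketch {
    // The regex is only evaluated when the cheap contains() test passes.
    record Guarded(String mustContain, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.contains(mustContain) && pattern.matcher(s).find();
        }
    }

    static final List<Predicate<String>> patterns = List.of(
            new Guarded("/download", Pattern.compile("/download(-[A-Za-z0-9]+){4,}\\.html?$")));

    static boolean isBlocked(String path) {
        return patterns.stream().anyMatch(p -> p.test(path));
    }
}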
@@ -31,6 +31,8 @@ public class CrawlerRetreiver {

    private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500);
    private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);

    private static final int MAX_ERRORS = 10;

    private final LinkedList<EdgeUrl> queue = new LinkedList<>();
    private final HttpFetcher fetcher;

@@ -50,6 +52,8 @@ public class CrawlerRetreiver {
    private static final IpBlockList ipBlocklist;
    private static final UrlBlocklist urlBlocklist = new UrlBlocklist();

    int errorCount = 0;

    static {
        try {
            ipBlocklist = new IpBlockList(new GeoIpBlocklist());
@@ -137,7 +141,7 @@ public class CrawlerRetreiver {

        int fetchedCount = 0;

        while (!queue.isEmpty() && visited.size() < depth) {
        while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) {
            var top = queue.removeFirst();

            if (!robotsRules.isAllowed(top.toString())) {
@@ -179,6 +183,10 @@ public class CrawlerRetreiver {
                EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
            }

            if ("ERROR".equals(d.crawlerStatus)) {
                errorCount++;
            }

        }

        long crawledTime = System.currentTimeMillis() - startTime;
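Taken together, the new field and the widened loop condition mean a crawl now aborts early once a site has produced MAX_ERRORS failed fetches, instead of grinding through the full depth budget. Roughly, with a stand-in fetch helper that is not the class's real method:

// Stand-in sketch of the new bail-out behaviour; not literal project code.
int errorCount = 0;
while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS) {
    var doc = fetchAndProcess(queue.removeFirst());   // hypothetical helper
    if ("ERROR".equals(doc.crawlerStatus)) {
        errorCount++;                                 // ten failures and the domain is abandoned
    }
}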
@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;

import java.util.List;
import java.util.Optional;

@ImplementedBy(EdgeDataStoreDaoImpl.class)
public interface EdgeDataStoreDao {
@@ -23,7 +24,7 @@ public interface EdgeDataStoreDao {

    List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);

    EdgeDomain getDomain(EdgeId<EdgeDomain> id);
    Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id);

}
@@ -93,7 +93,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                    WORDS_TOTAL, FORMAT, FEATURES,
                    IP, DOMAIN_STATE,
                    DATA_HASH
                    FROM EC_URL_VIEW WHERE ID IN
                    FROM EC_URL_VIEW
                    WHERE TITLE IS NOT NULL
                    AND ID IN
                    """ + idString)) {
                stmt.setFetchSize(ids.size());

@@ -113,7 +115,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                            EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
                            Integer.MAX_VALUE, // rankingId
                            Double.MAX_VALUE, // termScore
                            1 // resultsFromSameDomain
                            1, // resultsFromSameDomain
                            "", // positions
                            null // result item
                    );
                    if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                            && Strings.isNullOrEmpty(val.description)
@@ -309,18 +313,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {

    @Override
    @SneakyThrows
    public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
    public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
        try (var connection = dataSource.getConnection()) {

            try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                stmt.setInt(1, id.id());
                var rsp = stmt.executeQuery();
                if (rsp.next()) {
                    return new EdgeDomain(rsp.getString(1));
                    return Optional.of(new EdgeDomain(rsp.getString(1)));
                }
                throw new NoSuchElementException();
                return Optional.empty();
            }
        }
    }

}
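Returning Optional.empty() instead of throwing NoSuchElementException pushes the missing-domain case onto the caller. A hedged sketch of what a call site looks like after this change (the dao variable and the fallback string are illustrative, not taken from the commit):

// Illustrative call site only; "dao" and the fallback string are assumptions.
Optional<EdgeDomain> domain = dao.getDomain(id);
String name = domain.map(EdgeDomain::toString)
                    .orElse("(unknown domain)");   // previously this path threw NoSuchElementException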
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.reactivex.rxjava3.schedulers.Schedulers;
import lombok.SneakyThrows;
import org.slf4j.Logger;
@@ -18,8 +17,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    private static final Counter wmsa_blacklist_intercept = Counter.build("wmsa_blacklist_intercept",
            "wmsa_blacklist_intercept").register();
    @Inject
    public EdgeDomainBlacklistImpl(HikariDataSource dataSource) {
        this.dataSource = dataSource;
@@ -65,7 +62,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
    @Override
    public boolean isBlacklisted(int domainId) {
        if (spamDomainSet.contains(domainId)) {
            wmsa_blacklist_intercept.inc();
            return true;
        }

@@ -0,0 +1,34 @@
package nu.marginalia.wmsa.edge.explorer;

import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.wmsa.configuration.MainClass;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Initialization;
import spark.Spark;

public class ExplorerMain extends MainClass {
    final ExplorerService service;

    @Inject
    public ExplorerMain(ExplorerService service) {
        this.service = service;
    }

    public static void main(String... args) {
        init(ServiceDescriptor.EXPLORER, args);

        Spark.staticFileLocation("/static/explore/");

        Injector injector = Guice.createInjector(
                new ConfigurationModule(),
                new DatabaseModule()
        );

        injector.getInstance(ExplorerMain.class);
        injector.getInstance(Initialization.class).setReady();
    }
}
@ -0,0 +1,253 @@
|
||||
package nu.marginalia.wmsa.edge.explorer;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
import nu.marginalia.wmsa.resource_store.StaticResources;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
public class ExplorerService extends Service {
|
||||
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final HikariDataSource dataSource;
|
||||
private final StaticResources staticResources;
|
||||
|
||||
record SearchResult(
|
||||
String domain,
|
||||
String url,
|
||||
double relatedness,
|
||||
boolean hasMore,
|
||||
boolean active,
|
||||
boolean indexed) implements Comparable<SearchResult> {
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull SearchResult o) {
|
||||
return (int)(o.relatedness - relatedness);
|
||||
}
|
||||
}
|
||||
|
||||
record SearchResults(String query, String message, String aliasDomain, List<SearchResult> resultList) { }
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public ExplorerService(@Named("service-host") String ip,
|
||||
@Named("service-port") Integer port,
|
||||
Initialization initialization,
|
||||
MetricsServer metricsServer,
|
||||
RendererFactory rendererFactory,
|
||||
HikariDataSource dataSource,
|
||||
StaticResources staticResources
|
||||
) {
|
||||
|
||||
super(ip, port, initialization, metricsServer);
|
||||
|
||||
renderer = rendererFactory.renderer("explorer/explorer");
|
||||
this.dataSource = dataSource;
|
||||
this.staticResources = staticResources;
|
||||
Spark.get("/public/", this::serveIndex, this::render);
|
||||
Spark.get("/public/search", this::search, this::render);
|
||||
Spark.get("/public/:resource", this::serveStatic);
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Object serveStatic(Request request, Response response) {
|
||||
String resource = request.params("resource");
|
||||
staticResources.serveStatic("explore", resource, request, response);
|
||||
return "";
|
||||
}
|
||||
|
||||
public String render(Object results) {
|
||||
return renderer.render(results);
|
||||
}
|
||||
|
||||
private SearchResults serveIndex(Request request, Response response) {
|
||||
|
||||
return new SearchResults("", "", null, Collections.emptyList());
|
||||
}
|
||||
|
||||
|
||||
private SearchResults search(Request request, Response response) throws SQLException {
|
||||
String query = request.queryParams("domain");
|
||||
|
||||
query = trimUrlJunk(query);
|
||||
|
||||
DomainIdInformation domainId = getDomainId(query);
|
||||
if (!domainId.isPresent()) {
|
||||
return new SearchResults(query,
|
||||
"Could not find such a domain (maybe try adding/removing www?)",
|
||||
null, Collections.emptyList());
|
||||
}
|
||||
|
||||
var relatedDomains = getRelatedDomains(domainId);
|
||||
|
||||
if (relatedDomains.isEmpty()) {
|
||||
String message = """
|
||||
I've got nothing. This may either be due to the website being far out in the periphery of Marginalia's
|
||||
search engine index, or it may be due to the website being too big.
|
||||
A few hundred of the biggest websites are excluded for performance reasons. They are usually
|
||||
not very interesting to look at either as everyone links to them and there's no real pattern to discern.
|
||||
""";
|
||||
|
||||
return new SearchResults(query, message, domainId.alias, relatedDomains);
|
||||
}
|
||||
|
||||
return new SearchResults(query, "", domainId.alias, relatedDomains);
|
||||
}
|
||||
|
||||
private List<SearchResult> getRelatedDomains(DomainIdInformation domainIdInformation) throws SQLException {
|
||||
List<SearchResult> ret = new ArrayList<>();
|
||||
Set<String> seen = new HashSet<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT
|
||||
NV.NEIGHBOR_NAME,
|
||||
NV.RELATEDNESS,
|
||||
(LV.DOMAIN_ID IS NOT NULL),
|
||||
(STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
|
||||
INDEXED > 0
|
||||
FROM EC_NEIGHBORS_VIEW NV
|
||||
LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID)
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
|
||||
WHERE NV.DOMAIN_ID=?
|
||||
GROUP BY NV.NEIGHBOR_ID
|
||||
ORDER BY NV.RELATEDNESS DESC
|
||||
""");
|
||||
var stmtRev = conn.prepareStatement("""
|
||||
SELECT
|
||||
NV.DOMAIN_NAME,
|
||||
NV.RELATEDNESS,
|
||||
(LV.NEIGHBOR_ID IS NOT NULL),
|
||||
(STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
|
||||
INDEXED > 0
|
||||
FROM EC_NEIGHBORS_VIEW NV
|
||||
LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID)
|
||||
INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID
|
||||
WHERE NV.NEIGHBOR_ID=?
|
||||
GROUP BY NV.DOMAIN_ID
|
||||
ORDER BY NV.RELATEDNESS DESC
|
||||
"""
|
||||
);
|
||||
|
||||
) {
|
||||
|
||||
stmt.setInt(1, domainIdInformation.domainId);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
||||
String domainName = rsp.getString(1);
|
||||
double relatedness = rsp.getDouble(2);
|
||||
boolean hasMore = rsp.getBoolean(3);
|
||||
boolean active = rsp.getBoolean(4);
|
||||
boolean indexed = rsp.getBoolean(5);
|
||||
|
||||
seen.add(domainName);
|
||||
|
||||
String url = "http://" + domainName + "/";
|
||||
|
||||
|
||||
if (domainName.length() < 48 && domainName.contains(".")) {
|
||||
ret.add(new SearchResult(
|
||||
domainName,
|
||||
url,
|
||||
relatedness,
|
||||
hasMore,
|
||||
active,
|
||||
indexed
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
stmtRev.setInt(1, domainIdInformation.domainId);
|
||||
rsp = stmtRev.executeQuery();
|
||||
while (rsp.next()) {
|
||||
|
||||
String domainName = rsp.getString(1);
|
||||
double relatedness = rsp.getDouble(2);
|
||||
boolean hasMore = rsp.getBoolean(3);
|
||||
boolean active = rsp.getBoolean(4);
|
||||
boolean indexed = rsp.getBoolean(5);
|
||||
|
||||
String url = "http://" + domainName + "/";
|
||||
|
||||
if (!seen.add(domainName))
|
||||
continue;
|
||||
|
||||
if (domainName.length() < 48 && domainName.contains(".")) {
|
||||
ret.add(new SearchResult(
|
||||
domainName,
|
||||
url,
|
||||
relatedness,
|
||||
hasMore,
|
||||
active,
|
||||
indexed
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Comparator<SearchResult> comp = SearchResult::compareTo;
|
||||
comp = comp.thenComparing(SearchResult::domain);
|
||||
ret.sort(comp);
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
||||
private DomainIdInformation getDomainId(String query) throws SQLException {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME
|
||||
FROM EC_DOMAIN DOMAIN
|
||||
LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID
|
||||
WHERE DOMAIN.DOMAIN_NAME=?
|
||||
""")) {
|
||||
stmt.setString(1, query);
|
||||
var rsp = stmt.executeQuery();
|
||||
if (rsp.next()) {
|
||||
return new DomainIdInformation(
|
||||
rsp.getInt(1),
|
||||
rsp.getBoolean(2),
|
||||
rsp.getString(3)
|
||||
);
|
||||
}
|
||||
}
|
||||
return new DomainIdInformation(-1, false, null);
|
||||
}
|
||||
|
||||
private String trimUrlJunk(String query) {
|
||||
if (query.startsWith("http://")) {
|
||||
query = query.substring(7);
|
||||
}
|
||||
if (query.startsWith("https://")) {
|
||||
query = query.substring(8);
|
||||
}
|
||||
|
||||
int lastSlash = query.indexOf('/');
|
||||
if (lastSlash > 0) {
|
||||
query = query.substring(0, lastSlash);
|
||||
}
|
||||
|
||||
return query;
|
||||
}
|
||||
|
||||
record DomainIdInformation(int domainId, boolean indexed, String alias) {
|
||||
boolean isPresent() {
|
||||
return domainId >= 0;
|
||||
}
|
||||
}
|
||||
}
|
@@ -1,20 +1,19 @@
package nu.marginalia.wmsa.edge.index;

import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryRankLimitingFilter;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -103,54 +102,65 @@ public class EdgeIndexBucket {
        return indexReader != null;
    }

    public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
    public IndexQuery getQuery(LongPredicate filter, IndexQueryParams params) {

        if (null == indexReader) {
            logger.warn("Index reader not neady {}", block);
            logger.warn("Index reader not neady {}", params.block());
            return new IndexQuery(Collections.emptyList());
        }

        final int[] orderedIncludes = searchTerms.includes
                .stream()
                .sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
                .distinct()
                .mapToInt(Integer::intValue)
                .toArray();
        final int[] orderedIncludes = params.searchTerms()
                .sortedDistinctIncludes((a, b) -> compareKeywords(params.block(), a, b));

        IndexQueryFactory.IndexQueryBuilder query;
        IndexQueryFactory.IndexQueryBuilder query = createQueryBuilder(orderedIncludes[0], params);

        query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
        if (query == null) {
            return new IndexQuery(Collections.emptyList());
        }

        query.filter(filter);
        query.addInclusionFilter(new QueryFilterStepFromPredicate(filter));
        if (params.rankLimit() != null) {
            query.addInclusionFilter(new QueryRankLimitingFilter(params.rankLimit()));
        }

        for (int i = 1; i < orderedIncludes.length; i++) {
            query = query.also(orderedIncludes[i]);
        }

        for (int term : searchTerms.excludes) {
        for (int term : params.searchTerms().excludes()) {
            query = query.not(term);
        }

        return query.build();
    }

    private IndexQueryFactory.IndexQueryBuilder createQueryBuilder(int firstKeyword, IndexQueryParams params) {

    public IndexQuery getDomainQuery(IndexQueryCachePool pool, int wordId, ResultDomainDeduplicator localFilter) {
        var query = indexReader.findDomain(pool, wordId);
        if (params.targetDomains() != null && !params.targetDomains().isEmpty()) {
            return indexReader.findWordForDomainList(params.block(), params.targetDomains(), firstKeyword);
        }
        return indexReader.findWord(params.block(), params.qualityLimit(), firstKeyword);

    }

    private int compareKeywords(IndexBlock block, int a, int b) {
        return Long.compare(
                indexReader.numHits(block, a),
                indexReader.numHits(block, b)
        );
    }


    public IndexQuery getDomainQuery(int wordId, ResultDomainDeduplicator localFilter) {
        var query = indexReader.findDomain(wordId);

        query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue));

        return query;
    }

    public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
        return indexReader.getBlockForResult(cachePool, termId, urlId);
    /** Replaces the values of ids with their associated metadata, or 0L if absent */
    public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
        return indexReader.getMetadata(block, termId, ids);
    }

    public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
        return indexReader.isTermInBucket(cachePool, block, termId, urlId);
    }

}
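Both before and after this change, the include terms are sorted by ascending hit count, so the query intersection starts from the rarest, most selective keyword and the broader terms only filter an already small candidate set. A self-contained sketch of that ordering step (the names here are illustrative, not the project's API):

// Illustrative only: order term ids so posting-list intersection starts from the smallest list.
int[] orderTermsBySelectivity(int[] termIds, java.util.function.IntToLongFunction numHits) {
    return java.util.Arrays.stream(termIds)
            .distinct()
            .boxed()
            .sorted(java.util.Comparator.comparingLong(numHits::applyAsLong))
            .mapToInt(Integer::intValue)
            .toArray();
}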
@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@ -18,9 +18,6 @@ public class EdgeIndexControl {
|
||||
}
|
||||
|
||||
public void regenerateIndex(int id) {
|
||||
System.runFinalization();
|
||||
System.gc();
|
||||
|
||||
for (IndexBlock block : IndexBlock.values()) {
|
||||
try {
|
||||
servicesFactory.convertIndex(id, block);
|
||||
|
@ -9,6 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
|
||||
@ -39,7 +40,9 @@ public class EdgeIndexService extends Service {
|
||||
|
||||
EdgeIndexOpsService opsService,
|
||||
EdgeIndexLexiconService lexiconService,
|
||||
EdgeIndexQueryService indexQueryService)
|
||||
EdgeIndexQueryService indexQueryService,
|
||||
EdgeIndexDomainQueryService domainQueryService
|
||||
)
|
||||
{
|
||||
super(ip, port, init, metricsServer);
|
||||
|
||||
@ -51,7 +54,7 @@ public class EdgeIndexService extends Service {
|
||||
Spark.post("/words/", lexiconService::putWords);
|
||||
|
||||
Spark.post("/search/", indexQueryService::search, gson::toJson);
|
||||
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
|
||||
Spark.post("/search-domain/", domainQueryService::searchDomain, gson::toJson);
|
||||
|
||||
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
|
||||
|
||||
|
@ -103,9 +103,9 @@ public class IndexServicesFactory {
|
||||
|
||||
public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
|
||||
var converter = new SearchIndexConverter(block, id, tmpFileDir,
|
||||
preconverterOutputFile.get(id, block.ordinal()),
|
||||
indexWriteWordsFile.get(id, block.id),
|
||||
indexWriteUrlsFile.get(id, block.id),
|
||||
preconverterOutputFile.get(id, block),
|
||||
indexWriteWordsFile.get(id, block),
|
||||
indexWriteUrlsFile.get(id, block),
|
||||
partitioner,
|
||||
domainBlacklist
|
||||
);
|
||||
@ -118,7 +118,7 @@ public class IndexServicesFactory {
|
||||
|
||||
for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
|
||||
for (IndexBlock block : IndexBlock.values()) {
|
||||
shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block.ordinal()));
|
||||
shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block));
|
||||
}
|
||||
}
|
||||
|
||||
@ -129,7 +129,7 @@ public class IndexServicesFactory {
|
||||
);
|
||||
}
|
||||
|
||||
private File getPreconverterOutputFile(int index, int block) {
|
||||
private File getPreconverterOutputFile(int index, IndexBlock block) {
|
||||
return preconverterOutputFile.get(index, block);
|
||||
}
|
||||
|
||||
@ -141,7 +141,7 @@ public class IndexServicesFactory {
|
||||
indexMap.put(block, createSearchIndex(id, block));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Could not create index {}-{}", id, block);
|
||||
logger.error("Could not create index {}-{} ({})", id, block, ex.getMessage());
|
||||
}
|
||||
}
|
||||
return new SearchIndexReader(indexMap);
|
||||
@ -150,8 +150,8 @@ public class IndexServicesFactory {
|
||||
private SearchIndex createSearchIndex(int bucketId, IndexBlock block) {
|
||||
try {
|
||||
return new SearchIndex("IndexReader"+bucketId+":"+ block.name(),
|
||||
indexReadUrlsFile.get(bucketId, block.id),
|
||||
indexReadWordsFile.get(bucketId, block.id));
|
||||
indexReadUrlsFile.get(bucketId, block),
|
||||
indexReadWordsFile.get(bucketId, block));
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
@ -159,7 +159,8 @@ public class IndexServicesFactory {
|
||||
|
||||
public Callable<Boolean> switchFilesJob(int id) {
|
||||
return () -> {
|
||||
for (int block = 0; block < IndexBlock.values().length; block++) {
|
||||
|
||||
for (var block : IndexBlock.values()) {
|
||||
if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
|
||||
Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
|
||||
Files.move(
|
||||
@ -172,6 +173,7 @@ public class IndexServicesFactory {
|
||||
StandardCopyOption.REPLACE_EXISTING);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
}
|
||||
@ -205,8 +207,8 @@ class PartitionedDataFile {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
public File get(int id) {
|
||||
Path partitionDir = partition.resolve(Integer.toString(id));
|
||||
public File get(Object id) {
|
||||
Path partitionDir = partition.resolve(id.toString());
|
||||
if (!partitionDir.toFile().exists()) {
|
||||
partitionDir.toFile().mkdir();
|
||||
}
|
||||
@ -223,13 +225,13 @@ class DoublePartitionedDataFile {
|
||||
this.pattern = pattern;
|
||||
}
|
||||
|
||||
public File get(int id, int id2) {
|
||||
Path partitionDir = partition.resolve(Integer.toString(id));
|
||||
public File get(Object id, Object id2) {
|
||||
Path partitionDir = partition.resolve(id.toString());
|
||||
|
||||
if (!partitionDir.toFile().exists()) {
|
||||
partitionDir.toFile().mkdir();
|
||||
}
|
||||
partitionDir = partitionDir.resolve(Integer.toString(id2));
|
||||
partitionDir = partitionDir.resolve(id2.toString());
|
||||
if (!partitionDir.toFile().exists()) {
|
||||
partitionDir.toFile().mkdir();
|
||||
}
|
||||
|
@ -47,6 +47,9 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexW
|
||||
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
|
||||
wordSetBuilder.setIndex(wordSet.block().ordinal());
|
||||
wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
|
||||
for (var meta : wordSet.metadata()) {
|
||||
wordSetBuilder.addMeta(meta);
|
||||
}
|
||||
keywordBuilder.addWordSet(wordSetBuilder.build());
|
||||
|
||||
var req = keywordBuilder.build();
|
||||
|
@@ -21,7 +21,6 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;

@Singleton
public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@@ -53,9 +52,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
            return;
        }

        for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {
        for (var chunk : ListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) {

            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
            var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());

            indexWriter.put(header, entry);
@@ -63,19 +62,22 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {

    }

    private long[] getOrInsertWordIds(List<String> words) {
        long[] ids = new long[words.size()];
        int putId = 0;
    private long[] getOrInsertWordIds(String[] words, long[] meta) {
        long[] ids = new long[words.length*2];
        int putIdx = 0;

        for (int i = 0; i < words.length; i++) {
            String word = words[i];

        for (String word : words) {
            long id = lexicon.getOrInsert(word);
            if (id != DictionaryHashMap.NO_VALUE) {
                ids[putId++] = id;
                ids[putIdx++] = id;
                ids[putIdx++] = meta[i];
            }
        }

        if (putId != words.size()) {
            ids = Arrays.copyOf(ids, putId);
        if (putIdx != words.length*2) {
            ids = Arrays.copyOf(ids, putIdx);
        }
        return ids;
    }
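The widened signature turns the journal entry into an interleaved array: even slots hold lexicon word ids, odd slots hold the packed metadata for the preceding id, which is why the buffer is sized words.length*2 and truncated to putIdx when the dictionary rejects a word. A small sketch of reading that layout back (not project code, just an illustration of the convention):

// Illustrative decoding of the interleaved [wordId, metadata] layout produced above.
static void printEntries(long[] ids) {
    for (int i = 0; i + 1 < ids.length; i += 2) {
        long wordId   = ids[i];      // lexicon id
        long metadata = ids[i + 1];  // packed EdgePageWordMetadata bits
        System.out.printf("word=%d meta=0x%x%n", wordId, metadata);
    }
}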
@ -20,12 +20,14 @@ import java.nio.channels.FileChannel;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
|
||||
|
||||
public class SearchIndexConverter {
|
||||
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
|
||||
public static final int ENTRY_URL_OFFSET = 0;
|
||||
public static final int ENTRY_METADATA_OFFSET = 1;
|
||||
public static final int ENTRY_SIZE = 2;
|
||||
|
||||
private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
|
||||
public static final BTreeContext urlsBTreeContext = new BTreeContext(5, ENTRY_SIZE, ~0, 8);
|
||||
|
||||
private final long[] tmpWordsBuffer = SearchIndexJournalReader.createAdequateTempBuffer();
|
||||
|
||||
private final Path tmpFileDir;
|
||||
|
||||
@ -72,7 +74,7 @@ public class SearchIndexConverter {
|
||||
return;
|
||||
}
|
||||
|
||||
logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
|
||||
logger.info("Converting {} ({}) {} {}", block.ordinal(), block, inputFile, journalReader.fileHeader);
|
||||
|
||||
var lock = partitioner.getReadLock();
|
||||
try {
|
||||
@ -80,10 +82,10 @@ public class SearchIndexConverter {
|
||||
|
||||
var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
|
||||
|
||||
logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
|
||||
logger.info("Creating word index table {} for block {}", outputFileWords, block.ordinal());
|
||||
WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);
|
||||
|
||||
logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
|
||||
logger.info("Creating word urls table {} for block {}", outputFileUrls, block.ordinal());
|
||||
createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);
|
||||
|
||||
Files.delete(tmpUrlsFile);
|
||||
@ -111,10 +113,10 @@ public class SearchIndexConverter {
|
||||
|
||||
final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
|
||||
|
||||
for (int i = 0; i < entryData.size(); i++) {
|
||||
int wordId = (int) entryData.get(i);
|
||||
for (var record : entryData) {
|
||||
int wordId = record.wordId();
|
||||
if (wordId < 0 || wordId >= topWord) {
|
||||
logger.warn("Bad wordId {}", wordId);
|
||||
logger.warn("Bad word {}", record);
|
||||
}
|
||||
wordsTableWriter.acceptWord(wordId);
|
||||
}
|
||||
@ -138,7 +140,7 @@ public class SearchIndexConverter {
|
||||
try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
|
||||
FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {
|
||||
|
||||
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
|
||||
try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, ENTRY_SIZE * numberOfWordsTotal, 10_000_000)) {
|
||||
int[] wordWriteOffset = new int[wordOffsetsTable.length()];
|
||||
|
||||
for (var entry : journalReader) {
|
||||
@ -146,21 +148,29 @@ public class SearchIndexConverter {
|
||||
|
||||
var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);
|
||||
|
||||
for (int i = 0; i < entryData.size(); i++) {
|
||||
int wordId = (int) entryData.get(i);
|
||||
for (var record : entryData) {
|
||||
int wordId = record.wordId();
|
||||
long metadata = record.metadata();
|
||||
|
||||
if (wordId >= wordWriteOffset.length)
|
||||
if (wordId >= wordWriteOffset.length) {
|
||||
logger.warn("Overflowing wordId {}", wordId);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (wordId < 0) {
|
||||
logger.warn("Negative wordId {}", wordId);
|
||||
}
|
||||
|
||||
final long urlInternal = translateUrl(entry.docId());
|
||||
if (wordId > 0) {
|
||||
rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
|
||||
} else {
|
||||
rwf.put(wordWriteOffset[wordId]++, urlInternal);
|
||||
}
|
||||
|
||||
long offset;
|
||||
if (wordId > 0) offset = wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId];
|
||||
else offset = wordWriteOffset[wordId];
|
||||
|
||||
rwf.put(offset + ENTRY_URL_OFFSET, urlInternal);
|
||||
rwf.put(offset + ENTRY_METADATA_OFFSET, metadata);
|
||||
|
||||
wordWriteOffset[wordId] += ENTRY_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
@ -171,9 +181,9 @@ public class SearchIndexConverter {
|
||||
|
||||
try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
|
||||
if (wordOffsetsTable.length() > 0) {
|
||||
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
|
||||
var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit, ENTRY_SIZE);
|
||||
|
||||
wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
|
||||
wordOffsetsTable.forEachRange(urlTmpFileSorter::sortRange);
|
||||
|
||||
urlsTmpFileMap.force();
|
||||
} else {
|
||||
@ -187,7 +197,7 @@ public class SearchIndexConverter {
|
||||
wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
|
||||
// Note: The return value is accumulated into accumulatorIdx!
|
||||
|
||||
return writer.write(accumulatorIdx, length,
|
||||
return writer.write(accumulatorIdx, length/ENTRY_SIZE,
|
||||
slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
|
||||
});
|
||||
|
||||
|
@ -9,7 +9,6 @@ import gnu.trove.set.hash.TIntHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||
import org.slf4j.Logger;
|
||||
@ -87,8 +86,25 @@ public class SearchIndexDao {
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getStandardDomains() {
|
||||
var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
|
||||
return spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
TIntArrayList results = new TIntArrayList();
|
||||
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT ID FROM EC_DOMAIN
|
||||
WHERE INDEXED>0
|
||||
AND STATE='ACTIVE'
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY ID ASC
|
||||
""");
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
results.add(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
|
@ -110,11 +110,12 @@ public class SearchIndexPartitioner {
|
||||
return true;
|
||||
if (academiaRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
if (standardRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
if (specialDomainRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
|
||||
if (standardRanking.hasBucket(bucketId, domainId))
|
||||
return true;
|
||||
|
||||
return DYNAMIC_BUCKET_LENGTH == bucketId;
|
||||
}
|
||||
|
||||
@ -148,15 +149,15 @@ public class SearchIndexPartitioner {
|
||||
if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
|
||||
return academiaRanking.translateId(id);
|
||||
}
|
||||
if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
|
||||
return standardRanking.translateId(id);
|
||||
}
|
||||
if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
|
||||
return specialDomainRanking.translateId(id);
|
||||
}
|
||||
if (retroRanking != null) {
|
||||
return retroRanking.translateId(id);
|
||||
|
||||
// standard gets passed traight through
|
||||
if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
|
||||
return id;
|
||||
}
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
|
@ -52,7 +52,7 @@ public class SearchIndexPreconverter {
|
||||
var lock = partitioner.getReadLock();
|
||||
try {
|
||||
lock.lock();
|
||||
ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
|
||||
ByteBuffer buffer = ByteBuffer.allocateDirect(65536);
|
||||
for (var entry : indexJournalReader) {
|
||||
if (!partitioner.isGoodUrl(entry.urlId())
|
||||
|| spamDomains.contains(entry.domainId())) {
|
||||
@ -93,7 +93,7 @@ public class SearchIndexPreconverter {
|
||||
}
|
||||
|
||||
public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) {
|
||||
return shard.block == entry.header.block().id
|
||||
return shard.block == entry.header.block().ordinal()
|
||||
&& partitioner.filterUnsafe(entry.domainId(), shard.bucket);
|
||||
}
|
||||
|
||||
|
@ -23,10 +23,10 @@ public class WordIndexOffsetsTable {
|
||||
|
||||
for (int i = 1; i < table.length; i++) {
|
||||
long start = table[i-1];
|
||||
int length = (int) (table[i] - start);
|
||||
long end = table[i];
|
||||
|
||||
if (length != 0) {
|
||||
o.accept(start, length);
|
||||
if (start != end) {
|
||||
o.accept(start, end);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -58,7 +58,7 @@ public class WordIndexOffsetsTable {
|
||||
}
|
||||
|
||||
public interface OffsetTableEntryConsumer {
|
||||
void accept(long start, int length) throws IOException;
|
||||
void accept(long start, long end) throws IOException;
|
||||
}
|
||||
|
||||
public interface OffsetTableEntryFoldConsumer {
|
||||
|
@ -8,8 +8,10 @@ import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE;
|
||||
import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;
|
||||
|
||||
public class WordsTableWriter {
|
||||
@ -23,8 +25,10 @@ public class WordsTableWriter {
|
||||
}
|
||||
|
||||
public void acceptWord(int wordId) {
|
||||
for (int i = 0; i < ENTRY_SIZE; i++) {
|
||||
table.lengths().increment(wordId);
|
||||
}
|
||||
}
|
||||
|
||||
public WordIndexOffsetsTable getTable() {
|
||||
return table.offsets();
|
||||
@ -58,7 +62,7 @@ public class WordsTableWriter {
|
||||
mapSlice.put(idx++, (long)length<<32);
|
||||
mapSlice.put(idx++, 0);
|
||||
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length));
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
|
||||
}
|
||||
|
||||
for (int i = 1; i < offsetTable.length; i++) {
|
||||
@ -68,7 +72,7 @@ public class WordsTableWriter {
|
||||
mapSlice.put(idx++, (long)length << 32 | i);
|
||||
mapSlice.put(idx++, urlFileOffset);
|
||||
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length));
|
||||
urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -12,6 +12,8 @@ import org.jetbrains.annotations.NotNull;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Iterator;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.ENTRY_SIZE;
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
|
||||
import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;
|
||||
|
||||
public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
|
||||
@ -23,6 +25,10 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalRead
|
||||
private final MultimapFileLongSlice map;
|
||||
private final long committedSize;
|
||||
|
||||
public static long[] createAdequateTempBuffer() {
|
||||
return new long[MAX_LENGTH*ENTRY_SIZE];
|
||||
}
|
||||
|
||||
public SearchIndexJournalReader(MultimapFileLong map) {
|
||||
fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
|
||||
committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
|
||||
@ -92,7 +98,7 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalRead
|
||||
public IndexBlock block() {
|
||||
return header.block();
|
||||
}
|
||||
public int wordCount() { return header.entrySize(); }
|
||||
public int wordCount() { return header.entrySize() / ENTRY_SIZE; }
|
||||
|
||||
public SearchIndexJournalEntry readEntry() {
|
||||
long[] dest = new long[header.entrySize()];
|
||||
|
@ -26,7 +26,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
private RandomAccessFile raf;
|
||||
private FileChannel channel;
|
||||
|
||||
public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
|
||||
public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4;
|
||||
private final ByteBuffer byteBuffer;
|
||||
private long pos;
|
||||
|
||||
@ -83,7 +83,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
byteBuffer.clear();
|
||||
|
||||
byteBuffer.putInt(entryData.size());
|
||||
byteBuffer.putInt(header.block().id);
|
||||
byteBuffer.putInt(header.block().ordinal());
|
||||
byteBuffer.putLong(header.documentId());
|
||||
|
||||
entryData.write(byteBuffer);
|
||||
|
@@ -2,12 +2,14 @@ package nu.marginalia.wmsa.edge.index.journal.model;

import java.nio.ByteBuffer;
import java.util.Arrays;
import java.util.Iterator;

public class SearchIndexJournalEntry {
public class SearchIndexJournalEntry implements Iterable<SearchIndexJournalEntry.Record> {
    private final int size;
    private final long[] underlyingArray;

    public static final int MAX_LENGTH = 1000;
    public static final int ENTRY_SIZE = 2;

    public SearchIndexJournalEntry(long[] underlyingArray) {
        this.size = underlyingArray.length;
@@ -46,4 +48,24 @@ public class SearchIndexJournalEntry {
        return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
    }

    public Iterator<Record> iterator() {
        return new EntryIterator();
    }

    private class EntryIterator implements Iterator<Record> {
        int pos = -ENTRY_SIZE;

        public boolean hasNext() {
            return pos + ENTRY_SIZE < size;
        }

        @Override
        public Record next() {
            pos+=ENTRY_SIZE;

            return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
        }
    }

    public record Record(int wordId, long metadata) {}
}
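Making the entry Iterable<Record> lets callers consume the interleaved array as (wordId, metadata) pairs rather than stepping over raw longs two at a time. A sketch of the resulting call pattern, with made-up values:

// Illustrative use of the new Record iterator; the values are invented.
long[] raw = { 42L, 0xA1L,     // wordId 42, its packed metadata
               77L, 0x03L };   // wordId 77, its packed metadata
var entry = new SearchIndexJournalEntry(raw);

for (var record : entry) {
    System.out.println(record.wordId() + " -> " + Long.toHexString(record.metadata()));
}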
@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
|
||||
import io.prometheus.client.Gauge;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -16,7 +17,7 @@ import java.util.concurrent.locks.ReadWriteLock;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
public class KeywordLexicon implements AutoCloseable {
|
||||
private final DictionaryHashMap reverseIndex;
|
||||
private final DictionaryMap reverseIndex;
|
||||
|
||||
private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
@ -30,7 +31,7 @@ public class KeywordLexicon implements AutoCloseable {
|
||||
private final KeywordLexiconJournal journal;
|
||||
|
||||
@SneakyThrows
|
||||
public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) {
|
||||
public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryMap reverseIndexHashMap) {
|
||||
|
||||
journal = keywordLexiconJournal;
|
||||
reverseIndex = reverseIndexHashMap;
|
||||
|
@ -1,16 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.index.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor
|
||||
public class EdgeIndexSearchTerms {
|
||||
public List<Integer> includes = new ArrayList<>();
|
||||
public List<Integer> excludes = new ArrayList<>();
|
||||
|
||||
public boolean isEmpty() {
|
||||
return includes.isEmpty();
|
||||
}
|
||||
}
|
@@ -0,0 +1,32 @@
package nu.marginalia.wmsa.edge.index.model;

import java.util.EnumSet;

public enum EdgePageWordFlags {
    Title,
    Subjects,
    NamesWords,
    Site,
    SiteAdjacent,
    Simple;

    public int asBit() {
        return 1 << ordinal();
    }

    public boolean isPresent(long value) {
        return (asBit() & value) > 0;
    }

    public static EnumSet<EdgePageWordFlags> decode(long encodedValue) {
        EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class);

        for (EdgePageWordFlags f : values()) {
            if ((encodedValue & f.asBit()) > 0) {
                ret.add(f);
            }
        }

        return ret;
    }
}
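Each flag occupies the bit at its ordinal position, so Title is bit 0, Subjects bit 1, and so on up to Simple at bit 5. A short worked example of packing and unpacking a flag set with this enum (assuming java.util.EnumSet is imported):

// Worked example: Title (bit 0) and Site (bit 3) set, i.e. 0b1001 = 9.
long encoded = EdgePageWordFlags.Title.asBit() | EdgePageWordFlags.Site.asBit();

assert EdgePageWordFlags.Title.isPresent(encoded);
assert !EdgePageWordFlags.Subjects.isPresent(encoded);
assert EdgePageWordFlags.decode(encoded).equals(EnumSet.of(EdgePageWordFlags.Title, EdgePageWordFlags.Site));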
@@ -0,0 +1,90 @@
package nu.marginalia.wmsa.edge.index.model;

import nu.marginalia.util.BrailleBlockPunchCards;

import java.util.EnumSet;

import static java.lang.Math.max;
import static java.lang.Math.min;

public record EdgePageWordMetadata(int tfIdf,
                                   int positions,
                                   int quality,
                                   int count,
                                   EnumSet<EdgePageWordFlags> flags) {

    // If flags are moved from the least significant end of
    // this struct, then EntrySourceFromBTree will break.

    public static final long COUNT_MASK = 0xFL;
    public static final int COUNT_SHIFT = 8;

    public static final long QUALITY_MASK = 0xFL;
    public static final int QUALITY_SHIFT = 12;

    public static final long TF_IDF_MASK = 0xFFFFL;
    public static final int TF_IDF_SHIFT = 16;

    public static final int POSITIONS_SHIFT = 32;

    public EdgePageWordMetadata(long value) {
        this(
                (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
                (int)(value >>> POSITIONS_SHIFT),
                (int)((value >>> QUALITY_SHIFT) & QUALITY_MASK),
                (int)((value >>> COUNT_SHIFT) & COUNT_MASK),
                EdgePageWordFlags.decode(value)
        );
    }

    public static int decodeQuality(long encoded) {
        return (int)((encoded >>> QUALITY_SHIFT) & QUALITY_MASK);
    }

    public static boolean hasFlags(long encoded, long metadataBitMask) {
        return (encoded & metadataBitMask) == encoded;
    }

    public String toString() {
        StringBuilder sb = new StringBuilder(getClass().getSimpleName());
        sb.append('[')
                .append("tfidf=").append(tfIdf).append(", ")
                .append("quality=").append(quality).append(", ")
                .append("count=").append(count).append(", ")
                .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
        sb.append(", flags=").append(flags).append(']');
        return sb.toString();
    }

    /* Encoded in a 64 bit long as
       0-8   flags
       8-12  count,
       12-16 quality,
       16-32 tf-idf [0, 65536]
       32-64 position mask
     */
    public long encode() {
        long ret = 0;

        for (var flag : flags) {
            ret |= flag.asBit();
        }

        ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
        ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
        ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT;
        ret |= ((long)(positions)) << POSITIONS_SHIFT;

        return ret;
    }

    public boolean isEmpty() {
        return count == 0 && positions == 0 && flags.isEmpty() && tfIdf == 0;
    }

    public static long emptyValue() {
        return 0L;
    }

}
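Following the layout comment (flags in bits 0-7, count in 8-11, quality in 12-15, tf-idf in 16-31, positions in 32-63), a concrete value can be traced through encode() and back; the numbers below are chosen only for readability:

// Worked example of the packed layout.
var meta = new EdgePageWordMetadata(
        100,                                   // tfIdf     -> bits 16-31 (0x0064)
        0b1011,                                // positions -> bits 32-63 (0xB)
        5,                                     // quality   -> bits 12-15
        3,                                     // count     -> bits 8-11
        EnumSet.of(EdgePageWordFlags.Title));  // flags     -> bits 0-7 (0x01)

long packed = meta.encode();                   // 0x0000000B_00645301
assert packed == 0x0000000B00645301L;
assert new EdgePageWordMetadata(packed).equals(meta);   // the value round-trips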
@ -1,20 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.index.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
@ToString
|
||||
public class EdgePutWordsRequest {
|
||||
public EdgeId<EdgeDomain> domainId;
|
||||
public EdgeId<EdgeUrl> urlId;
|
||||
public double quality;
|
||||
|
||||
public EdgePageWordSet wordSet;
|
||||
private int index = 0;
|
||||
}
|
@@ -1,47 +1,35 @@
package nu.marginalia.wmsa.edge.index.model;

public enum IndexBlock {
    TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
    Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),
    Title(IndexBlockType.PAGE_DATA),
    Meta(IndexBlockType.PAGE_DATA),

    Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),
    Words_1(IndexBlockType.PAGE_DATA),
    Words_2(IndexBlockType.PAGE_DATA),
    Words_4(IndexBlockType.PAGE_DATA),
    Words_8(IndexBlockType.PAGE_DATA),
    Words_16Plus(IndexBlockType.PAGE_DATA),

    Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
    NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),
    Link(IndexBlockType.QUALITY_SIGNAL),
    Site(IndexBlockType.QUALITY_SIGNAL),

    Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
    Meta(IndexBlockType.PAGE_DATA, 6, 7),
    Artifacts(IndexBlockType.PAGE_DATA),

    Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
    Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
    Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),

    Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
    Words_2(IndexBlockType.PAGE_DATA,11, 3.5),
    Words_4(IndexBlockType.PAGE_DATA,12, 4.0),
    Words_8(IndexBlockType.PAGE_DATA,13, 4.5),
    Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0),

    Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
    Tfidf_High(IndexBlockType.TRANSIENT),
    Subjects(IndexBlockType.TRANSIENT)
    ;

    public final IndexBlockType type;
    public final int id;
    public final double sortOrder;

    IndexBlock(IndexBlockType type, int id, double sortOrder) {
    IndexBlock(IndexBlockType type) {
        this.type = type;
        this.sortOrder = sortOrder;
        this.id = id;
    }

    // This is kind of a hot method, and Enum.values() allocates a new
    // array each call.
    private static final IndexBlock[] values = IndexBlock.values();
    public static IndexBlock byId(int id) {
        for (IndexBlock block : values()) {
            if (id == block.id) {
                return block;
            }
        }
        throw new IllegalArgumentException("Bad block id");
        return values[id];
    }
}

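With the explicit id and sortOrder fields gone, a block's id collapses to its ordinal, and byId becomes an O(1) lookup into the cached values array instead of a linear scan that could throw. In effect:

// Illustrative: byId(n) is now equivalent to a cached values()[n] lookup.
IndexBlock b = IndexBlock.byId(IndexBlock.Meta.ordinal());
assert b == IndexBlock.Meta;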
@@ -1,7 +1,10 @@
package nu.marginalia.wmsa.edge.index.model;

public enum IndexBlockType {
    /** This block is only used for joins */
    QUALITY_SIGNAL,
    TF_IDF,
    PAGE_DATA
    /** This block contains page keywords */
    PAGE_DATA,
    /** This block is only used for generation */
    TRANSIENT
}
@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.index.reader;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.btree.BTreeReader;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -17,7 +16,6 @@ import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wo
|
||||
public class IndexWordsTable implements AutoCloseable {
|
||||
protected final MultimapFileLong words;
|
||||
protected final BTreeReader reader;
|
||||
protected final BTreeHeader header;
|
||||
protected final int HEADER_OFFSET = 1;
|
||||
final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@ -26,8 +24,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
public IndexWordsTable(MultimapFileLong words) {
|
||||
this.words = words;
|
||||
|
||||
reader = new BTreeReader(words, wordsBTreeContext);
|
||||
header = reader.getHeader(HEADER_OFFSET);
|
||||
reader = new BTreeReader(words, wordsBTreeContext, HEADER_OFFSET);
|
||||
|
||||
madvise();
|
||||
}
|
||||
@ -49,7 +46,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
}
|
||||
|
||||
public long positionForWord(int wordId) {
|
||||
long offset = reader.findEntry(header, wordId);
|
||||
long offset = reader.findEntry(wordId);
|
||||
|
||||
if (offset < 0) {
|
||||
return -1L;
|
||||
@ -60,7 +57,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
|
||||
public int wordLength(int wordId) {
|
||||
|
||||
long offset = reader.findEntry(header, wordId);
|
||||
long offset = reader.findEntry(wordId);
|
||||
if (offset < 0) {
|
||||
return -1;
|
||||
}
|
||||
@ -72,7 +69,7 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
words.advice(NativeIO.Advice.Random);
|
||||
words.advice0(NativeIO.Advice.WillNeed);
|
||||
|
||||
var h = reader.getHeader(HEADER_OFFSET);
|
||||
var h = reader.getHeader();
|
||||
int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs());
|
||||
|
||||
words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
|
||||
@ -80,8 +77,8 @@ public class IndexWordsTable implements AutoCloseable {
|
||||
}
|
||||
|
||||
public void forEachWordsOffset(LongConsumer offsetConsumer) {
|
||||
int n = header.numEntries();
|
||||
long offset = header.dataOffsetLongs();
|
||||
int n = reader.numEntries();
|
||||
long offset = reader.getHeader().dataOffsetLongs();
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
try {
|
||||
|
@ -5,21 +5,13 @@ import com.google.inject.name.Named;
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import nu.marginalia.util.btree.BTreeReader;
|
||||
import nu.marginalia.util.btree.CachingBTreeReader;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class SearchIndex implements AutoCloseable {

@ -27,8 +19,6 @@ public class SearchIndex implements AutoCloseable {
private final IndexWordsTable words;
public final String name;
private final RandomAccessFile wordsFile;
private final BTreeReader bTreeReader;
private final CachingBTreeReader cachingBTreeReader;

private final Logger logger;

@ -49,16 +39,13 @@ public class SearchIndex implements AutoCloseable {
urls = MultimapFileLong.forReading(inUrls.toPath());
words = IndexWordsTable.ofFile(wordsFile);

bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
cachingBTreeReader = new CachingBTreeReader(urls, SearchIndexConverter.urlsBTreeContext);

Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader));
Schedulers.io().scheduleDirect(() -> madvise(urls));
}

private void madvise(MultimapFileLong urls, BTreeReader reader) {
private void madvise(MultimapFileLong urls) {

words.forEachWordsOffset(offset -> {
var h = reader.getHeader(offset);
var h = BTreeReader.createHeader(urls, offset);
long length = h.dataOffsetLongs() - h.indexOffsetLongs();

urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512);
@ -70,174 +57,16 @@ public class SearchIndex implements AutoCloseable {
}


public long numUrls(IndexQueryCachePool pool, int wordId) {
public long numUrls(int wordId) {
int length = words.wordLength(wordId);
if (length < 0) return 0;
if (length > 0) return length;

return rangeForWord(pool, wordId).numEntries();
}

public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
IndexBTreeRange range = pool.getRange(words, wordId);

if (range == null) {
range = new IndexBTreeRange(words.positionForWord(wordId));
pool.cacheRange(words, wordId, range);
}

return range;
}

public IndexBTreeRange rangeForWord(int wordId) {
return new IndexBTreeRange(words.positionForWord(wordId));
}

public class IndexBTreeRange {
public final long dataOffset;
private BTreeHeader header;
public IndexBTreeRange(long dataOffset) {
this.dataOffset = dataOffset;
}

public LongStream stream(int bufferSize) {
if (dataOffset < 0) {
return LongStream.empty();
}
if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}

long urlOffset = header.dataOffsetLongs();
long endOffset = header.dataOffsetLongs() + header.numEntries();
int stepSize = Math.min(bufferSize, header.numEntries());

long[] buffer = new long[stepSize];

return LongStream
.iterate(urlOffset, i -> i< endOffset, i->i+stepSize)
.flatMap(pos -> {
int sz = (int)(Math.min(pos+stepSize, endOffset) - pos);
urls.read(buffer, sz, pos);
return Arrays.stream(buffer, 0, sz);
});
}

public EntrySource asEntrySource() {
return new AsEntrySource();
}

public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
return new AsExcludeQueryFilterStep(pool);
}


public LongStream stream() {
return stream(1024);
}

public boolean isPresent() {
return dataOffset >= 0;
}

public long numEntries() {
if (header != null) {
return header.numEntries();
}
else if (dataOffset < 0) return 0L;
else {
header = bTreeReader.getHeader(dataOffset);
return header.numEntries();
}
}

public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
if (dataOffset < 0) return false;

return cachingBTreeReader.findEntry(cache, url) >= 0;
}

public boolean hasUrl(IndexQueryCachePool pool, long url) {
if (dataOffset < 0)
return false;

CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);

return cachingBTreeReader.findEntry(cache, url) >= 0;
}

public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
if (dataOffset < 0)
return null;

if (header == null) {
header = cachingBTreeReader.getHeader(dataOffset);
}

return cachingBTreeReader.prepareCache(header);
}

class AsEntrySource implements EntrySource {
long pos;
final long endOffset;

public SearchIndex getIndex() {
return SearchIndex.this;
};

public AsEntrySource() {
if (dataOffset <= 0) {
pos = -1;
endOffset = -1;
return;
}

if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}

pos = header.dataOffsetLongs();
endOffset = header.dataOffsetLongs() + header.numEntries();
}


@Override
public int read(long[] buffer, int n) {
if (pos >= endOffset) {
return 0;
}

int rb = Math.min(n, (int)(endOffset - pos));
urls.read(buffer, rb, pos);
pos += rb;
return rb;
}
}

class AsExcludeQueryFilterStep implements QueryFilterStepIf {
private final CachingBTreeReader.BTreeCachedIndex cache;

public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
}

public SearchIndex getIndex() {
return SearchIndex.this;
};
public double cost() {
return cache.getIndexedDataSize();
}

@Override
public boolean test(long value) {
return !hasUrl(cache, value);
}

public String describe() {
return "Exclude["+name+"]";
}
return rangeForWord(wordId).numEntries();
}

public SearchIndexURLRange rangeForWord(int wordId) {
return new SearchIndexURLRange(urls, words.positionForWord(wordId));
}

@Override
@ -5,7 +5,6 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -22,31 +21,14 @@ public class SearchIndexReader implements AutoCloseable {
private final IndexDomainQueryFactory domainQueryFactory;
private final Logger logger = LoggerFactory.getLogger(getClass());

private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
IndexBlock.Title,
IndexBlock.Tfidf_Top,
IndexBlock.Tfidf_Middle,
IndexBlock.Tfidf_Lower,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
IndexBlock.Words_16Plus,
};

@Inject
public SearchIndexReader(
EnumMap<IndexBlock, SearchIndex> indices) {
this.indices = indices;

var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
var midIndex = indices.get(IndexBlock.Tfidf_Middle);
var topIndex = indices.get(IndexBlock.Tfidf_Top);
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var siteIndex = indices.get(IndexBlock.Site);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Subjects);

var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
@ -57,7 +39,7 @@ public class SearchIndexReader implements AutoCloseable {

queryBuilders = new EnumMap<>(IndexBlock.class);

List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, words1, words2, words4, words8, words16);

queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices));
@ -66,7 +48,7 @@ public class SearchIndexReader implements AutoCloseable {
queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices));

domainQueryFactory = new IndexDomainQueryFactory(siteIndex, listOfNonNulls(topicIndex));
domainQueryFactory = new IndexDomainQueryFactory(indices.get(IndexBlock.Words_1));
}

@SafeVarargs
@ -75,17 +57,31 @@ public class SearchIndexReader implements AutoCloseable {
}


public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
public IndexQueryFactory.IndexQueryBuilder findWord(IndexBlock block, Integer quality, int wordId) {
var builder = queryBuilders.get(block);

if (builder == null)
return null;

return builder.buildQuery(cachePool, wordId);
if (quality == null) {
return builder.buildQuery(wordId);
}
else {
return builder.buildQuery(quality, wordId);
}
}

public IndexQuery findDomain(IndexQueryCachePool cachePool, int wordId) {
return domainQueryFactory.buildQuery(cachePool, wordId);
public IndexQueryFactory.IndexQueryBuilder findWordForDomainList(IndexBlock block, List<Integer> domains, int wordId) {
var builder = queryBuilders.get(block);

if (builder == null)
return null;

return builder.buildQuery(domains, wordId);
}

public IndexQuery findDomain(int wordId) {
return domainQueryFactory.buildQuery(wordId);
}

@Override
@ -96,7 +92,7 @@ public class SearchIndexReader implements AutoCloseable {
}

@SneakyThrows
public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
public long numHits(IndexBlock block, int word) {
IndexQueryFactory builder = queryBuilders.get(block);

if (builder == null)
@ -104,31 +100,18 @@ public class SearchIndexReader implements AutoCloseable {

long hits = 0;
for (var index : builder.getIndicies()) {
hits += index.numUrls(pool, word);
hits += index.numUrls(word);
}
return hits;
}

public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
for (var block : indicesBySearchOrder) {
var index = indices.get(block);

if (null == index) {
continue;
}

if (cachePool.isUrlPresent(index, searchTerm, urlId))
return block;

}

return IndexBlock.Words_16Plus;
}

public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
final var index = indices.get(block);
if (null == index) return false;
if (null == index) {
return new long[ids.length];
}

return cachePool.isUrlPresent(index, searchTerm, urlId);
return indices.get(block).rangeForWord(termId).getMetadata(ids);
}
}
@ -0,0 +1,100 @@
package nu.marginalia.wmsa.edge.index.reader;

import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.btree.BTreeReader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromBTree;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromMapRange;

import javax.annotation.Nullable;

import static nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags.*;

public class SearchIndexURLRange {
public final long dataOffset;
private final MultimapFileLong urlsFile;

@Nullable
private final BTreeReader reader;

public SearchIndexURLRange(MultimapFileLong urlsFile, long dataOffset) {
this.dataOffset = dataOffset;
this.urlsFile = urlsFile;

if (dataOffset >= 0) {
this.reader = new BTreeReader(urlsFile, SearchIndexConverter.urlsBTreeContext, dataOffset);
} else {
this.reader = null;
}
}

public EntrySource asPrefixSource(long prefix, long prefixNext) {
if (reader == null)
return new EmptyEntrySource();

LongLongImmutablePair startAndEnd = reader.getRangeForPrefix(prefix, prefixNext);

if (startAndEnd.firstLong() == startAndEnd.secondLong()) {
return new EmptyEntrySource();
}

return new EntrySourceFromMapRange(urlsFile, startAndEnd.firstLong(), startAndEnd.secondLong());
}

public EntrySource asEntrySource() {
return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, null);
}
public EntrySource asQualityLimitingEntrySource(int limit) {
return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, limit);
}
public EntrySource asDomainEntrySource() {
return new EntrySourceFromBTree(reader, Subjects.asBit() | Site.asBit() | Title.asBit(), null);
}

public boolean isPresent() {
return dataOffset >= 0;
}

public long numEntries() {
if (reader == null)
return 0L;

return reader.numEntries();
}

public void retainUrls(BTreeQueryBuffer buffer) {
if (reader != null)
reader.retainEntries(buffer);
}

public void rejectUrls(BTreeQueryBuffer buffer) {
if (reader != null)
reader.rejectEntries(buffer);
}

public boolean hasUrl(long url) {
if (reader == null)
return false;

return reader.findEntry(url) >= 0;
}


public long[] getMetadata(long[] urls) {
if (reader == null) {
return new long[urls.length];
}

return reader.queryData(urls, 1);
}

@Override
public String toString() {
return String.format("BTreeRange(@" + dataOffset + ", size = " + numEntries() + ")");
}

}
@ -0,0 +1,111 @@
package nu.marginalia.wmsa.edge.index.svc;

import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Histogram;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;

import java.util.OptionalInt;

import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static spark.Spark.halt;

@Singleton
public class EdgeIndexDomainQueryService {

private final Logger logger = LoggerFactory.getLogger(getClass());

private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

private final Gson gson = GsonFactory.get();

private final SearchIndexes indexes;

@Inject
public EdgeIndexDomainQueryService(SearchIndexes indexes) {
this.indexes = indexes;
}

public Object searchDomain(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}

String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

try {
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
}

public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {

final OptionalInt wordId = lookUpWord(specsSet.keyword);
final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();

final IndexSearchBudget budget = new IndexSearchBudget(50);

if (wordId.isEmpty()) {
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

BTreeQueryBuffer buffer = new BTreeQueryBuffer(512);

for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {

final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);
var query = indexes.getBucket(bucket).getDomainQuery(wordId.getAsInt(), localFilter);

while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
query.getMoreResults(buffer);

for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) {
long result = buffer.data[i];
if (localFilter.test(result)) {
urlIds.add((int) (result & 0xFFFF_FFFFL));
}
}
}
}

return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);
}

}
@ -5,6 +5,7 @@ import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
@ -21,7 +22,6 @@ import spark.Request;
import spark.Response;

import java.util.Arrays;
import java.util.List;

@Singleton
public class EdgeIndexLexiconService {
@ -35,6 +35,11 @@ public class EdgeIndexLexiconService {
this.keywordLexicon = servicesFactory.getKeywordLexicon();
}

public EdgeIndexLexiconService(SearchIndexes indexes, KeywordLexicon lexicon) {
this.indexes = indexes;
this.keywordLexicon = lexicon;
}

public Object getWordId(Request request, Response response) {
final String word = request.splat()[0];

@ -73,31 +78,37 @@ public class EdgeIndexLexiconService {
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);

SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
IndexBlock block = IndexBlock.values()[words.getIndex()];

for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
var wordArray = words.getWordsList().toArray(String[]::new);
var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray();

var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
DocumentKeywords documentKeywords = new DocumentKeywords(block, wordArray, metaArray);
for (var chunk : ListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

indexWriter.put(header, entry);
}
}

private long[] getOrInsertWordIds(List<String> words) {
long[] ids = new long[words.size()];
private long[] getOrInsertWordIds(String[] words, long[] meta) {
long[] ids = new long[words.length*2];
int putIdx = 0;

for (String word : words) {
for (int i = 0; i < words.length; i++) {
String word = words[i];

long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
ids[putIdx++] = meta[i];
}
}

if (putIdx != words.size()) {
if (putIdx != words.length*2) {
ids = Arrays.copyOf(ids, putIdx);
}
return ids;
@ -7,22 +7,23 @@ import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntComparator;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongAVLTreeSet;
import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,7 +37,6 @@ import java.util.function.LongPredicate;
import java.util.stream.Collectors;

import static java.util.Comparator.comparing;
import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static spark.Spark.halt;

@Singleton
@ -50,7 +50,6 @@ public class EdgeIndexQueryService {

private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register();
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();
private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

private final Gson gson = GsonFactory.get();

@ -61,30 +60,6 @@ public class EdgeIndexQueryService {
this.indexes = indexes;
}

public Object searchDomain(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}

String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);

try {
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
}

public Object search(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
@ -94,6 +69,7 @@ public class EdgeIndexQueryService {
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);


try {
return wmsa_edge_index_query_time.time(() -> query(specsSet));
}
@ -117,51 +93,20 @@ public class EdgeIndexQueryService {

wmsa_edge_index_query_cost.set(searchQuery.getDataCost());

if (!searchQuery.hasTimeLeft()) {
wmsa_edge_index_query_timeouts.inc();
}

return new EdgeSearchResultSet(results);
}

public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {

final OptionalInt wordId = lookUpWord(specsSet.keyword);

final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();

final IndexQueryCachePool pool = new IndexQueryCachePool();
final IndexSearchBudget budget = new IndexSearchBudget(50);

if (wordId.isEmpty()) {

return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {

final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);

var query = indexes.getBucket(bucket).getDomainQuery(pool, wordId.getAsInt(), localFilter);
long[] buffer = new long[512];

while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
int cnt = query.getMoreResults(buffer, budget);
for (int i = 0; i < cnt && urlIds.size() < specsSet.maxResults; i++) {
long result = buffer[i];
if (localFilter.test(result)) {
urlIds.add((int) (result & 0xFFFF_FFFFL));
}
}
}
}

return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}

private class SearchQuery {
private final int fetchSize;
private final TIntHashSet seenResults;
private final EdgeSearchSpecification specsSet;
private final IndexSearchBudget budget;
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();

private final Integer qualityLimit;
private final Integer rankLimit;
private long dataCost = 0;

public SearchQuery(EdgeSearchSpecification specsSet) {
@ -169,6 +114,8 @@ public class EdgeIndexQueryService {
this.budget = new IndexSearchBudget(specsSet.timeoutMs);
this.fetchSize = specsSet.fetchSize;
this.seenResults = new TIntHashSet(fetchSize, 0.5f);
this.qualityLimit = specsSet.quality;
this.rankLimit = specsSet.rank;
}

private List<EdgeSearchResultItem> execute() {
@ -178,22 +125,18 @@ public class EdgeIndexQueryService {
results.addAll(performSearch(sq));
}


final SearchTermEvaluator evaluator = new SearchTermEvaluator(specsSet, results);
for (var result : results) {
addResultScores(result);
evaluator.addResultScores(result);
}

if (!budget.hasTimeLeft()) {
wmsa_edge_index_query_timeouts.inc();
return createResultList(results);
}

private List<EdgeSearchResultItem> createResultList(Set<EdgeSearchResultItem> results) {

var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);

if (WmsaHome.isDebug()) {
cachePool.printSummary(logger);
}
cachePool.clear();

List<EdgeSearchResultItem> resultList = results.stream()
.sorted(
comparing(EdgeSearchResultItem::getScore)
@ -204,6 +147,9 @@ public class EdgeIndexQueryService {
.collect(Collectors.toList());

if (resultList.size() > specsSet.getLimitTotal()) {
// This can't be made a stream limit() operation because we need domainCountFilter
// to run over the entire list to provide accurate statistics

resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
}

@ -219,16 +165,20 @@ public class EdgeIndexQueryService {
{

final List<EdgeSearchResultItem> results = new ArrayList<>(fetchSize);
final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
final SearchTerms searchTerms = getSearchTerms(sq);

if (searchTerms.isEmpty())
if (searchTerms.isEmpty()) {
return Collections.emptyList();
}

final BTreeQueryBuffer buffer = new BTreeQueryBuffer(fetchSize);

for (int indexBucket : specsSet.buckets) {
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);

if (!budget.hasTimeLeft()) {
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}",
indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
continue;

}
@ -237,20 +187,22 @@ public class EdgeIndexQueryService {
break;
}

IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
long[] buf = new long[fetchSize];
IndexQueryParams queryParams = new IndexQueryParams(sq.block, searchTerms, qualityLimit, rankLimit, specsSet.domains);

IndexQuery query = getQuery(indexBucket, localFilter::filterRawValue, queryParams);

while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
int cnt = query.getMoreResults(buf, budget);
buffer.reset();
query.getMoreResults(buffer);

for (int i = 0; i < cnt && results.size() < fetchSize; i++) {
final long id = buf[i];
for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) {
final long id = buffer.data[i];

if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
continue;
}

results.add(new EdgeSearchResultItem(indexBucket, id));
results.add(new EdgeSearchResultItem(indexBucket, sq.block, id));
}
}

@ -261,40 +213,127 @@ public class EdgeIndexQueryService {
return results;
}

private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
private IndexQuery getQuery(int bucket, LongPredicate filter, IndexQueryParams params) {

if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return new IndexQuery(Collections.emptyList());
}

return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
return indexes.getBucket(bucket).getQuery(filter, params);
}

private void addResultScores(EdgeSearchResultItem searchResult) {
public boolean hasTimeLeft() {
return budget.hasTimeLeft();
}

private record IndexAndBucket(IndexBlock block, int bucket) {}

public long getDataCost() {
return dataCost;
}

record ResultTerm (int bucket, int termId, long combinedUrlId) {}
}

public class SearchTermEvaluator {
private static final EdgePageWordMetadata blankMetadata = new EdgePageWordMetadata(EdgePageWordMetadata.emptyValue());

private final Map<SearchQuery.ResultTerm, EdgePageWordMetadata> termData = new HashMap<>(16);

private final List<List<String>> searchTermVariants;

public SearchTermEvaluator(EdgeSearchSpecification specsSet, Set<EdgeSearchResultItem> results) {
this.searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();

final int[] termIdsAll = getIncludeTermIds(specsSet);

Map<SearchQuery.IndexAndBucket, LongAVLTreeSet> resultIdsByBucket = new HashMap<>(7);

for (int termId : termIdsAll) {

for (var result: results) {
resultIdsByBucket
.computeIfAbsent(new SearchQuery.IndexAndBucket(result.block, result.bucketId),
id -> new LongAVLTreeSet())
.add(result.combinedId);
}

resultIdsByBucket.forEach((indexAndBucket, resultIds) ->
loadMetadata(termId, indexAndBucket.bucket, indexAndBucket.block, resultIds));

resultIdsByBucket.clear();
}
}

private int[] getIncludeTermIds(EdgeSearchSpecification specsSet) {

final var reader = Objects.requireNonNull(indexes.getLexiconReader());

List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
final List<String> terms = specsSet.allIncludeSearchTerms();
final IntList ret = new IntArrayList(terms.size());

// Memoize calls to getTermData, as they're somewhat expensive and highly redundant
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
for (var term : terms) {
int id = reader.get(term);

if (id >= 0)
ret.add(id);
}

return ret.toIntArray();
}

private void loadMetadata(int termId, int bucket, IndexBlock indexBlock,
LongAVLTreeSet docIdsMissingMetadata)
{
EdgeIndexBucket index = indexes.getBucket(bucket);

if (docIdsMissingMetadata.isEmpty())
return;


long[] ids = docIdsMissingMetadata.toLongArray();
long[] metadata = index.getMetadata(indexBlock, termId, ids);

for (int i = 0; i < metadata.length; i++) {
if (metadata[i] == 0L)
continue;

termData.put(
new SearchQuery.ResultTerm(bucket, termId, ids[i]),
new EdgePageWordMetadata(metadata[i])
);

docIdsMissingMetadata.remove(ids[i]);
}
}

public void addResultScores(EdgeSearchResultItem searchResult) {
final var reader = Objects.requireNonNull(indexes.getLexiconReader());

double bestScore = 0;

for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
double setScore = 0;
int setSize = 0;
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
var termList = searchTermVariants.get(searchTermListIdx);

for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
String searchTerm = termList.get(termIdx);

final int termId = reader.get(searchTerm);

ResultTermData data = termMetadata.computeIfAbsent(
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
var key = new SearchQuery.ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId());
var metadata = termData.getOrDefault(key, blankMetadata);

EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(searchTermListIdx, searchTerm, metadata);

var score = data.asScore(searchTermListIdx, searchTerm);
searchResult.scores.add(score);
setScore += score.value();
setScore += score.termValue();
if (termIdx == 0) {
setScore += score.documentValue();
}

setSize++;
}
bestScore = Math.min(bestScore, setScore/setSize);
@ -303,64 +342,27 @@ public class EdgeIndexQueryService {
searchResult.setScore(bestScore);
}

private ResultTermData getTermData(ResultTerm resultTerm) {
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
final int termId = resultTerm.termId;
final long combinedUrlId = resultTerm.combinedUrlId;

return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
);
}

public long getDataCost() {
return dataCost;
}

record ResultTerm (int bucket, int termId, long combinedUrlId) {}
record ResultTermData (IndexBlock index,
boolean title,
boolean link,
boolean site,
boolean subject,
boolean name,
boolean high,
boolean mid,
boolean low
) {
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
}
}
}


private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
final List<Integer> excludes = new ArrayList<>();
final List<Integer> includes = new ArrayList<>();
private SearchTerms getSearchTerms(EdgeSearchSubquery request) {
final IntList excludes = new IntArrayList();
final IntList includes = new IntArrayList();

for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
return new SearchTerms();
}
includes.add(word.getAsInt());
}


for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + advice);
return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
return new SearchTerms();
}
includes.add(word.getAsInt());
}
@ -369,7 +371,26 @@ public class EdgeIndexQueryService {
lookUpWord(exclude).ifPresent(excludes::add);
}

return new EdgeIndexSearchTerms(includes, excludes);
return new SearchTerms(includes, excludes);
}

public record SearchTerms(IntList includes, IntList excludes) {
public SearchTerms() {
this(IntList.of(), IntList.of());
}

public boolean isEmpty() {
return includes.isEmpty();
}

public int[] sortedDistinctIncludes(IntComparator comparator) {
if (includes.isEmpty())
return includes.toIntArray();

IntList list = new IntArrayList(new IntOpenHashSet(includes));
list.sort(comparator);
return list.toIntArray();
}
}
Some files were not shown because too many files have changed in this diff