mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
October Release (#118)
Co-authored-by: vlofgren <vlofgren@gmail.com>
Co-authored-by: vlofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/118

This commit is contained in:
parent 9a7d052c43
commit df49ccbe59
@@ -175,7 +175,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/");
 
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("frontpage"));
     }
@@ -249,7 +249,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/search?query=browse:wikipedia.local");
 
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("browse"));
     }
@@ -259,7 +259,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/search?query=define:adiabatic");
 
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("define"));
     }
@@ -269,7 +269,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         driver.get("http://proxyNginx/search?query=3%2B3");
 
         System.out.println(driver.getTitle());
-        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
+//        System.out.println(driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML"));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("eval"));
     }
313  marginalia_nu/src/jmh/java/nu/marginalia/BitSetTest.java  (new file)
@@ -0,0 +1,313 @@
package nu.marginalia;

import nu.marginalia.util.AndCardIntSet;
import org.openjdk.jmh.annotations.*;
import org.roaringbitmap.RoaringBitmap;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;

public class BitSetTest {
    @org.openjdk.jmh.annotations.State(Scope.Benchmark)
    public static class State {
        List<RoaringBitmap> roar = new ArrayList<>();
        List<AndCardIntSet> acbs = new ArrayList<>();

        List<RoaringBitmap> roarLow = new ArrayList<>();
        List<RoaringBitmap> roarHigh = new ArrayList<>();

        List<AndCardIntSet> acbsLow = new ArrayList<>();
        List<AndCardIntSet> acbsHigh = new ArrayList<>();

        @Setup(Level.Trial)
        public void setUp() {
            var rand = new Random();

            for (int i = 0; i < 100; i++) {
                int card = 1 + rand.nextInt(10);

                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                    cbs.add(val);
                }
                acbsLow.add(cbs);
                roarLow.add(rb);
            }

            for (int i = 0; i < 10; i++) {
                int card = 1 + rand.nextInt(10000, 20000);

                var rb = new RoaringBitmap();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                }
                acbsHigh.add(AndCardIntSet.of(rb));
                roarHigh.add(rb);
            }

            for (int i = 0; i < 100000; i++) {
                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                int val = rand.nextInt(1_000_000);
                rb.add(val);
                cbs.add(val);

                acbs.add(cbs);
                roar.add(rb);
            }

            for (int i = 0; i < 10000; i++) {
                int card = 1 + rand.nextInt(10);

                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                    cbs.add(val);
                }
                acbs.add(cbs);
                roar.add(rb);
            }
            for (int i = 0; i < 1000; i++) {
                int card = 1 + rand.nextInt(100);

                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                    cbs.add(val);
                }
                acbs.add(cbs);
                roar.add(rb);
            }
            for (int i = 0; i < 100; i++) {
                int card = 1 + rand.nextInt(1000);

                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                    cbs.add(val);
                }
                acbs.add(cbs);
                roar.add(rb);
            }
            for (int i = 0; i < 100; i++) {
                int card = 1 + rand.nextInt(10000);

                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                    cbs.add(val);
                }
                acbs.add(cbs);
                roar.add(rb);
            }

            for (int i = 0; i < 2; i++) {
                int card = 1 + rand.nextInt(100000);

                var rb = new RoaringBitmap();
                var cbs = new AndCardIntSet();

                for (int j = 0; j < card; j++) {
                    int val = rand.nextInt(1_000_000);
                    rb.add(val);
                    cbs.add(val);
                }
                acbs.add(cbs);
                roar.add(rb);
            }
            Collections.shuffle(acbs);
            Collections.shuffle(roar);
        }
    }

//
//    @Benchmark
//    @BenchmarkMode(Mode.Throughput)
//    @Fork(value = 5, warmups = 5)
//    public Object roaringCard(State state) {
//        long val = 0;
//
//        for (int i = 0; i < state.roar.size(); i++) {
//            for (int j = i+1; j < state.roar.size(); j++) {
//                val += RoaringBitmap.andCardinality(state.roar.get(i), state.roar.get(j));
//            }
//        }
//
//        return val;
//    }
//    @Benchmark
//    @BenchmarkMode(Mode.Throughput)
//    @Fork(value = 2, warmups = 2)
//    public Object roaringCardNorm(State state) {
//        long val = 0;
//
//        for (int i = 0; i < state.roar.size()/1000; i++) {
//            for (int j = i+1; j < state.roar.size(); j++) {
//
//                var a = state.roar.get(i);
//                var b = state.roar.get(j);
//                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
//            }
//        }
//
//        return val;
//    }
//    @Benchmark
//    @BenchmarkMode(Mode.Throughput)
//    @Fork(value = 5, warmups = 5)
//    public Object cbsCard(State state) {
//        long val = 0;
//
//        for (int i = 0; i < state.roar.size(); i++) {
//            for (int j = i+1; j < state.roar.size(); j++) {
//                val += AndCardIntSet.andCardinality(state.acbs.get(i), state.acbs.get(j));
//            }
//        }
//
//        return val;
//    }
//
//    @Benchmark
//    @BenchmarkMode(Mode.Throughput)
//    @Fork(value = 1, warmups = 1)
//    public Object cbsCardNorm(State state) {
//        double val = 0;
//
//        for (int i = 0; i < state.roar.size()/1000; i++) {
//            for (int j = i+1; j < state.roar.size(); j++) {
//                var a = state.acbs.get(i);
//                var b = state.acbs.get(j);
//                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.cardinality()*b.cardinality()));
//            }
//        }
//
//        return val;
//    }

    @Benchmark
    @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    public Object cbsLowLow(State state) {
        double val = 0;

        for (int i = 0; i < state.acbsLow.size(); i++) {
            for (int j = 0; j < state.acbsLow.size(); j++) {
                var a = state.acbsLow.get(i);
                var b = state.acbsLow.get(j);
                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
            }
        }

        return val;
    }

    @Benchmark
    @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    public Object cbsHighHigh(State state) {
        double val = 0;

        for (int i = 0; i < state.acbsHigh.size(); i++) {
            for (int j = 0; j < state.acbsHigh.size(); j++) {
                var a = state.acbsHigh.get(i);
                var b = state.acbsHigh.get(j);
                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
            }
        }

        return val;
    }

    @Benchmark
    @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    public Object cbsHighLow(State state) {
        double val = 0;

        for (int i = 0; i < state.acbsHigh.size(); i++) {
            for (int j = 0; j < state.acbsLow.size(); j++) {
                var a = state.acbsHigh.get(i);
                var b = state.acbsLow.get(j);
                val += AndCardIntSet.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
            }
        }

        return val;
    }

    @Benchmark
    @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    public Object roarLowLow(State state) {
        double val = 0;

        for (int i = 0; i < state.roarLow.size(); i++) {
            for (int j = 0; j < state.roarLow.size(); j++) {
                var a = state.roarLow.get(i);
                var b = state.roarLow.get(j);
                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
            }
        }

        return val;
    }

    @Benchmark
    @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    public Object roarHighLow(State state) {
        double val = 0;

        for (int i = 0; i < state.roarHigh.size(); i++) {
            for (int j = 0; j < state.roarLow.size(); j++) {
                var a = state.roarHigh.get(i);
                var b = state.roarLow.get(j);
                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
            }
        }

        return val;
    }

    @Benchmark
    @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    public Object roarHighHigh(State state) {
        double val = 0;

        for (int i = 0; i < state.roarHigh.size(); i++) {
            for (int j = 0; j < state.roarHigh.size(); j++) {
                var a = state.roarHigh.get(i);
                var b = state.roarHigh.get(j);
                val += RoaringBitmap.andCardinality(a, b) / (Math.sqrt(a.getCardinality()*b.getCardinality()));
            }
        }

        return val;
    }
}
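The benchmark above compares RoaringBitmap.andCardinality against the new AndCardIntSet.andCardinality across low- and high-cardinality set pairs. A minimal way to launch just these benchmarks is a plain main method using the standard JMH runner API (the jmh source set already provides it); the launcher class name below is hypothetical and not part of the commit, and running through the build tool's jmh task works just as well.

package nu.marginalia;

import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.OptionsBuilder;

// Hypothetical launcher for a quick local run of the BitSetTest benchmarks.
public class BitSetTestMain {
    public static void main(String[] args) throws RunnerException {
        var opts = new OptionsBuilder()
                .include(BitSetTest.class.getSimpleName()) // select benchmarks by class name
                .forks(1)                                  // keep the smoke-test run short
                .build();
        new Runner(opts).run();
    }
}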
@@ -1,85 +0,0 @@  (file deleted)
package nu.marginalia;

import lombok.SneakyThrows;
import nu.marginalia.util.multimap.MultimapFileLong;
import org.openjdk.jmh.annotations.*;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.stream.IntStream;
import java.util.stream.LongStream;

public class ByteBufferBlockReadVsIndividualRead {

    @State(Scope.Benchmark)
    public static class ByteBufferState {
        private MultimapFileLong mmf;
        private Path file;
        private static final int size = 800*1024*1024;
        @Setup(Level.Iteration)
        @SneakyThrows
        public void setUp() {
            file = Files.createTempFile("jmh", ".dat");
            mmf = MultimapFileLong.forOutput(file, size);
            for (int i = 0; i < size; i++) {
                mmf.put(i, i);
            }
        }

        @TearDown(Level.Iteration)
        @SneakyThrows
        public void tearDown() {
            mmf.close();
            Files.delete(file);
        }

        LongStream basicStream() {
            return IntStream.range(0, size).mapToLong(mmf::get);
        }

        LongStream blockStream(int blockSize) {
            long urlOffset = 0;
            long endOffset = size;

            long[] arry = new long[blockSize];

            return LongStream
                    .iterate(urlOffset, i -> i< endOffset, i->i+blockSize)
                    .flatMap(pos -> {
                        int sz = (int)(Math.min(pos+blockSize, endOffset) - pos);
                        mmf.read(arry, sz, pos);
                        return Arrays.stream(arry, 0, sz);
                    });
        }
    }


//    @Benchmark @BenchmarkMode(Mode.Throughput)
//    @Fork(value = 1, warmups = 1)
//    @Warmup(iterations = 1)
    public long testBasic(ByteBufferState state) {
        return state.basicStream().sum();
    }

    @Benchmark @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    @Warmup(iterations = 0)
    public long testBlock128(ByteBufferState state) {
        return state.blockStream(128).sum();
    }
    @Benchmark @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    @Warmup(iterations = 0)
    public long testBlock1024(ByteBufferState state) {
        return state.blockStream(1024).sum();
    }
    @Benchmark @BenchmarkMode(Mode.Throughput)
    @Fork(value = 1, warmups = 1)
    @Warmup(iterations = 0)
    public long testBlock8192(ByteBufferState state) {
        return state.blockStream(8192).sum();
    }
}
@@ -0,0 +1,205 @@  (new file)
package nu.marginalia.util;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.set.hash.TIntHashSet;
import org.roaringbitmap.RoaringBitmap;


public class AndCardIntSet {
    final TIntArrayList backingList;
    long hash;

    public AndCardIntSet() {
        backingList = new TIntArrayList(16);
        backingList.sort();
    }

    public static AndCardIntSet of(int... list) {
        var set = new TIntHashSet(list);
        TIntArrayList lst = new TIntArrayList(set);
        lst.sort();

        return new AndCardIntSet(lst);
    }

    public static AndCardIntSet of(RoaringBitmap bmap) {

        TIntArrayList lst = new TIntArrayList(bmap.getCardinality());
        lst.addAll(bmap.toArray());

        return new AndCardIntSet(lst);
    }


    private AndCardIntSet(TIntArrayList list) {
        backingList = list;
        hash = 0;

        if (list.size() < 128) {
            for (int v : list.toArray()) {
                int bit = hasher.hashInt(v).asInt() % 64;
                hash |= (1L << bit);
            }
        }
        else {
            hash = ~0L;
        }

    }

    private static final HashFunction hasher = Hashing.murmur3_128(0);

    public boolean add(int val) {
        if (!contains(val)) {
            return false;
        }

        if (backingList.size() < 128) {
            int bit = hasher.hashInt(val).asInt() % 64;
            hash |= (1L << bit);
        }
        else {
            hash = ~0L;
        }
        backingList.add(val);
        backingList.sort();
        return true;
    }

    public boolean contains(int val) {
        return backingList.binarySearch(val) >= 0;
    }

    public int getCardinality() {
        return backingList.size();
    }

    public static int andCardinality(AndCardIntSet a, AndCardIntSet b) {

        if (!testHash(a,b)) {
            return 0;
        }

        if (a.getCardinality() + b.getCardinality() < 10) {
            return andLinearSmall(a, b);
        }

        return andLinear(a,b);
    }

    private static int andLinearSmall(AndCardIntSet a, AndCardIntSet b) {
        int sum = 0;
        for (int i = 0; i < a.getCardinality(); i++) {
            for (int j = 0; j < b.getCardinality(); j++) {
                if (a.backingList.getQuick(i) == b.backingList.getQuick(j))
                    sum++;
            }
        }
        return sum;
    }

    private static int andLinear(AndCardIntSet a, AndCardIntSet b) {

        int i = 0, j = 0;
        int card = 0;

        do {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);

            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                i++;
                j++;
                card++;
            }
        } while (i < a.getCardinality() && j < b.getCardinality());

        return card;

    }

    private static boolean testHash(AndCardIntSet a, AndCardIntSet b) {
        return (a.hash & b.hash) != 0;
    }

    public boolean cardinalityExceeds(int val) {
        return getCardinality() >= val;
    }

    public static AndCardIntSet and(AndCardIntSet a, AndCardIntSet b) {
        int i = 0;
        int j = 0;

        TIntArrayList andVals = new TIntArrayList(1 + (int)Math.sqrt(a.getCardinality()));

        while (i < a.getCardinality() && j < b.getCardinality()) {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                andVals.add(a.backingList.getQuick(i));
                i++;
                j++;
            }
        }

        return new AndCardIntSet(andVals);
    }

    public static double weightedProduct(float[] weights, AndCardIntSet a, AndCardIntSet b) {
        int i = 0;
        int j = 0;

        double sum = 0;

        if (a.getCardinality() + b.getCardinality() < 10) {
            return weightedProductSmall(weights, a, b);
        }

        do {
            int diff = a.backingList.getQuick(i) - b.backingList.getQuick(j);
            if (diff < 0) i++;
            else if (diff > 0) j++;
            else {
                sum += weights[a.backingList.getQuick(i)];
                i++;
                j++;
            }
        } while (i < a.getCardinality() && j < b.getCardinality());

        return sum;
    }


    private static double weightedProductSmall(float[] weights, AndCardIntSet a, AndCardIntSet b) {
        double sum = 0;

        for (int i = 0; i < a.getCardinality(); i++) {
            for (int j = 0; j < b.getCardinality(); j++) {
                int av = a.backingList.getQuick(i);
                int bv = b.backingList.getQuick(j);
                if (av == bv)
                    sum+=weights[av];
            }
        }

        return sum;
    }
    public double mulAndSum(float[] weights) {
        double sum = 0;
        for (int i = 0; i < backingList.size(); i++) {
            sum += weights[backingList.getQuick(i)];
        }
        return sum;
    }
    public int[] toArray() {
        return backingList.toArray();
    }

    public TIntArrayList values() {
        return backingList;
    }
}
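AndCardIntSet keeps a sorted Trove int list plus a 64-bit Bloom-style signature, so andCardinality can return 0 for disjoint sets without walking the lists. A minimal usage sketch, built only from the API shown above (the demo class name and values are illustrative):

import nu.marginalia.util.AndCardIntSet;

class AndCardIntSetDemo {
    public static void main(String[] args) {
        // of(int...) deduplicates and sorts the input
        var a = AndCardIntSet.of(1, 5, 9, 100_000);
        var b = AndCardIntSet.of(5, 9, 77);

        // Intersection cardinality; the hash signature short-circuits disjoint sets.
        int shared = AndCardIntSet.andCardinality(a, b); // 2 here: {5, 9}

        System.out.println(shared + " of " + a.getCardinality() + " values shared");
    }
}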
@@ -0,0 +1,52 @@  (new file)
package nu.marginalia.util;

public class BrailleBlockPunchCards {

    public static String printBits(int val, int bits) {
        StringBuilder builder = new StringBuilder();

        for (int b = 0; b < bits; b+=8, val>>>=8) {
            builder.append((char)('\u2800'+bin2brail(val)));
        }

        return builder.toString();
    }

    /* The braille block in unicode U2800 is neat because it contains
     * 8 "bits", but for historical reasons, they're addressed in a bit
     * of an awkward way. Braille used to be a 2x6 grid, but it was extended
     * to 2x8.
     *
     * It's addressed as follows
     *
     *   0 3
     *   1 4
     *   2 5
     *   6 7  <-- extended braille
     *
     *
     * We want to use it as a dot matrix to represent bits. To do that we need
     * to do this transformation:
     *
     *  0 1 2 3 4 5 6 7   native order bits
     *  | | |  \ _\__\/   |
     *  | | |  /  \  \ \  |
     *  0 1 2 6 3 4 5 7   braille order bits
     *
     *  01 02 04 08 10 20 40 80
     *  01+02+04    +80        : &0x87
     *        << 10+20+40      : &0x70, <<1
     *     08 >> >> >>         : &0x08, >>3
     *
     * Or in other words we do
     *   (v & 0x87)
     * | ((v & 0x70) >> 1)
     * | ((v & 0x08) << 3)
     *
     * Thanks for coming to my TED talk.
     */

    private static char bin2brail(int v) {
        return (char)((v & 0x87) | ((v & 0x70) >> 1) | ((v & 0x08) << 3));
    }
}
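The class renders an integer as a row of braille glyphs, eight bits per character, after the native-to-braille bit transposition described in the comment above. A small illustrative call (the demo class is hypothetical; the exact glyphs printed depend on the bit pattern):

import nu.marginalia.util.BrailleBlockPunchCards;

class PunchCardDemo {
    public static void main(String[] args) {
        // 16 bits rendered as two braille "punch card" characters
        String dots = BrailleBlockPunchCards.printBits(0b1010_0011_0000_1111, 16);
        System.out.println(dots);
    }
}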
@@ -1,5 +1,7 @@
 package nu.marginalia.util;
 
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -14,13 +16,13 @@ public class ListChunker {
      *
      * @see List#subList
      */
-    public static <T> List<List<T>> chopList(List<T> data, int size) {
+    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
         if (data.isEmpty())
             return Collections.emptyList();
         else if (data.size() < size)
             return List.of(data);
 
-        final List<List<T>> ret = new ArrayList<>(1 + data.size() / size);
+        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
 
         for (int i = 0; i < data.size(); i+=size) {
             ret.add(data.subList(i, Math.min(data.size(), i+size)));
@@ -0,0 +1,33 @@  (new file)
package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLongSlice;

/*
 * End-of-page mark that's used as a sentinel to verify that
 * the BTreeWriter's caller actually writes as much as they say
 * they want to. (Failing to do so will corrupt the tree)
 *
 */
public class BTreeDogEar {

    private MultimapFileLongSlice sentinelSlice;

    public BTreeDogEar(BTreeContext ctx, BTreeHeader header, MultimapFileLongSlice base) {
        if (header.numEntries() > 3) {
            sentinelSlice = base.atOffset((long) header.numEntries() * ctx.entrySize() - 3);
            sentinelSlice.put(0, 4L);
            sentinelSlice.put(1, 5L);
            sentinelSlice.put(2, 1L);
        }
    }

    public boolean verify() {
        if (sentinelSlice == null)
            return true;

        return 4 != sentinelSlice.get(0) || 5 != sentinelSlice.get(1) || 1 != sentinelSlice.get(2);
    }

}
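The dog ear plants the sentinel words 4, 5, 1 at the tail of the region the caller promised to fill; verify() only succeeds once those words have been overwritten by real data. A sketch of the intended call order, mirroring the BTreeWriter hunk further down in this commit (ctx, header, slice, writeIndexCallback and logger all come from the writer and are not defined here):

// Sketch, not standalone code: the writer-side protocol around the dog ear.
BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);  // plants 4,5,1 at the tail
writeIndexCallback.write(slice);                           // caller writes header.numEntries() entries
if (!dogEar.verify()) {
    // sentinel still intact: the caller wrote fewer words than it declared
    logger.error("Dog ear was not overwritten: {}", header);
}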
@@ -0,0 +1,146 @@  (new file)
package nu.marginalia.util.btree;

import java.util.Arrays;

public class BTreeQueryBuffer {
    public final long[] data;
    public int end;

    private int read = 0;
    private int write = 0;

    public BTreeQueryBuffer(int size) {
        this.data = new long[size];
        this.end = size;
    }

    public BTreeQueryBuffer(long [] data, int size) {
        this.data = data;
        this.end = size;
    }

    private BTreeQueryBuffer(long [] data) {
        this.data = data;
        this.end = data.length;
    }

    public BTreeQueryBuffer[] split(int... splitPoints) {
        BTreeQueryBuffer[] ret = new BTreeQueryBuffer[splitPoints.length+1];

        ret[0] = new BTreeQueryBuffer(Arrays.copyOfRange(data, 0, splitPoints[0]));
        for (int i = 1; i < splitPoints.length; i++) {
            ret[i] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[i-1], splitPoints[i]));
        }
        ret[ret.length-1] = new BTreeQueryBuffer(Arrays.copyOfRange(data, splitPoints[splitPoints.length-1], end));

        return ret;
    }

    public void gather(BTreeQueryBuffer... buffers) {
        int start = 0;

        for (var buffer : buffers) {
            System.arraycopy(buffer.data, 0, data, start, buffer.end);
            start += buffer.end;
        }

        this.read = 0;
        this.write = 0;
        this.end = start;
    }

    public long[] copyData() {
        return Arrays.copyOf(data, end);
    }

    public void retainAll() {
        read = write = end;
    }

    public boolean isEmpty() {
        return end == 0;
    }

    public int size() {
        return end;
    }

    public long currentValue() {
        return data[read];
    }

    public boolean rejectAndAdvance() {
        return ++read < end;
    }

    public boolean retainAndAdvance() {
        if (read != write) {
            long tmp = data[write];
            data[write] = data[read];
            data[read] = tmp;
        }

        write++;

        return ++read < end;
    }

    public boolean hasMore() {
        return read < end;
    }

    public void finalizeFiltering() {
        end = write;
        read = 0;
        write = 0;
    }

    public void startFilterForRange(int pos, int end) {
        read = write = pos;
        this.end = end;
    }

    public void reset() {
        end = data.length;
        read = 0;
        write = 0;
    }

    public void zero() {
        end = 0;
        read = 0;
        write = 0;
        Arrays.fill(data, 0);
    }

    public void uniq() {
        if (end <= 1) return;

        long prev = currentValue();
        retainAndAdvance();

        while (hasMore()) {

            long val = currentValue();

            if (prev == val) {
                rejectAndAdvance();
            } else {
                retainAndAdvance();
                prev = val;
            }

        }

        finalizeFiltering();
    }

    public String toString() {
        return getClass().getSimpleName() + "[" +
            "read = " + read +
            ",write = " + write +
            ",end = " + end +
            ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]";
    }

}
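The buffer implements an in-place retain/reject cursor: retainAndAdvance() swaps the current value into a compacted prefix, rejectAndAdvance() skips it, and finalizeFiltering() shrinks the buffer to the retained prefix. A minimal sketch of that protocol, using only the methods above (the sample values are arbitrary):

// Keep even values, drop odd ones.
BTreeQueryBuffer buffer = new BTreeQueryBuffer(new long[] {2, 3, 4, 4, 7, 8}, 6);

while (buffer.hasMore()) {
    if (buffer.currentValue() % 2 == 0) buffer.retainAndAdvance();
    else                                buffer.rejectAndAdvance();
}
buffer.finalizeFiltering();       // compacts the retained prefix, resets the cursors

long[] kept = buffer.copyData();  // {2, 4, 4, 8}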
@@ -1,5 +1,7 @@
 package nu.marginalia.util.btree;
 
+import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
+import lombok.SneakyThrows;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
@@ -14,70 +16,275 @@ public class BTreeReader {
 
     private final MultimapSearcher indexSearcher;
     private final MultimapSearcher dataSearcher;
+    private final BTreeHeader header;
 
-    public BTreeReader(MultimapFileLong file, BTreeContext ctx) {
+    public BTreeReader(MultimapFileLong file, BTreeContext ctx, BTreeHeader header) {
         this.file = file;
         this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
         this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
 
         this.ctx = ctx;
+        this.header = header;
     }
 
-    public BTreeHeader getHeader(long fileOffset) {
+    public BTreeReader(MultimapFileLong file, BTreeContext ctx, long offset) {
+        this.file = file;
+        this.indexSearcher = MultimapSearcher.forContext(file, ~0, 1);
+        this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());
+
+        this.ctx = ctx;
+        this.header = createHeader(file, offset);
+    }
+
+    public static BTreeHeader createHeader(MultimapFileLong file, long fileOffset) {
         return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
     }
 
+    public BTreeHeader getHeader() {
+        return header;
+    }
+
+    public int numEntries() {
+        return header.numEntries();
+    }
+
+    @SneakyThrows
+    public void retainEntries(BTreeQueryBuffer buffer) {
+        if (header.layers() == 0) {
+            BTreePointer pointer = new BTreePointer(header);
+            pointer.retainData(buffer);
+        }
+        retainSingle(buffer);
+    }
+
+    @SneakyThrows
+    public void rejectEntries(BTreeQueryBuffer buffer) {
+        if (header.layers() == 0) {
+            BTreePointer pointer = new BTreePointer(header);
+            pointer.rejectData(buffer);
+        }
+        rejectSingle(buffer);
+    }
+
+    private void retainSingle(BTreeQueryBuffer buffer) {
+
+        BTreePointer pointer = new BTreePointer(header);
+
+        for (; buffer.hasMore(); pointer.resetToRoot()) {
+
+            long val = buffer.currentValue() & ctx.equalityMask();
+
+            if (!pointer.walkToData(val)) {
+                buffer.rejectAndAdvance();
+                continue;
+            }
+
+            pointer.retainData(buffer);
+        }
+    }
+
+    private void rejectSingle(BTreeQueryBuffer buffer) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        for (; buffer.hasMore(); pointer.resetToRoot()) {
+
+            long val = buffer.currentValue() & ctx.equalityMask();
+
+            if (pointer.walkToData(val) && pointer.containsData(val)) {
+                buffer.rejectAndAdvance();
+            }
+            else {
+                buffer.retainAndAdvance();
+            }
+        }
+    }
+
     /**
      *
      * @return file offset of entry matching keyRaw, negative if absent
      */
-    public long findEntry(BTreeHeader header, final long keyRaw) {
-        final int blockSize = ctx.BLOCK_SIZE_WORDS();
-
+    public long findEntry(final long keyRaw) {
         final long key = keyRaw & ctx.equalityMask();
-        final long dataAddress = header.dataOffsetLongs();
-
-        final long searchStart;
-        final long numEntries;
-
-        if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
-            searchStart = dataAddress;
-            numEntries = header.numEntries();
-        }
-        else {
-            long dataLayerOffset = searchIndex(header, key);
-            if (dataLayerOffset < 0) {
-                return dataLayerOffset;
-            }
-
-            searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
-            numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
-        }
-
-        return dataSearcher.binarySearch(key, searchStart, numEntries);
-    }
-
-    private long searchIndex(BTreeHeader header, long key) {
-        final int blockSize = ctx.BLOCK_SIZE_WORDS();
-        final long indexAddress = header.indexOffsetLongs();
-
-        long layerOffset = 0;
-
-        for (int i = header.layers() - 1; i >= 0; --i) {
-            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;
-
-            final long nextLayerOffset = relativePositionInIndex(key, indexAddress + indexLayerBlockOffset, blockSize);
-            if (nextLayerOffset < 0)
-                return nextLayerOffset;
-
-            layerOffset = blockSize * (nextLayerOffset + layerOffset);
-        }
-
-        return layerOffset;
-    }
-
-    private long relativePositionInIndex(long key, long start, long n) {
-        return indexSearcher.binarySearchUpper(key, start, n) - start;
-    }
+
+        BTreePointer ip = new BTreePointer(header);
+
+        while (!ip.isDataLayer())
+            ip.walkToChild(key);
+
+        return ip.findData(key);
+    }
+
+    public void readData(long[] data, int n, long pos) {
+        file.read(data, n, header.dataOffsetLongs() + pos);
+    }
+
+    public long[] queryData(long[] urls, int offset) {
+        BTreePointer pointer = new BTreePointer(header);
+
+        long[] ret = new long[urls.length];
+
+        for (int i = 0; i < urls.length; i++, pointer.resetToRoot()) {
+            if (pointer.walkToData(urls[i])) {
+                long dataAddress = pointer.findData(urls[i]);
+                if (dataAddress >= 0) {
+                    ret[i] = file.get(dataAddress + offset);
+                }
+            }
+        }
+
+        return ret;
+    }
+
+    /** Find the range of values so that prefixStart <= n < prefixNext */
+    public LongLongImmutablePair getRangeForPrefix(long prefixStart, long prefixNext) {
+        long lowerBoundStart = lowerBound(prefixStart);
+        long lowerBoundEnd = lowerBound(prefixNext);
+
+        return new LongLongImmutablePair(lowerBoundStart, lowerBoundEnd);
+    }
+
+    private long lowerBound(long key) {
+        key &= ctx.equalityMask();
+
+        BTreePointer ip = new BTreePointer(header);
+
+        while (!ip.isDataLayer())
+            ip.walkToChild(key);
+
+        return ip.findDataLower(key);
+    }
+
+    private class BTreePointer {
+        private final long[] layerOffsets;
+
+        private int layer;
+        private long offset;
+        private long boundary;
+
+        public String toString() {
+            return getClass().getSimpleName() + "[" +
+                "layer = " + layer + " ," +
+                "offset = " + offset + "]";
+        }
+
+        public BTreePointer(BTreeHeader header) {
+            layer = header.layers() - 1;
+            offset = 0;
+            layerOffsets = header.getRelativeLayerOffsets(ctx);
+            boundary = Long.MAX_VALUE;
+        }
+
+        public void resetToRoot() {
+            this.layer = header.layers() - 1;
+            this.offset = 0;
+            this.boundary = Long.MAX_VALUE;
+        }
+
+        public int layer() {
+            return layer;
+        }
+
+        public boolean walkToChild(long key) {
+            final long indexAddress = header.indexOffsetLongs();
+
+            final long indexLayerBlockOffset = layerOffsets[layer] + offset;
+
+            final long searchStart = indexAddress + indexLayerBlockOffset;
+            final long nextLayerOffset = (int)(indexSearcher.binarySearchLower(key, searchStart, ctx.BLOCK_SIZE_WORDS()) - searchStart);
+
+            if (nextLayerOffset < 0)
+                return false;
+
+            layer --;
+            boundary = file.get(searchStart + offset);
+            offset = ctx.BLOCK_SIZE_WORDS() * (offset + nextLayerOffset);
+
+            return true;
+        }
+
+        public boolean walkToData(long key) {
+            while (!isDataLayer()) {
+                if (!walkToChild(key)) {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        public boolean isDataLayer() {
+            return layer < 0;
+        }
+
+        public boolean containsData(long key) {
+            return findData(key) >= 0;
+        }
+
+        public long findData(long key) {
+            if (layer > 0) {
+                throw new IllegalStateException("Looking for data in an index layer");
+            }
+
+            long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
+            int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
+
+            return dataSearcher.binarySearch(key, searchStart, numEntries);
+        }
+
+        public long findDataLower(long key) {
+            if (layer > 0) {
+                throw new IllegalStateException("Looking for data in an index layer");
+            }
+
+            long searchStart = header.dataOffsetLongs() + offset * ctx.entrySize();
+            int numEntries = min((int)(header.numEntries() - offset), ctx.BLOCK_SIZE_WORDS());
+
+            return dataSearcher.binarySearchLower(key, searchStart, numEntries);
+        }
+
+        public void retainData(BTreeQueryBuffer buffer) {
+
+            long dataOffset = findData(buffer.currentValue());
+            if (dataOffset >= 0) {
+                buffer.retainAndAdvance();
+
+                long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
+                long relOffset = dataOffset - blockBase;
+
+                int numEntries =
+                        min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
+
+                if (buffer.currentValue() <= boundary) {
+                    file.retain(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
+                }
+            }
+            else {
+                buffer.rejectAndAdvance();
+            }
+
+        }
+
+        public void rejectData(BTreeQueryBuffer buffer) {
+
+            long dataOffset = findData(buffer.currentValue());
+            if (dataOffset >= 0) {
+                buffer.rejectAndAdvance();
+
+                long blockBase = header.dataOffsetLongs() + offset * ctx.entrySize();
+                long relOffset = dataOffset - blockBase;
+
+                int numEntries =
+                        min((int) (header.numEntries() - relOffset), ctx.BLOCK_SIZE_WORDS()) / ctx.entrySize();
+
+                if (buffer.currentValue() <= boundary) {
+                    file.reject(buffer, boundary, dataOffset, numEntries, ctx.equalityMask(), ctx.entrySize());
+                }
+            }
+            else {
+                buffer.retainAndAdvance();
+            }
+        }
+    }
+
 }
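With this change the reader binds to a single tree (header read at construction) and exposes both point lookups and bulk filtering. A sketch of how the new API reads, using only calls shown in the hunk above; mmf, ctx, treeOffset, key and candidateKeys are assumed to come from the surrounding index code and are not defined in this excerpt:

// Open a reader for the tree whose header starts at treeOffset.
BTreeReader reader = new BTreeReader(mmf, ctx, treeOffset);

long entryOffset = reader.findEntry(key);  // negative if the key is absent
if (entryOffset >= 0) {
    // entryOffset is a file offset into the data block; companion columns sit at +1, +2, ...
}

// Bulk filter: keep only the candidate keys that exist in the tree.
BTreeQueryBuffer buffer = new BTreeQueryBuffer(candidateKeys, candidateKeys.length);
reader.retainEntries(buffer);
buffer.finalizeFiltering();  // assumed caller responsibility; the exact contract isn't shown in this hunk
long[] present = buffer.copyData();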
@@ -3,6 +3,8 @@ package nu.marginalia.util.btree;
 import nu.marginalia.util.btree.model.BTreeContext;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLongSlice;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
@@ -10,6 +12,7 @@ import java.io.IOException;
 public class BTreeWriter {
     private final BTreeContext ctx;
     private final MultimapFileLongSlice map;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
 
     public BTreeWriter(MultimapFileLongSlice map, BTreeContext ctx) {
         this.map = map;
@@ -39,7 +42,16 @@ public class BTreeWriter {
 
         header.write(map, offset);
 
-        writeIndexCallback.write(map.atOffset(header.dataOffsetLongs()));
+        var slice = map.atOffset(header.dataOffsetLongs());
+
+        BTreeDogEar dogEar = new BTreeDogEar(ctx, header, slice);
+
+        writeIndexCallback.write(slice);
+
+        if (!dogEar.verify()) {
+            logger.error("Dog ear was not overwritten: {}", header);
+        }
 
         if (header.layers() < 1) { // The data is too small to benefit from indexing
             return ctx.calculateSize(numEntries);
@@ -1,136 +0,0 @@  (file deleted)
package nu.marginalia.util.btree;

import nu.marginalia.util.btree.model.BTreeContext;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.util.multimap.MultimapSearcher;

import static java.lang.Math.min;

public class CachingBTreeReader {

    private final MultimapFileLong file;
    public final BTreeContext ctx;

    private final MultimapSearcher dataSearcher;

    public CachingBTreeReader(MultimapFileLong file, BTreeContext ctx) {
        this.file = file;
        this.dataSearcher = MultimapSearcher.forContext(file, ctx.equalityMask(), ctx.entrySize());

        this.ctx = ctx;
    }

    public BTreeHeader getHeader(long fileOffset) {
        return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
    }

    public BTreeCachedIndex prepareCache(BTreeHeader header) {
        return new BTreeCachedIndex(header);
    }
    /**
     *
     * @return file offset of entry matching keyRaw, negative if absent
     */
    public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
        BTreeHeader header = cache.header;

        final int blockSize = ctx.BLOCK_SIZE_WORDS();

        final long key = keyRaw & ctx.equalityMask();
        final long dataAddress = header.dataOffsetLongs();

        final long searchStart;
        final long numEntries;

        if (header.layers() == 0) { // For small data, there is no index block, only a flat data block
            searchStart = dataAddress;
            numEntries = header.numEntries();
        }
        else {
            cache.load();

            long dataLayerOffset = searchIndex(header, cache, key);
            if (dataLayerOffset < 0) {
                return dataLayerOffset;
            }

            searchStart = dataAddress + dataLayerOffset * ctx.entrySize();
            numEntries = min(header.numEntries() - dataLayerOffset, blockSize);
        }

        return dataSearcher.binarySearch(key, searchStart, numEntries);
    }

    private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
        final int blockSize = ctx.BLOCK_SIZE_WORDS();
        long layerOffset = 0;

        for (int i = header.layers() - 1; i >= 0; --i) {
            final long indexLayerBlockOffset = header.relativeIndexLayerOffset(ctx, i) + layerOffset;

            final long nextLayerOffset = cache.relativePositionInIndex(key, (int) indexLayerBlockOffset, blockSize);
            if (nextLayerOffset < 0)
                return nextLayerOffset;

            layerOffset = blockSize * (nextLayerOffset + layerOffset);
        }

        return layerOffset;
    }

    /** A cache for the BTree index data that will drastically reduce the number of disk reads
     * for repeated queries against the same tree. The memory consumption is typically very low
     * and the disk access pattern for reading the entire index relatively cheap.
     */
    public class BTreeCachedIndex {
        long[] indexData;
        final BTreeHeader header;

        final int indexedDataSize;

        public BTreeCachedIndex(BTreeHeader header) {
            this.header = header;
            indexedDataSize = header.numEntries();
        }

        public void load() {
            if (indexData != null)
                return;

            int size = (int)(header.dataOffsetLongs() - header.indexOffsetLongs());
            indexData = new long[size];
            file.read(indexData, header.indexOffsetLongs());
        }

        long relativePositionInIndex(long key, int fromIndex, int n) {
            int low = 0;
            int high = n - 1;

            while (low <= high) {
                int mid = (low + high) >>> 1;
                long midVal = indexData[fromIndex + mid];

                if (midVal < key)
                    low = mid + 1;
                else if (midVal > key)
                    high = mid - 1;
                else
                    return mid;
            }
            return low;
        }

        public long sizeBytes() {
            return isLoaded() ? 8L*indexData.length : 0;
        }

        public int getIndexedDataSize() {
            return indexedDataSize;
        }

        public boolean isLoaded() {
            return indexData != null;
        }
    }
}
@@ -19,7 +19,7 @@ public record BTreeContext(int MAX_LAYERS,
     }
 
     public int numIndexLayers(int numEntries) {
-        if (numEntries <= BLOCK_SIZE_WORDS*2) {
+        if (numEntries <= BLOCK_SIZE_WORDS*2/entrySize) {
            return 0;
        }
        for (int i = 1; i < MAX_LAYERS; i++) {
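The threshold change switches the flat-block cutoff from words to entries. A hypothetical worked example, with made-up values for BLOCK_SIZE_WORDS and entrySize (neither constant is shown in this hunk):

// Illustrative values only: BLOCK_SIZE_WORDS = 2048, entrySize = 2
// old: numEntries <= 2048*2    -> up to 4096 entries skip the index (8192 words of data)
// new: numEntries <= 2048*2/2  -> up to 2048 entries skip the index (4096 words of data)
// i.e. the cutoff now tracks the size of the data block when an entry spans several words.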
@@ -26,7 +26,6 @@ public class DictionaryData {
 
         if (rb == -1) {
             int end = activeBank.getEnd();
-            logger.debug("Switching bank @ {}", end);
             var newBank = new DictionaryDataBank(end, DICTIONARY_BANK_SIZE);
             rb = newBank.add(key);
 
@@ -16,7 +16,7 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
  * Spiritually influenced by GNU Trove's hash maps
  * LGPL 2.1
  */
-public class DictionaryHashMap {
+public class DictionaryHashMap implements DictionaryMap {
     private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
     private static final Gauge probe_count_metrics
             = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
@@ -81,6 +81,7 @@ public class DictionaryHashMap {
         }
     }
 
+    @Override
     public int size() {
         return sz.get();
     }
@@ -97,6 +98,7 @@ public class DictionaryHashMap {
         buffers[buffer].put(bufferIdx, val);
     }
 
+    @Override
     public int put(long key) {
 
         long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
@@ -143,6 +145,7 @@ public class DictionaryHashMap {
         return di;
     }
 
+    @Override
     public int get(long key) {
         final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
         final long cell = hash % hashTableSize;
@@ -0,0 +1,9 @@  (new file)
package nu.marginalia.util.dict;

public interface DictionaryMap {
    int size();

    int put(long key);

    int get(long key);
}
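DictionaryMap abstracts the key-to-id mapping that DictionaryHashMap now implements. A hypothetical in-memory stand-in for tests, not part of the commit; it assumes Trove is on the classpath (the project already uses gnu.trove elsewhere in this commit), and it only illustrates the interface shape since DictionaryHashMap's exact duplicate-key behaviour is not shown in this excerpt:

package nu.marginalia.util.dict;

import gnu.trove.map.hash.TLongIntHashMap;

// Hypothetical test double: put() assigns a growing integer id to each new key,
// get() returns that id, or -1 when the key has never been put.
public class TrivialDictionaryMap implements DictionaryMap {
    private final TLongIntHashMap ids = new TLongIntHashMap(1024, 0.5f, Long.MIN_VALUE, -1);

    @Override
    public int size() {
        return ids.size();
    }

    @Override
    public int put(long key) {
        int existing = ids.get(key);
        if (existing >= 0) return existing;

        int id = ids.size();
        ids.put(key, id);
        return id;
    }

    @Override
    public int get(long key) {
        return ids.get(key);
    }
}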
@@ -72,7 +72,7 @@ public enum UnicodeRanges {
         int count = 0;
         int max = sensitive ? 15 : 100;
 
-        for (int i = 0; i < text.length(); i++) {
+        for (int i = 0; i < Math.min(2000, text.length()); i++) {
             char c = text.charAt(i);
             if (c >= min && c <= max) {
                 if (count++ > max) {
@@ -88,6 +88,9 @@ public class WordPatterns {
     }
 
     public static boolean hasWordQualities(String s) {
+        if (s.isBlank())
+            return false;
+
         int start = 0;
         int end = s.length();
         if (s.charAt(0) == '#') start++;
@@ -95,13 +98,14 @@ public class WordPatterns {
 
         for (int i = start; i < end; i++) {
             char c = s.charAt(i);
-            if (!("_@.'+-".indexOf(c) >= 0)
+            if (("_@.'+-".indexOf(c) < 0)
                     && !(c >= 'a' && c <= 'z')
                     && !(c >= 'A' && c <= 'Z')
                     && !(c >= '0' && c <= '9')
                     && !(c >= '\u00C0' && c <= '\u00D6')
                     && !(c >= '\u00D8' && c <= '\u00f6')
-                    && !(c >= '\u00f8' && c <= '\u00ff')) {
+                    && !(c >= '\u00f8' && c <= '\u00ff'))
+            {
                 return false;
             }
         }
@@ -119,10 +123,14 @@ public class WordPatterns {
         if (!filter(s)) {
             return true;
         }
-        if (topWords.contains(s.toLowerCase())) {
+        if (isTopWord(s)) {
             return true;
         }
         return false;
     }
 
+    public static boolean isTopWord(String s) {
+        return topWords.contains(s.toLowerCase());
+    }
+
 }
@@ -2,8 +2,10 @@ package nu.marginalia.util.language.processing;
 
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
@@ -20,14 +22,9 @@ public class DocumentKeywordExtractor {
     private final NameCounter nameCounter;
     private final SubjectCounter subjectCounter;
 
-    private final TermFrequencyDict dict;
-    private final double docCount;
-
     @Inject
     public DocumentKeywordExtractor(TermFrequencyDict dict) {
-        this.dict = dict;
-        docCount = dict.docCount();
-
         keywordExtractor = new KeywordExtractor();
 
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
@@ -36,69 +33,105 @@ public class DocumentKeywordExtractor {
     }
 
 
-    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
+    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
 
-        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);
 
-        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
-        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
-
-        Collection<String> artifacts = getArtifacts(documentLanguageData);
+        tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
+
+        for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
+        for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
+        for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
+
+        List<String> artifacts = getArtifacts(documentLanguageData);
+
+        keywordMetadata.flagsTemplate().add(EdgePageWordFlags.Simple);
 
         return new EdgePageWordSet(
-                createWords(IndexBlock.Subjects, subjects),
-                createWords(IndexBlock.Title, titleWords),
-                createWords(IndexBlock.NamesWords, wordsNamesAll),
-                createWords(IndexBlock.Tfidf_Top, topKeywords),
-                createWords(IndexBlock.Tfidf_Middle, midKeywords),
-                new EdgePageWords(IndexBlock.Artifacts, artifacts)
+                createWords(keywordMetadata, IndexBlock.Title, titleWords),
+                EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
         );
     }
 
-    public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
+    public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData, KeywordMetadata keywordMetadata) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
 
-        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
-        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
+        getWordPositions(keywordMetadata, documentLanguageData);
+
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);
 
-        List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
|
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||||
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
|
|
||||||
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
|
|
||||||
|
|
||||||
Collection<String> artifacts = getArtifacts(documentLanguageData);
|
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||||
|
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||||
|
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
||||||
|
|
||||||
|
List<String> artifacts = getArtifacts(documentLanguageData);
|
||||||
|
|
||||||
var wordSet = new EdgePageWordSet(
|
var wordSet = new EdgePageWordSet(
|
||||||
createWords(IndexBlock.Subjects, subjects),
|
createWords(keywordMetadata, IndexBlock.Title, titleWords),
|
||||||
createWords(IndexBlock.Title, titleWords),
|
createWords(keywordMetadata, IndexBlock.Tfidf_High, wordsTfIdf),
|
||||||
createWords(IndexBlock.NamesWords, wordsNamesAll),
|
createWords(keywordMetadata, IndexBlock.Subjects, subjects),
|
||||||
createWords(IndexBlock.Tfidf_Top, topKeywords),
|
EdgePageWords.withBlankMetadata(IndexBlock.Artifacts, artifacts)
|
||||||
createWords(IndexBlock.Tfidf_Middle, midKeywords),
|
|
||||||
createWords(IndexBlock.Tfidf_Lower, lowKeywords),
|
|
||||||
new EdgePageWords(IndexBlock.Artifacts, artifacts)
|
|
||||||
);
|
);
|
||||||
|
|
||||||
getSimpleWords(wordSet, documentLanguageData,
|
getSimpleWords(keywordMetadata, wordSet, documentLanguageData,
|
||||||
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
|
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
|
||||||
|
|
||||||
return wordSet;
|
return wordSet;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
|
||||||
|
public void getWordPositions(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||||
|
Map<String, Integer> ret = keywordMetadata.positionMask();
|
||||||
|
|
||||||
|
int posCtr = 0;
|
||||||
|
for (var sent : dld.titleSentences) {
|
||||||
|
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||||
|
|
||||||
|
for (var word : sent) {
|
||||||
|
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var span : keywordExtractor.getNames(sent)) {
|
||||||
|
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
posCtr+=4;
|
||||||
|
for (var sent : dld.sentences) {
|
||||||
|
int posBit = (int)((1L << (posCtr/4)) & 0xFFFF_FFFFL);
|
||||||
|
|
||||||
|
for (var word : sent) {
|
||||||
|
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var span : keywordExtractor.getNames(sent)) {
|
||||||
|
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||||
|
}
|
||||||
|
|
||||||
|
posCtr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int bitwiseOr(int a, int b) {
|
||||||
|
return a | b;
|
||||||
|
}
|
||||||
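Note: getWordPositions() above folds word occurrences into a 32-bit position mask: the title contributes bit 0, and each subsequent group of four body sentences shares the next bit, so a set bit k roughly means "this word occurs in region k of the document". A self-contained sketch of the same encoding over plain strings (class name, whitespace tokenization, and the example text are illustrative assumptions, not part of the patch):

import java.util.HashMap;
import java.util.Map;

class PositionMaskSketch {
    // Accumulate a 32-bit region mask per word: bit 0 = title,
    // bit k (k >= 1) = body sentences 4*(k-1) .. 4*k-1.
    static Map<String, Integer> positionMasks(String[] titleSentences, String[] bodySentences) {
        Map<String, Integer> mask = new HashMap<>();

        int posCtr = 0;
        for (String sentence : titleSentences) {
            int posBit = (int) ((1L << (posCtr / 4)) & 0xFFFF_FFFFL); // bit 0 for the title
            for (String word : sentence.toLowerCase().split("\\s+"))
                mask.merge(word, posBit, (a, b) -> a | b);
        }

        posCtr += 4;
        for (String sentence : bodySentences) {
            int posBit = (int) ((1L << (posCtr / 4)) & 0xFFFF_FFFFL); // one bit per 4 sentences
            for (String word : sentence.toLowerCase().split("\\s+"))
                mask.merge(word, posBit, (a, b) -> a | b);
            posCtr++;
        }
        return mask;
    }

    public static void main(String[] args) {
        var masks = positionMasks(
                new String[] { "Adiabatic processes" },
                new String[] { "An adiabatic process transfers no heat", "Other text follows" });
        // "adiabatic" appears in the title (bit 0) and in the first body block (bit 1) -> prints "11"
        System.out.println(Integer.toBinaryString(masks.getOrDefault("adiabatic", 0)));
    }
}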
|
|
||||||
|
|
||||||
|
private void getSimpleWords(KeywordMetadata metadata, EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
|
||||||
|
|
||||||
|
EnumSet<EdgePageWordFlags> flagsTemplate = EnumSet.noneOf(EdgePageWordFlags.class);
|
||||||
|
|
||||||
int start = 0;
|
int start = 0;
|
||||||
int lengthGoal = 32;
|
int lengthGoal = 32;
|
||||||
|
|
||||||
for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
|
for (int blockIdx = 0; blockIdx < blocks.length && start < documentLanguageData.sentences.length; blockIdx++) {
|
||||||
IndexBlock block = blocks[blockIdx];
|
IndexBlock block = blocks[blockIdx];
|
||||||
Set<String> words = new HashSet<>(lengthGoal+100);
|
Set<EdgePageWords.Entry> words = new HashSet<>(lengthGoal+100);
|
||||||
|
|
||||||
int pos;
|
int pos;
|
||||||
int length = 0;
|
int length = 0;
|
||||||
@ -110,55 +143,26 @@ public class DocumentKeywordExtractor {
|
|||||||
if (!word.isStopWord()) {
|
if (!word.isStopWord()) {
|
||||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||||
words.add(w);
|
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, word.stemmed())));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (var names : keywordExtractor.getNames(sent)) {
|
||||||
|
var rep = new WordRep(sent, names);
|
||||||
|
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||||
|
|
||||||
|
words.add(new EdgePageWords.Entry(w, metadata.forWord(flagsTemplate, rep.stemmed)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
wordSet.append(block, words);
|
wordSet.append(block, words);
|
||||||
start = pos;
|
start = pos;
|
||||||
lengthGoal+=32;
|
lengthGoal+=32;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (start < documentLanguageData.sentences.length) {
|
|
||||||
|
|
||||||
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
|
|
||||||
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
|
|
||||||
var sent = documentLanguageData.sentences[pos];
|
|
||||||
for (var word : sent) {
|
|
||||||
if (!word.isStopWord()) {
|
|
||||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
|
||||||
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
|
|
||||||
counts.merge(w, 1, Integer::sum);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Set<String> lastSet;
|
|
||||||
if (counts.size() < 1024) {
|
|
||||||
lastSet = counts.keySet();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
lastSet = counts.entrySet().stream()
|
|
||||||
.sorted(Comparator.comparing(e -> {
|
|
||||||
double N = docCount; // Number of documents in term freq dictionary
|
|
||||||
|
|
||||||
// Caveat: This is actually the *negated* term score, because the second logarithm has
|
|
||||||
// its parameter inverted (log(a^b) = b log(a); here b = -1)
|
|
||||||
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
|
|
||||||
}))
|
|
||||||
.map(Map.Entry::getKey)
|
|
||||||
.limit(1024)
|
|
||||||
.collect(Collectors.toCollection(LinkedHashSet::new));
|
|
||||||
}
|
|
||||||
|
|
||||||
wordSet.append(blocks[blocks.length - 1], lastSet);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
private static final Pattern mailLikePattern = Pattern.compile("[a-zA-Z0-9._\\-]+@[a-zA-Z0-9]+(\\.[a-zA-Z0-9]+)+");
|
||||||
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
private List<String> getArtifacts(DocumentLanguageData documentLanguageData) {
|
||||||
Set<String> reps = new HashSet<>();
|
Set<String> reps = new HashSet<>();
|
||||||
|
|
||||||
for (var sent : documentLanguageData.sentences) {
|
for (var sent : documentLanguageData.sentences) {
|
||||||
@ -183,7 +187,7 @@ public class DocumentKeywordExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return reps;
|
return new ArrayList<>(reps);
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
|
private List<WordRep> extractTitleWords(DocumentLanguageData documentLanguageData) {
|
||||||
@ -193,7 +197,21 @@ public class DocumentKeywordExtractor {
|
|||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
|
public EdgePageWords createWords(KeywordMetadata metadata,
|
||||||
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns::hasWordQualities).collect(Collectors.toSet()));
|
IndexBlock block,
|
||||||
|
Collection<WordRep> words) {
|
||||||
|
|
||||||
|
Set<EdgePageWords.Entry> entries = new HashSet<>(words.size());
|
||||||
|
for (var word : words) {
|
||||||
|
|
||||||
|
String flatWord = AsciiFlattener.flattenUnicode(word.word);
|
||||||
|
if (!WordPatterns.hasWordQualities(flatWord)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
entries.add(new EdgePageWords.Entry(flatWord, metadata.forWord(metadata.flagsTemplate(), word.stemmed)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return new EdgePageWords(block, entries);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1,15 +1,19 @@
 package nu.marginalia.util.language.processing;
 
+import com.github.jknack.handlebars.internal.lang3.StringUtils;
+import gnu.trove.map.hash.TObjectIntHashMap;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Pattern;
+import java.util.List;
+
+import static java.lang.Math.max;
 
 public class KeywordCounter {
     private final KeywordExtractor keywordExtractor;
@@ -19,72 +23,78 @@ public class KeywordCounter {
     public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
         this.keywordExtractor = keywordExtractor;
-        this.docCount = (double) dict.docCount();
+        this.docCount = dict.docCount();
     }
 
-    public WordHistogram countHisto(DocumentLanguageData dld) {
-        HashMap<String, Integer> counts = new HashMap<>(15000);
+    public List<WordRep> countHisto(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
+        TObjectIntHashMap<String> counts = new TObjectIntHashMap<>(10_000, 0.7f);
         HashMap<String, HashSet<WordRep>> instances = new HashMap<>(15000);
 
 
         for (var sent : dld.sentences) {
             var keywords = keywordExtractor.getKeywordsFromSentence(sent);
             for (var span : keywords) {
-                if (span.size() == 1 &&
-                     WordPatterns.isStopWord(sent.words[span.start]))
+                if (span.size() == 1 && WordPatterns.isStopWord(sent.words[span.start])) {
                     continue;
+                }
 
-                String stemmed = sent.constructStemmedWordFromSpan(span);
+                var rep = new WordRep(sent, span);
 
-                counts.merge(stemmed, 1, Integer::sum);
-                instances.computeIfAbsent(stemmed, k -> new HashSet<>(500)).add(new WordRep(sent, span));
+                counts.adjustOrPutValue(rep.stemmed, 1, 1);
+                var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
+                if (instanceSet.size() < 250) {
+                    instanceSet.add(rep);
+                }
             }
         }
 
-        double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
+        HashMap<String, WordFrequencyData> tfIdf = keywordMetadata.wordsTfIdf();
+        List<WordRep> tfIdfHigh = new ArrayList<>();
 
-        Set<WordRep> h5 = new HashSet<>(2500);
-        Set<WordRep> h10 = new HashSet<>(500);
-        Set<WordRep> h15 = new HashSet<>(500);
+        int maxVal = maxValue(counts);
 
-        int doubleWordCount = 0;
+        counts.forEachEntry((key, cnt) -> {
+            int value = getTermValue(key, cnt, maxVal);
 
-        for (var entry : counts.entrySet()) {
-            double value = getTermValue(entry, maxC);
+            tfIdf.put(key, new WordFrequencyData(cnt, value));
 
-            double avgCnt = entry.getValue();
-            String wordStemmed = entry.getKey();
+            if (cnt > 1 && value > 100) {
+                tfIdfHigh.addAll(instances.get(key));
+            }
 
-            Set<WordRep> histogram;
-            if (value < -3 && avgCnt>1) histogram = h15;
-            else if (value < -1.75 && avgCnt>1) histogram = h10;
-            else if (value < -1 &&
-                    (!wordStemmed.contains("_") || doubleWordCount++ < 50))
-                histogram = h5;
-            else continue;
+            return true;
+        });
 
-            histogram.addAll(instances.get(wordStemmed));
-        }
-        return new WordHistogram(h5, h10, h15);
+        return tfIdfHigh;
     }
 
-    private static final Pattern separator = Pattern.compile("_");
+    private int maxValue(TObjectIntHashMap<?> map) {
+        int maxC = 0;
+        for (int c : map.values()) {
+            maxC = max(c, maxC);
+        }
+        return maxC;
+    }
 
-    public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
-        String key = e.getKey();
-        if (key.contains("_")) {
-            String[] parts = separator.split(e.getKey());
+    public int getTermValue(String key, int count, double maxValue) {
+        if (key.indexOf('_') >= 0) {
+            String[] parts = StringUtils.split(key, '_');
             double totalValue = 0.;
             for (String part : parts) {
-                totalValue += value(part, e.getValue(), maxValue);
+                totalValue += value(part, count, maxValue);
            }
-            return totalValue / parts.length;
+            return normalizeValue(totalValue / parts.length);
         }
         else {
-            return value(key, e.getValue(), maxValue);
+            return normalizeValue(value(key, count, maxValue));
         }
     }
 
+    int normalizeValue(double v) {
+        return (int)(-v*75);
+    }
+
     double value(String key, double value, double maxValue) {
         double freq = dict.getTermFreqStemmed(key);
         if (freq < 1) {
@@ -93,5 +103,5 @@ public class KeywordCounter {
         return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
     }
 
-    public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
+    public record WordFrequencyData(int count, int tfIdfNormalized) { }
 }
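Note: getTermValue() above is a TF-IDF-like score. value() returns (0.1 + 0.9*count/maxCount) * log(freq/docCount), which is negative for terms rarer than one-per-document, and normalizeValue() flips the sign and scales it to an int via (int)(-v*75); countHisto() then keeps candidates with count > 1 and a normalized score above 100. A worked example with invented numbers (docCount and termFreq are assumptions, not real dictionary values):

class TermValueSketch {
    // Same shape as KeywordCounter.value()/normalizeValue() above; the inputs are made up.
    public static void main(String[] args) {
        double docCount = 1_000_000;   // assumed size of the term frequency dictionary
        double termFreq = 100;         // assumed number of documents containing the term
        int count = 3;                 // occurrences in this document
        int maxCount = 12;             // occurrences of the most frequent candidate

        double value = (0.1 + 0.9 * count / maxCount) * Math.log(termFreq / docCount);
        int normalized = (int) (-value * 75);

        // value ~= 0.325 * ln(1e-4) ~= -2.99, so the normalized score is ~224;
        // the rarer the term, the larger the positive normalized score.
        System.out.println(value + " -> " + normalized);
    }
}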
|
@ -1,64 +0,0 @@
|
|||||||
package nu.marginalia.util.language.processing;
|
|
||||||
|
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
|
||||||
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
|
||||||
import nu.marginalia.util.language.processing.model.WordRep;
|
|
||||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
public class LongNameCounter {
|
|
||||||
private final KeywordExtractor keywordExtractor;
|
|
||||||
private final TermFrequencyDict dict;
|
|
||||||
private final double docCount;
|
|
||||||
public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
|
|
||||||
this.dict = dict;
|
|
||||||
docCount = (double) dict.docCount();
|
|
||||||
this.keywordExtractor = keywordExtractor;
|
|
||||||
}
|
|
||||||
|
|
||||||
public List<WordRep> count(DocumentLanguageData dld) {
|
|
||||||
HashMap<String, Double> counts = new HashMap<>(1000);
|
|
||||||
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
|
|
||||||
|
|
||||||
for (int i = 0; i < dld.sentences.length; i++) {
|
|
||||||
DocumentSentence sent = dld.sentences[i];
|
|
||||||
var keywords = keywordExtractor.getNamesStrict(sent);
|
|
||||||
for (var span : keywords) {
|
|
||||||
var stemmed = sent.constructStemmedWordFromSpan(span);
|
|
||||||
counts.merge(stemmed, 1., Double::sum);
|
|
||||||
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return counts.entrySet().stream().filter(e -> termSize(e.getKey()) > 1)
|
|
||||||
.sorted(Comparator.comparing(this::getTermValue))
|
|
||||||
.limit(Math.min(50, counts.size()/3))
|
|
||||||
.map(Map.Entry::getKey)
|
|
||||||
.flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
int termSize(String word) {
|
|
||||||
return 1 + (int) word.chars().filter(c -> c == '_').count();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
final Pattern separator = Pattern.compile("_");
|
|
||||||
|
|
||||||
public double getTermValue(Map.Entry<String, Double> e) {
|
|
||||||
String[] parts = separator.split(e.getKey());
|
|
||||||
double totalValue = 0.;
|
|
||||||
for (String part : parts) {
|
|
||||||
totalValue += value(part, e.getValue());
|
|
||||||
}
|
|
||||||
return totalValue / Math.sqrt(parts.length);
|
|
||||||
}
|
|
||||||
|
|
||||||
double value(String key, double value) {
|
|
||||||
return (1+Math.log(value)) * Math.log((1.1+dict.getTermFreqStemmed(key))/11820118.);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@@ -37,7 +37,8 @@ public class NameCounter {
                 .sorted(Comparator.comparing(e -> -e.getValue()))
                 .limit(150)
                 .map(Map.Entry::getKey)
-                .flatMap(w -> instances.get(w).stream()).collect(Collectors.toList());
+                .flatMap(w -> instances.get(w).stream())
+                .collect(Collectors.toList());
     }
 
 }
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.util.language.processing;
|
package nu.marginalia.util.language.processing;
|
||||||
|
|
||||||
import com.github.datquocnguyen.RDRPOSTagger;
|
import com.github.datquocnguyen.RDRPOSTagger;
|
||||||
|
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
import gnu.trove.list.array.TIntArrayList;
|
||||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
@ -125,11 +126,45 @@ public class SentenceExtractor {
|
|||||||
return counts;
|
return counts;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern dotPattern = Pattern.compile("\\.+$");
|
|
||||||
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
||||||
private static final Pattern spacesPattern = Pattern.compile("\\s+");
|
|
||||||
|
|
||||||
private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
||||||
|
|
||||||
|
private boolean isBadChar(char c) {
|
||||||
|
if (c >= 'a' && c <= 'z') return false;
|
||||||
|
if (c >= 'A' && c <= 'Z') return false;
|
||||||
|
if (c >= '0' && c <= '9') return false;
|
||||||
|
if ("_#@.".indexOf(c) >= 0) return false;
|
||||||
|
if (c >= '\u00C0' && c <= '\u00D6') return false;
|
||||||
|
if (c >= '\u00D8' && c <= '\u00F6') return false;
|
||||||
|
if (c >= '\u00F8' && c <= '\u00FF') return false;
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
private String sanitizeString(String s) {
|
||||||
|
char[] newChars = new char[s.length()];
|
||||||
|
int pi = 0;
|
||||||
|
|
||||||
|
for (int i = 0; i < newChars.length; i++) {
|
||||||
|
char c = s.charAt(i);
|
||||||
|
if (!isBadChar(c)) {
|
||||||
|
newChars[pi++] = c;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
newChars[pi++] = ' ';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
s = new String(newChars, 0, pi);
|
||||||
|
|
||||||
|
if (s.startsWith(".")) {
|
||||||
|
s = s.substring(1);
|
||||||
|
if (s.isBlank())
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public DocumentSentence extractSentence(String text) {
|
public DocumentSentence extractSentence(String text) {
|
||||||
var wordsAndSeps = splitSegment(text);
|
var wordsAndSeps = splitSegment(text);
|
||||||
@ -139,7 +174,7 @@ public class SentenceExtractor {
|
|||||||
var lc = toLc(wordsAndSeps.words);
|
var lc = toLc(wordsAndSeps.words);
|
||||||
|
|
||||||
return new DocumentSentence(
|
return new DocumentSentence(
|
||||||
badCharPattern.matcher(text).replaceAll(" "), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,7 +196,7 @@ public class SentenceExtractor {
|
|||||||
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
sentences = textNormalizedSpaces.split("[.]");
|
sentences = StringUtils.split(textNormalizedSpaces, '.');
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sentences.length > 250) {
|
if (sentences.length > 250) {
|
||||||
@ -196,8 +231,8 @@ public class SentenceExtractor {
|
|||||||
separators[i] = Arrays.copyOf(separators[i], 250);
|
separators[i] = Arrays.copyOf(separators[i], 250);
|
||||||
}
|
}
|
||||||
for (int j = 0; j < tokens[i].length; j++) {
|
for (int j = 0; j < tokens[i].length; j++) {
|
||||||
if (tokens[i][j].endsWith(".")) {
|
while (tokens[i][j].endsWith(".")) {
|
||||||
tokens[i][j] = dotPattern.matcher(tokens[i][j]).replaceAll("");
|
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -216,7 +251,7 @@ public class SentenceExtractor {
|
|||||||
|
|
||||||
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
||||||
for (int i = 0; i < ret.length; i++) {
|
for (int i = 0; i < ret.length; i++) {
|
||||||
ret[i] = new DocumentSentence(badCharPattern.matcher(sentences[i]).replaceAll(" "), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
@@ -5,9 +5,7 @@ import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
 
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;
 
 public class SubjectCounter {
@@ -27,7 +25,9 @@ public class SubjectCounter {
 
     public List<WordRep> count(DocumentLanguageData dld) {
 
-        Map<WordRep, Integer> counts = new HashMap<>();
+        Map<String, Integer> counts = new HashMap<>();
+        Map<String, Set<WordRep>> instances = new HashMap<>();
+
         for (var sentence : dld.sentences) {
             for (WordSpan kw : keywordExtractor.getNames(sentence)) {
                 if (kw.end + 2 >= sentence.length()) {
@@ -41,7 +41,13 @@ public class SubjectCounter {
                 String nextNextTag = sentence.posTags[kw.end+1];
 
                 if (isVerb(nextTag) && isDetOrAdverbOrVerb(nextNextTag)) {
-                    counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)), -1, Integer::sum);
+                    var span = new WordSpan(kw.start, kw.end);
+                    var rep = new WordRep(sentence, span);
+
+                    String stemmed = rep.stemmed;
+
+                    counts.merge(stemmed, -1, Integer::sum);
+                    instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
                 }
             }
         }
@@ -49,8 +55,8 @@ public class SubjectCounter {
         int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
 
         return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-                .filter(e -> e.getValue()<-2 && e.getValue()<best*0.75)
-                .map(Map.Entry::getKey)
+                .filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
+                .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
                 .collect(Collectors.toList());
     }
 
@@ -0,0 +1,58 @@
+package nu.marginalia.util.language.processing.model;
+
+import nu.marginalia.util.language.processing.KeywordCounter;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
+
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+
+public record KeywordMetadata(HashSet<String> titleKeywords,
+                              HashSet<String> subjectKeywords,
+                              HashSet<String> namesKeywords,
+                              HashMap<String, KeywordCounter.WordFrequencyData> wordsTfIdf,
+                              HashMap<String, Integer> positionMask,
+                              EnumSet<EdgePageWordFlags> flagsTemplate,
+                              int quality
+                              )
+{
+
+    private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
+
+    public KeywordMetadata(double quality, EnumSet<EdgePageWordFlags> flags) {
+        this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
+             new HashMap<>(15_000),
+             new HashMap<>(10_000),
+             flags,
+             (int)(-quality));
+    }
+
+    public KeywordMetadata(double quality) {
+        this(quality, EnumSet.noneOf(EdgePageWordFlags.class));
+    }
+
+    public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
+
+        KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
+        EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
+
+        if (subjectKeywords.contains(stemmed))
+            flags.add(EdgePageWordFlags.Subjects);
+
+        if (namesKeywords.contains(stemmed))
+            flags.add(EdgePageWordFlags.NamesWords);
+
+        if (titleKeywords.contains(stemmed))
+            flags.add(EdgePageWordFlags.Title);
+
+        int positions = positionMask.getOrDefault(stemmed, 0);
+
+        return new EdgePageWordMetadata(tfidf.tfIdfNormalized(), positions, quality, tfidf.count(), flags).encode();
+    }
+
+    public int quality() {
+        return -quality;
+    }
+
+}
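Note: a usage sketch of the new KeywordMetadata record against the classes introduced in this patch. All values are invented, imports from the patch's own packages are omitted, and the stemmed form "adiabat" is only an example:

        // quality is stored negated internally; -5.0 here is an arbitrary example
        KeywordMetadata metadata = new KeywordMetadata(-5.0);

        metadata.titleKeywords().add("adiabat");       // word also occurs in the title
        metadata.subjectKeywords().add("adiabat");     // and was picked up as a subject
        metadata.wordsTfIdf().put("adiabat", new KeywordCounter.WordFrequencyData(3, 224));
        metadata.positionMask().put("adiabat", 0b11);  // title + first body block

        EnumSet<EdgePageWordFlags> flags = EnumSet.noneOf(EdgePageWordFlags.class);
        long encoded = metadata.forWord(flags, "adiabat");
        // 'encoded' packs the normalized tf-idf, the position mask, the quality, the raw
        // count and the Title/Subjects flags into one long via EdgePageWordMetadata.encode().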
@@ -1,21 +1,22 @@
 package nu.marginalia.util.language.processing.model;
 
 import lombok.AllArgsConstructor;
-import lombok.EqualsAndHashCode;
 import lombok.Getter;
 import org.jetbrains.annotations.NotNull;
 
 import java.util.Objects;
 
-@AllArgsConstructor @EqualsAndHashCode @Getter
+@AllArgsConstructor @Getter
 public class WordRep implements Comparable<WordRep> {
 
     public WordRep(DocumentSentence sent, WordSpan span) {
         word = sent.constructWordFromSpan(span);
         stemmed = sent.constructStemmedWordFromSpan(span);
         length = span.end - span.start;
 
         hashCode = Objects.hash(word);
     }
 
     public final int length;
     public final String word;
     public final String stemmed;
@@ -34,4 +35,12 @@ public class WordRep implements Comparable<WordRep> {
     public int hashCode() {
         return hashCode;
     }
 
+    public boolean equals(Object other) {
+        if (other == this) return true;
+        if (other instanceof WordRep wr) {
+            return Objects.equals(wr.word, word);
+        }
+        return false;
+    }
 }
|
@ -2,6 +2,7 @@ package nu.marginalia.util.multimap;
|
|||||||
|
|
||||||
import com.upserve.uppend.blobs.NativeIO;
|
import com.upserve.uppend.blobs.NativeIO;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.btree.BTreeQueryBuffer;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -100,15 +101,15 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
public MultimapSearcherBase createSearcher() {
|
public MultimapSearcherBase createSearcher() {
|
||||||
return new MultimapSearcherBase(this);
|
return new MultimapSearcherBase(this);
|
||||||
}
|
}
|
||||||
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit) {
|
public MultimapSorter createSorter(Path tmpFile, int internalSortLimit, int minStepSize) {
|
||||||
return new MultimapSorter(this, tmpFile, internalSortLimit);
|
return new MultimapSorter(this, tmpFile, internalSortLimit, minStepSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public void advice(NativeIO.Advice advice) {
|
public void advice(NativeIO.Advice advice) {
|
||||||
this.defaultAdvice = advice;
|
this.defaultAdvice = advice;
|
||||||
for (var buffer : mappedByteBuffers) {
|
for (var buffer : mappedByteBuffers) {
|
||||||
NativeIO.madvise(buffer, advice);
|
NativeIO.madvise(buffer, advice);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -340,6 +341,49 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(LongBuffer vals, int n, long idx) {
|
||||||
|
if (idx+n >= mappedSize) {
|
||||||
|
grow(idx+n);
|
||||||
|
}
|
||||||
|
int iN = (int)((idx + n) / bufferSize);
|
||||||
|
|
||||||
|
for (int i = 0; i < n; ) {
|
||||||
|
int i0 = (int)((idx + i) / bufferSize);
|
||||||
|
|
||||||
|
int bufferOffset = (int) ((idx+i) % bufferSize);
|
||||||
|
var buffer = buffers.get(i0);
|
||||||
|
|
||||||
|
final int l;
|
||||||
|
|
||||||
|
if (i0 < iN) l = bufferSize - bufferOffset;
|
||||||
|
else l = Math.min(n - i, bufferSize - bufferOffset);
|
||||||
|
|
||||||
|
buffer.put(bufferOffset, vals, vals.position() + i, l);
|
||||||
|
i+=l;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void swapn(int n, long idx1, long idx2) {
|
||||||
|
for (int i = 0; i < n; i++)
|
||||||
|
swap(idx1+i, idx2+i);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void swap(long idx1, long idx2) {
|
||||||
|
LongBuffer buff1 = buffers.get((int)(idx1) / bufferSize);
|
||||||
|
final int o1 = (int) (idx1) % bufferSize;
|
||||||
|
|
||||||
|
LongBuffer buff2 = buffers.get((int)(idx2) / bufferSize);
|
||||||
|
final int o2 = (int) (idx2) % bufferSize;
|
||||||
|
|
||||||
|
long tmp = buff1.get(o1);
|
||||||
|
buff1.put(o1, buff2.get(o2));
|
||||||
|
buff2.put(o2, tmp);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setRange(long idx, int n, long val) {
|
public void setRange(long idx, int n, long val) {
|
||||||
if (n == 0) return;
|
if (n == 0) return;
|
||||||
@ -410,6 +454,387 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||||
|
if (fromIndex + n*step >= mappedSize)
|
||||||
|
grow(fromIndex + n*step);
|
||||||
|
|
||||||
|
long low = 0;
|
||||||
|
long high = n - 1;
|
||||||
|
|
||||||
|
if (fromIndex/bufferSize == (fromIndex+step*n)/bufferSize) {
|
||||||
|
int idx = (int)(fromIndex / bufferSize);
|
||||||
|
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid*step;
|
||||||
|
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid*step;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid*step;
|
||||||
|
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid*step;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1L-(fromIndex + high*step);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||||
|
if (fromIndex + n >= mappedSize)
|
||||||
|
grow(fromIndex + n);
|
||||||
|
|
||||||
|
long low = 0;
|
||||||
|
long high = n - 1;
|
||||||
|
|
||||||
|
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||||
|
int idx = (int)(fromIndex / bufferSize);
|
||||||
|
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid;
|
||||||
|
long midVal = buffers.get(idx).get((int)(off % bufferSize)) & mask;
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid;
|
||||||
|
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize)) & mask;
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1L-(fromIndex + high);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||||
|
if (fromIndex + n >= mappedSize)
|
||||||
|
grow(fromIndex + n);
|
||||||
|
|
||||||
|
long low = 0;
|
||||||
|
long high = n - 1;
|
||||||
|
|
||||||
|
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||||
|
int idx = (int)(fromIndex / bufferSize);
|
||||||
|
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid;
|
||||||
|
long midVal = buffers.get(idx).get((int)(off % bufferSize));
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid;
|
||||||
|
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return -1L-(fromIndex + high);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||||
|
if (fromIndex + n >= mappedSize)
|
||||||
|
grow(fromIndex + n);
|
||||||
|
|
||||||
|
long low = 0;
|
||||||
|
long high = n - 1;
|
||||||
|
|
||||||
|
if (fromIndex/bufferSize == (fromIndex+n)/bufferSize) {
|
||||||
|
int idx = (int)(fromIndex / bufferSize);
|
||||||
|
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid;
|
||||||
|
long midVal = buffers.get(idx).get((int)(off % bufferSize));
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
while (low <= high) {
|
||||||
|
long mid = (low + high) >>> 1;
|
||||||
|
long off = fromIndex + mid;
|
||||||
|
long midVal = buffers.get((int)(off / bufferSize)).get((int)(off % bufferSize));
|
||||||
|
|
||||||
|
if (midVal < key)
|
||||||
|
low = mid + 1;
|
||||||
|
else if (midVal > key)
|
||||||
|
high = mid - 1;
|
||||||
|
else
|
||||||
|
return fromIndex + mid;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return fromIndex + low;
|
||||||
|
}
|
||||||
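Note: the binarySearchInternal()/binarySearchUpperInternal() additions above run a binary search over masked 64-bit entries stored in mapped buffers, returning the hit index or a negative encoding of the last probe when the key is absent. A minimal sketch of the same idea over a plain long[] (the array form is a simplification; the real code walks LongBuffers and has a single-buffer fast path):

class MaskedBinarySearchSketch {
    // Binary search over sorted (data[i] & mask) values; returns the index when found,
    // otherwise a negative encoding of the last probe, as in the patch.
    static long binarySearch(long[] data, long fromIndex, long n, long key, long mask) {
        long low = 0;
        long high = n - 1;
        while (low <= high) {
            long mid = (low + high) >>> 1;
            long midVal = data[(int) (fromIndex + mid)] & mask;
            if (midVal < key) low = mid + 1;
            else if (midVal > key) high = mid - 1;
            else return fromIndex + mid;
        }
        return -1L - (fromIndex + high);
    }

    public static void main(String[] args) {
        // Keys live in the low 32 bits; the high bits carry unrelated metadata.
        long[] data = { (7L << 32) | 10, (3L << 32) | 20, (9L << 32) | 35 };
        long mask = 0xFFFF_FFFFL;
        System.out.println(binarySearch(data, 0, data.length, 20, mask)); // 1 (found)
        System.out.println(binarySearch(data, 0, data.length, 25, mask)); // negative: not present
    }
}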
|
|
||||||
|
private boolean isSameBuffer(long a, long b) {
|
||||||
|
return a / bufferSize == b/bufferSize;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long quickSortPartition(int wordSize, long low, long high) {
|
||||||
|
if (high >= mappedSize)
|
||||||
|
grow(high + wordSize - 1);
|
||||||
|
|
||||||
|
if (isSameBuffer(low, high + wordSize - 1)) {
|
||||||
|
// Specialization that circumvents the need for expensive calls to
|
||||||
|
// MultimapFileLong.get() in the most common scenario
|
||||||
|
|
||||||
|
return quickSortPartitionSameBuffer(wordSize, low, high);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return quickSortPartitionDifferentBuffers(wordSize, low, high);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void insertionSort(int wordSize, long start, int n) {
|
||||||
|
if (start + n + wordSize - 1 >= mappedSize)
|
||||||
|
grow(start + n + wordSize - 1);
|
||||||
|
|
||||||
|
if (n == 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isSameBuffer(start, start + (long)n*wordSize-1L)) {
|
||||||
|
final var buffer = buffers.get((int) (start / bufferSize));
|
||||||
|
int off = (int) (start % bufferSize);
|
||||||
|
|
||||||
|
for (int i = 1; i < n; i++) {
|
||||||
|
for (int j = i; j > 0; j--) {
|
||||||
|
int a = off + wordSize*(j-1);
|
||||||
|
int b = off + wordSize*j;
|
||||||
|
|
||||||
|
if (buffer.get(a) > buffer.get(b)) {
|
||||||
|
for (int w = 0; w < wordSize; w++) {
|
||||||
|
long tmp = buffer.get(a+w);
|
||||||
|
buffer.put(a+w, buffer.get(b+w));
|
||||||
|
buffer.put(b+w, tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else for (int i = 1; i < n; i++) {
|
||||||
|
for (int j = i; j > 0; j--) {
|
||||||
|
long a = start + (long)wordSize*(j-1);
|
||||||
|
long b = start + (long)wordSize*j;
|
||||||
|
|
||||||
|
if (get(a) > get(b)) {
|
||||||
|
swap(a, b);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private long quickSortPartitionDifferentBuffers(int wordSize, long low, long high) {
|
||||||
|
|
||||||
|
long pivotPoint = ((low + high) / (2L*wordSize)) * wordSize;
|
||||||
|
long pivot = get(pivotPoint);
|
||||||
|
|
||||||
|
long i = low - wordSize;
|
||||||
|
long j = high + wordSize;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
do {
|
||||||
|
i+=wordSize;
|
||||||
|
} while (get(i) < pivot);
|
||||||
|
|
||||||
|
do {
|
||||||
|
j-=wordSize;
|
||||||
|
}
|
||||||
|
while (get(j) > pivot);
|
||||||
|
|
||||||
|
if (i >= j) return j;
|
||||||
|
else swapn(wordSize, i, j);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private long quickSortPartitionSameBuffer(int wordSize, long low, long high) {
|
||||||
|
|
||||||
|
final var buffer = buffers.get((int) (low / bufferSize));
|
||||||
|
|
||||||
|
int pivotPoint = (int) ((low + high) / (2L*wordSize)) * wordSize % bufferSize;
|
||||||
|
long pivot = buffer.get(pivotPoint);
|
||||||
|
|
||||||
|
int j = (int) (high) % bufferSize + wordSize;
|
||||||
|
int i = (int) (low) % bufferSize - wordSize;
|
||||||
|
|
||||||
|
long j0 = high + wordSize - j;
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
do {
|
||||||
|
i+=wordSize;
|
||||||
|
} while (buffer.get(i) < pivot);
|
||||||
|
|
||||||
|
do {
|
||||||
|
j-=wordSize;
|
||||||
|
}
|
||||||
|
while (buffer.get(j) > pivot);
|
||||||
|
|
||||||
|
if (i >= j) return j0 + j;
|
||||||
|
else {
|
||||||
|
for (int w = 0; w < wordSize; w++) {
|
||||||
|
long tmp = buffer.get(i+w);
|
||||||
|
buffer.put(i+w, buffer.get(j+w));
|
||||||
|
buffer.put(j+w, tmp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
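Note: insertionSort() and quickSortPartition() above sort fixed-size records of wordSize longs, keyed on the first long of each record, directly inside the mapped file. A self-contained sketch of the same partition-and-sort scheme over a plain array (names are illustrative; the in-file version additionally specializes the common single-buffer case):

import java.util.Arrays;

class RecordSortSketch {
    // Hoare partition over records of `wordSize` longs, keyed on each record's first long.
    static long partition(long[] a, int wordSize, long low, long high) {
        long pivot = a[(int) (((low + high) / (2L * wordSize)) * wordSize)];
        long i = low - wordSize;
        long j = high + wordSize;
        for (;;) {
            do { i += wordSize; } while (a[(int) i] < pivot);
            do { j -= wordSize; } while (a[(int) j] > pivot);
            if (i >= j) return j;
            for (int w = 0; w < wordSize; w++) {          // swap whole records
                long tmp = a[(int) (i + w)];
                a[(int) (i + w)] = a[(int) (j + w)];
                a[(int) (j + w)] = tmp;
            }
        }
    }

    static void quickSort(long[] a, int wordSize, long low, long highInclusive) {
        if (low < highInclusive) {
            long p = partition(a, wordSize, low, highInclusive);
            quickSort(a, wordSize, low, p);
            quickSort(a, wordSize, p + wordSize, highInclusive);
        }
    }

    public static void main(String[] args) {
        // Records of two longs: (key, payload).
        long[] data = { 30, 300, 10, 100, 20, 200 };
        quickSort(data, 2, 0, data.length - 2);
        System.out.println(Arrays.toString(data)); // [10, 100, 20, 200, 30, 300]
    }
}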
|
|
||||||
|
|
||||||
|
|
||||||
|
public void retain(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||||
|
|
||||||
|
final long end = searchStart + stepSize * numEntries;
|
||||||
|
if (end < mappedSize) {
|
||||||
|
grow(end);
|
||||||
|
}
|
||||||
|
|
||||||
|
long bv = buffer.currentValue() & mask;
|
||||||
|
long av = get(searchStart) & mask;
|
||||||
|
long pos = searchStart;
|
||||||
|
|
||||||
|
int bi = (int)(searchStart / bufferSize);
|
||||||
|
int bo = (int)(searchStart % bufferSize);
|
||||||
|
|
||||||
|
LongBuffer data = buffers.get(bi);
|
||||||
|
|
||||||
|
while (bv <= boundary && buffer.hasMore()) {
|
||||||
|
if (bv < av) {
|
||||||
|
if (!buffer.rejectAndAdvance()) break;
|
||||||
|
bv = buffer.currentValue() & mask;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (bv == av) {
|
||||||
|
if (!buffer.retainAndAdvance()) break;
|
||||||
|
bv = buffer.currentValue() & mask;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
pos += stepSize;
|
||||||
|
if (pos < end) {
|
||||||
|
bo += stepSize;
|
||||||
|
if (bo >= bufferSize) {
|
||||||
|
data = buffers.get(++bi);
|
||||||
|
bo = 0;
|
||||||
|
}
|
||||||
|
av = data.get(bo) & mask;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void reject(BTreeQueryBuffer buffer, long boundary, long searchStart, long numEntries, long mask, int stepSize) {
|
||||||
|
|
||||||
|
final long end = searchStart + stepSize * numEntries;
|
||||||
|
if (end < mappedSize) {
|
||||||
|
grow(end);
|
||||||
|
}
|
||||||
|
|
||||||
|
long bv = buffer.currentValue() & mask;
|
||||||
|
long av = get(searchStart) & mask;
|
||||||
|
long pos = searchStart;
|
||||||
|
|
||||||
|
int bi = (int)(searchStart / bufferSize);
|
||||||
|
int bo = (int)(searchStart % bufferSize);
|
||||||
|
|
||||||
|
LongBuffer data = buffers.get(bi);
|
||||||
|
|
||||||
|
while (bv <= boundary && buffer.hasMore()) {
|
||||||
|
if (bv < av) {
|
||||||
|
if (!buffer.retainAndAdvance()) break;
|
||||||
|
bv = buffer.currentValue() & mask;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else if (bv == av) {
|
||||||
|
if (!buffer.rejectAndAdvance()) break;
|
||||||
|
bv = buffer.currentValue() & mask;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
pos += stepSize;
|
||||||
|
if (pos < end) {
|
||||||
|
bo += stepSize;
|
||||||
|
if (bo >= bufferSize) {
|
||||||
|
data = buffers.get(++bi);
|
||||||
|
bo = 0;
|
||||||
|
}
|
||||||
|
av = data.get(bo) & mask;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
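Note: retain() and reject() above walk a sorted on-disk range and the query buffer in lock step, keeping (or dropping) query values that are present among the masked entries, up to a boundary value. A simplified sketch of the same sorted-merge walk over plain arrays (it returns a list instead of mutating a BTreeQueryBuffer in place, and omits the boundary cut-off):

import java.util.ArrayList;
import java.util.List;

class RetainSketch {
    // Sorted-merge intersection: advance the query side and the data side together,
    // keeping query values also present in the (masked) data. reject() is the same
    // walk with the keep/drop decision inverted.
    static List<Long> retain(long[] queries, long[] data, long mask) {
        List<Long> kept = new ArrayList<>();
        int qi = 0, di = 0;
        while (qi < queries.length && di < data.length) {
            long qv = queries[qi] & mask;
            long dv = data[di] & mask;
            if (qv < dv) qi++;                           // query value not in the data: drop it
            else if (qv == dv) kept.add(queries[qi++]);  // match: keep and advance the query
            else di++;                                   // step the data cursor forward
        }
        return kept;
    }

    public static void main(String[] args) {
        long[] queries = { 2, 5, 9, 12 };
        long[] data = { 1, 2, 3, 9, 10 };
        System.out.println(retain(queries, data, ~0L)); // [2, 9]
    }
}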
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
@ -424,6 +849,4 @@ public class MultimapFileLong implements AutoCloseable, MultimapFileLongSlice {
|
|||||||
System.runFinalization();
|
System.runFinalization();
|
||||||
System.gc();
|
System.gc();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -61,6 +61,17 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
|||||||
map.write(vals, idx+off);
|
map.write(vals, idx+off);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(LongBuffer vals, int n, long idx) {
|
||||||
|
map.write(vals, n,idx+off);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void swapn(int n, long idx1, long idx2) {
|
||||||
|
map.swapn(n, idx1+off, idx2+off);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
|
public void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
@ -75,4 +86,35 @@ public class MultimapFileLongOffsetSlice implements MultimapFileLongSlice {
|
|||||||
|
|
||||||
return new MultimapFileLongOffsetSlice(map, this.off + off);
|
return new MultimapFileLongOffsetSlice(map, this.off + off);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchInternal(long key, long fromIndex, int step, long n, long mask) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchInternal(long key, long fromIndex, long n, long mask) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchInternal(long key, long fromIndex, long n) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long binarySearchUpperInternal(long key, long fromIndex, long n) {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long quickSortPartition(int wordSize, long low, long highInclusive) {
|
||||||
|
return map.quickSortPartition(wordSize, low+off, highInclusive+off);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void insertionSort(int wordSize, long start, int n) {
|
||||||
|
map.insertionSort(wordSize, start+off, n);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -25,9 +25,23 @@ public interface MultimapFileLongSlice {
 
     void write(LongBuffer vals, long idx);
 
+    void write(LongBuffer vals, int n, long idx);
+
+    void swapn(int n, long idx1, long idx2);
+
     void transferFromFileChannel(FileChannel sourceChannel, long destOffset, long sourceStart, long sourceEnd) throws IOException;
 
     default MultimapFileLongSlice atOffset(long off) {
         return new MultimapFileLongOffsetSlice(this, off);
     }
+    long binarySearchInternal(long key, long fromIndex, int step, long n, long mask);
+    long binarySearchInternal(long key, long fromIndex, long n, long mask);
+
+    long binarySearchInternal(long key, long fromIndex, long n);
+
+    long binarySearchUpperInternal(long key, long fromIndex, long n);
+
+    long quickSortPartition(int wordSize, long low, long highInclusive);
+
+    void insertionSort(int wordSize, long start, int n);
 }
@@ -1,7 +1,7 @@
 package nu.marginalia.util.multimap;
 
 public interface MultimapSearcher {
-    long binarySearchUpper(long key, long fromIndex, long n);
+    long binarySearchLower(long key, long fromIndex, long n);
     long binarySearch(long key, long fromIndex, long n);
 
     static MultimapSearcher forContext(MultimapFileLongSlice slice, long mask, int stepSize) {
@@ -25,8 +25,8 @@ class SimpleMultimapSearcher implements MultimapSearcher {
     }
 
     @Override
-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        return base.binarySearchUpper(key, fromIndex, n);
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return base.binarySearchLower(key, fromIndex, n);
     }
 
     @Override
@@ -46,8 +46,8 @@ class MaskedMultimapSearcher implements MultimapSearcher {
     }
 
     @Override
-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        return base.binarySearchUpper(key, fromIndex, n, mask);
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return base.binarySearchLower(key, fromIndex, n, mask);
     }
 
     @Override
@@ -69,8 +69,8 @@ class SteppingMaskedMultimapSearcher implements MultimapSearcher {
     }
 
     @Override
-    public long binarySearchUpper(long key, long fromIndex, long n) {
-        return base.binarySearchUpper(key, fromIndex, step, n, mask);
+    public long binarySearchLower(long key, long fromIndex, long n) {
+        return base.binarySearchLower(key, fromIndex, step, n, mask);
     }
 
     @Override
|
@ -29,26 +29,12 @@ public class MultimapSearcherBase {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long binarySearchUpper(long key, long fromIndex, long n) {
|
public long binarySearchLower(long key, long fromIndex, long n) {
|
||||||
long low = 0;
|
return mmf.binarySearchUpperInternal(key, fromIndex, n);
|
||||||
long high = n - 1;
|
|
||||||
|
|
||||||
while (low <= high) {
|
|
||||||
long mid = (low + high) >>> 1;
|
|
||||||
long midVal = get(fromIndex + mid);
|
|
||||||
|
|
||||||
if (midVal < key)
|
|
||||||
low = mid + 1;
|
|
||||||
else if (midVal > key)
|
|
||||||
high = mid - 1;
|
|
||||||
else
|
|
||||||
return fromIndex + mid;
|
|
||||||
}
|
|
||||||
return fromIndex + low;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public long binarySearchUpper(long key, long fromIndex, long n, long mask) {
|
public long binarySearchLower(long key, long fromIndex, long n, long mask) {
|
||||||
long low = 0;
|
long low = 0;
|
||||||
long high = n - 1;
|
long high = n - 1;
|
||||||
|
|
||||||
@ -67,7 +53,7 @@ public class MultimapSearcherBase {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public long binarySearchUpper(long key, long fromIndex, int step, long n, long mask) {
|
public long binarySearchLower(long key, long fromIndex, int step, long n, long mask) {
|
||||||
long low = 0;
|
long low = 0;
|
||||||
long high = n - 1;
|
long high = n - 1;
|
||||||
|
|
||||||
@ -82,62 +68,19 @@ public class MultimapSearcherBase {
|
|||||||
else
|
else
|
||||||
return fromIndex + mid*step;
|
return fromIndex + mid*step;
|
||||||
}
|
}
|
||||||
return fromIndex + low;
|
return fromIndex + low*step;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long binarySearch(long key, long fromIndex, long n) {
|
public long binarySearch(long key, long fromIndex, long n) {
|
||||||
long low = 0;
|
return mmf.binarySearchInternal(key, fromIndex, n);
|
||||||
long high = n - 1;
|
|
||||||
|
|
||||||
while (low <= high) {
|
|
||||||
long mid = (low + high) >>> 1;
|
|
||||||
long midVal = get(fromIndex + mid);
|
|
||||||
|
|
||||||
if (midVal < key)
|
|
||||||
low = mid + 1;
|
|
||||||
else if (midVal > key)
|
|
||||||
high = mid - 1;
|
|
||||||
else
|
|
||||||
return fromIndex + mid;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public long binarySearch(long key, long fromIndex, long n, long mask) {
|
public long binarySearch(long key, long fromIndex, long n, long mask) {
|
||||||
long low = 0;
|
return mmf.binarySearchInternal(key, fromIndex, n, mask);
|
||||||
long high = n - 1;
|
|
||||||
|
|
||||||
while (low <= high) {
|
|
||||||
long mid = (low + high) >>> 1;
|
|
||||||
long midVal = get(fromIndex + mid) & mask;
|
|
||||||
|
|
||||||
if (midVal < key)
|
|
||||||
low = mid + 1;
|
|
||||||
else if (midVal > key)
|
|
||||||
high = mid - 1;
|
|
||||||
else
|
|
||||||
return fromIndex + mid;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
|
public long binarySearch(long key, long fromIndex, int step, long n, long mask) {
|
||||||
long low = 0;
|
return mmf.binarySearchInternal(key, fromIndex, step, n, mask);
|
||||||
long high = n - 1;
|
|
||||||
|
|
||||||
while (low <= high) {
|
|
||||||
long mid = (low + high) >>> 1;
|
|
||||||
long midVal = get(fromIndex + mid*step) & mask;
|
|
||||||
|
|
||||||
if (midVal < key)
|
|
||||||
low = mid + 1;
|
|
||||||
else if (midVal > key)
|
|
||||||
high = mid - 1;
|
|
||||||
else
|
|
||||||
return fromIndex + mid*step;
|
|
||||||
}
|
|
||||||
return -1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,87 +1,147 @@
|
|||||||
package nu.marginalia.util.multimap;
|
package nu.marginalia.util.multimap;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.RandomAccessFile;
|
import java.io.RandomAccessFile;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.LongBuffer;
|
import java.nio.LongBuffer;
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Arrays;
|
|
||||||
|
|
||||||
import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
|
import static nu.marginalia.util.multimap.MultimapFileLong.WORD_SIZE;
|
||||||
|
|
||||||
public class MultimapSorter {
|
public class MultimapSorter {
|
||||||
private final Path tmpFileDir;
|
private final Path tmpFileDir;
|
||||||
private final int internalSortLimit;
|
|
||||||
private final MultimapFileLongSlice multimapFileLong;
|
private final MultimapFileLongSlice multimapFileLong;
|
||||||
private final long[] buffer;
|
private final LongBuffer buffer;
|
||||||
|
private final int internalSortLimit;
|
||||||
|
private final int wordSize;
|
||||||
|
|
||||||
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit) {
|
private static final Logger logger = LoggerFactory.getLogger(MultimapSorter.class);
|
||||||
|
|
||||||
|
public MultimapSorter(MultimapFileLongSlice multimapFileLong, Path tmpFileDir, int internalSortLimit, int wordSize) {
|
||||||
this.multimapFileLong = multimapFileLong;
|
this.multimapFileLong = multimapFileLong;
|
||||||
this.tmpFileDir = tmpFileDir;
|
this.tmpFileDir = tmpFileDir;
|
||||||
this.internalSortLimit = internalSortLimit;
|
this.internalSortLimit = internalSortLimit;
|
||||||
buffer = new long[internalSortLimit];
|
this.wordSize = wordSize;
|
||||||
|
buffer = ByteBuffer.allocateDirect(internalSortLimit * wordSize * 8).asLongBuffer();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void sort(long start, int length) throws IOException {
|
public void sortRange(long start, long end) throws IOException {
|
||||||
if (length <= internalSortLimit) {
|
if (end - start < internalSortLimit) {
|
||||||
multimapFileLong.read(buffer, length, start);
|
quickSortLH(start, end - wordSize);
|
||||||
Arrays.sort(buffer, 0, length);
|
|
||||||
multimapFileLong.write(buffer, length, start);
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
externalSort(start, length);
|
mergeSort(start, (int) (end - start));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (long lp = start + wordSize; lp < end; lp += wordSize) {
|
||||||
|
if (multimapFileLong.get(lp - wordSize) > multimapFileLong.get(lp)) {
|
||||||
|
|
||||||
|
logger.error("Sort contract breached [{}:{} ({}), ws={}, <isl={}, bc={}]",
|
||||||
|
start, end,
|
||||||
|
end - start,
|
||||||
|
wordSize, end - start < internalSortLimit,
|
||||||
|
buffer.capacity());
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void mergeSort(long start, int lengthLongs) throws IOException {
|
||||||
|
if (lengthLongs == 1)
|
||||||
|
return;
|
||||||
|
|
||||||
private void externalSort(long start, int length) throws IOException {
|
if (lengthLongs < buffer.capacity()) {
|
||||||
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+length), ".dat");
|
mergeSort(start, lengthLongs, buffer);
|
||||||
|
|
||||||
try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
|
|
||||||
var workBuffer =
|
|
||||||
channel.map(FileChannel.MapMode.READ_WRITE, 0, length * WORD_SIZE)
|
|
||||||
.asLongBuffer();
|
|
||||||
|
|
||||||
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(internalSortLimit));
|
|
||||||
|
|
||||||
// Do in-memory sorting up until internalSortLimit first
|
|
||||||
for (int i = 0; i < length; i += width) {
|
|
||||||
sort(start + i, Math.min(width, length-i));
|
|
||||||
}
|
|
||||||
|
|
||||||
// Then merge sort on disk for the rest
|
|
||||||
for (; width < length; width*=2) {
|
|
||||||
|
|
||||||
for (int i = 0; i < length; i += 2*width) {
|
|
||||||
merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
|
|
||||||
}
|
|
||||||
|
|
||||||
workBuffer.clear();
|
|
||||||
multimapFileLong.write(workBuffer, start);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
finally {
|
else {
|
||||||
tmpFile.toFile().delete();
|
Path tmpFile = Files.createTempFile(tmpFileDir,"sort-"+start+"-"+(start+lengthLongs), ".dat");
|
||||||
|
try (var raf = new RandomAccessFile(tmpFile.toFile(), "rw"); var channel = raf.getChannel()) {
|
||||||
|
var workBuffer =
|
||||||
|
channel.map(FileChannel.MapMode.READ_WRITE, 0, wordSize * lengthLongs * WORD_SIZE)
|
||||||
|
.asLongBuffer();
|
||||||
|
mergeSort(start, lengthLongs, workBuffer);
|
||||||
|
}
|
||||||
|
finally {
|
||||||
|
tmpFile.toFile().delete();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
private void mergeSort(long start, int length, LongBuffer workBuffer) throws IOException {
|
||||||
|
int width = Math.min(Integer.highestOneBit(length), Integer.highestOneBit(buffer.capacity()));
|
||||||
|
|
||||||
|
// Do in-memory sorting up until internalSortLimit first
|
||||||
|
for (int i = 0; i < length; i += width) {
|
||||||
|
quickSort(start + i, Math.min(width, length-i));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Then finish with merge sort
|
||||||
|
for (; width < length; width*=2) {
|
||||||
|
|
||||||
|
for (int i = 0; i < length; i += 2*width) {
|
||||||
|
merge(start, i, Math.min(i+width, length), Math.min(i+2*width, length), workBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
workBuffer.clear();
|
||||||
|
multimapFileLong.write(workBuffer, length, start);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
|
void merge(long offset, int left, int right, int end, LongBuffer workBuffer) {
|
||||||
int i = left;
|
long idxL = left;
|
||||||
int j = right;
|
long idxR = right;
|
||||||
|
|
||||||
for (int k = left; k < end; k++) {
|
for (int putPos = left; putPos < end; putPos+= wordSize) {
|
||||||
final long bufferI = multimapFileLong.get(offset+i);
|
final long bufferL = multimapFileLong.get(offset+idxL);
|
||||||
final long bufferJ = multimapFileLong.get(offset+j);
|
final long bufferR = multimapFileLong.get(offset+idxR);
|
||||||
|
|
||||||
if (i < right && (j >= end || bufferI < bufferJ)) {
|
if (idxL < right && (idxR >= end || bufferL < bufferR)) {
|
||||||
workBuffer.put(k, bufferI);
|
workBuffer.put(putPos, bufferL);
|
||||||
i++;
|
for (int s = 1; s < wordSize; s++) {
|
||||||
|
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxL + s));
|
||||||
|
}
|
||||||
|
idxL+= wordSize;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
workBuffer.put(k, bufferJ);
|
workBuffer.put(putPos, bufferR);
|
||||||
j++;
|
for (int s = 1; s < wordSize; s++) {
|
||||||
|
workBuffer.put(putPos + s, multimapFileLong.get(offset + idxR + s));
|
||||||
|
}
|
||||||
|
idxR+= wordSize;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void insertionSort(long start, int n) {
|
||||||
|
multimapFileLong.insertionSort(wordSize, start, n);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void swap(long a, long b) {
|
||||||
|
multimapFileLong.swapn(wordSize, a, b);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void quickSort(long start, long length) {
|
||||||
|
quickSortLH(start, start + length - wordSize);
|
||||||
|
|
||||||
|
}
|
||||||
|
public void quickSortLH(long low, long highInclusive) {
|
||||||
|
|
||||||
|
if (low >= 0 && highInclusive >= 0 && low < highInclusive) {
|
||||||
|
|
||||||
|
if (highInclusive - low < 32) {
|
||||||
|
multimapFileLong.insertionSort(wordSize, low, (int) (1 + (highInclusive - low) / wordSize));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
long p = multimapFileLong.quickSortPartition(wordSize, low, highInclusive);
|
||||||
|
|
||||||
|
quickSortLH(low, p);
|
||||||
|
quickSortLH(p + wordSize, highInclusive);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
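The reworked MultimapSorter above sorts fixed-size blocks in memory and then merges them with doubling widths, spilling to a memory-mapped temporary file when a range exceeds the internal buffer. A simplified, purely in-memory sketch of the same bottom-up strategy (single-word records, illustrative names, no wordSize or work-file handling):

import java.util.Arrays;

class BottomUpMergeSortSketch {
    static void sort(long[] data, int blockSize) {
        int n = data.length;
        int width = Integer.highestOneBit(Math.max(1, Math.min(n, blockSize)));

        // In-memory sort of fixed-size blocks first
        for (int i = 0; i < n; i += width) {
            Arrays.sort(data, i, Math.min(i + width, n));
        }

        // Then merge neighbouring blocks, doubling the width each pass
        long[] work = new long[n];
        for (; width < n; width *= 2) {
            for (int i = 0; i < n; i += 2 * width) {
                merge(data, i, Math.min(i + width, n), Math.min(i + 2 * width, n), work);
            }
            System.arraycopy(work, 0, data, 0, n);
        }
    }

    static void merge(long[] data, int left, int right, int end, long[] work) {
        int i = left, j = right;
        for (int k = left; k < end; k++) {
            if (i < right && (j >= end || data[i] <= data[j])) work[k] = data[i++];
            else work[k] = data[j++];
        }
    }
}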
@ -11,27 +11,16 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.LoggerFactory;

import java.sql.SQLException;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class UpdateDomainRanksTool2 {
public class UpdateDomainRanksTool2 {

private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);

public Set<String> originDomains = new HashSet<>();
public Set<Integer> originDomainIds = new HashSet<>();
public final long domainIdMax = -1;
public final long domainIdMax = -1;
public int domainCount;
public int domainCount;
private volatile static int rankMax;
private volatile static int rankMax;

public int maxId() {
return (int) domainIdMax;
}
public int domainCount() {
return domainCount;
}

static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
volatile static boolean running = true;
volatile static boolean running = true;

@ -44,23 +33,14 @@ public class UpdateDomainRanksTool2 {
var uploader = new Thread(() -> uploadThread(conn), "Uploader");
var uploader = new Thread(() -> uploadThread(conn), "Uploader");

logger.info("Ranking");
logger.info("Ranking");
// "memex.marginalia.nu", "wiki.xxiivv.com", "bikobatanari.art", "sadgrl.online", "lileks.com",
// "www.rep.routledge.com", "www.personal.kent.edu", "xroads.virginia.edu", "classics.mit.edu", "faculty.washington.edu", "monadnock.net"
var ds = new DatabaseModule().provideConnection();
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
// var rpr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "%edu");
// var spr = new BetterStandardPageRank(new DatabaseModule().provideConnection(), "memex.marginalia.nu");

var rankVector = rpr.pageRankVector();
var rankVector = rpr.pageRankVector();
var norm = rankVector.norm();
rankMax = rpr.size();
rankMax = rpr.size();
uploader.start();
uploader.start();

rankMax = rpr.size();

rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
rpr.pageRankWithPeripheralNodes(rankMax).forEach(i -> {
try {
try {
uploadQueue.put(i);
uploadQueue.put(i);
@ -0,0 +1,298 @@
|
|||||||
|
package nu.marginalia.util.tool;
|
||||||
|
|
||||||
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
|
import gnu.trove.map.hash.TIntIntHashMap;
|
||||||
|
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||||
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.AndCardIntSet;
|
||||||
|
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||||
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDaoImpl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
import org.roaringbitmap.RoaringBitmap;
|
||||||
|
|
||||||
|
import java.sql.ResultSet;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.concurrent.LinkedBlockingDeque;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
|
import static nu.marginalia.util.AndCardIntSet.*;
|
||||||
|
|
||||||
|
public class EdgeDomainLinkConsineSimilarityMain {
|
||||||
|
ArrayList<Integer> idsList = new ArrayList<>(100_000);
|
||||||
|
ArrayList<AndCardIntSet> itemsList = new ArrayList<>(100_000);
|
||||||
|
TIntObjectHashMap<AndCardIntSet> dToSMap = new TIntObjectHashMap<>(100_000);
|
||||||
|
TIntIntHashMap aliasMap = new TIntIntHashMap(100_000, 0.75f, -1, -1);
|
||||||
|
TIntHashSet indexed = new TIntHashSet(100_000);
|
||||||
|
|
||||||
|
float weights[];
|
||||||
|
|
||||||
|
private HikariDataSource dataSource;
|
||||||
|
|
||||||
|
public EdgeDomainLinkConsineSimilarityMain(HikariDataSource dataSource) throws SQLException {
|
||||||
|
this.dataSource = dataSource;
|
||||||
|
|
||||||
|
Map<Integer, RoaringBitmap> tmpMap = new HashMap<>(100_000);
|
||||||
|
try (
|
||||||
|
var conn = dataSource.getConnection();
|
||||||
|
var aliasStmt = conn.prepareStatement("SELECT ID, DOMAIN_ALIAS FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NOT NULL");
|
||||||
|
var indexedStmt = conn.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE INDEXED>0");
|
||||||
|
var linksStmt = conn.prepareStatement("SELECT SOURCE_DOMAIN_ID, DEST_DOMAIN_ID FROM EC_DOMAIN_LINK")) {
|
||||||
|
ResultSet rsp;
|
||||||
|
|
||||||
|
aliasStmt.setFetchSize(10_000);
|
||||||
|
rsp = aliasStmt.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
aliasMap.put(rsp.getInt(1), rsp.getInt(2));
|
||||||
|
}
|
||||||
|
|
||||||
|
indexedStmt.setFetchSize(10_000);
|
||||||
|
rsp = indexedStmt.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
indexed.add(rsp.getInt(1));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
linksStmt.setFetchSize(10_000);
|
||||||
|
rsp = linksStmt.executeQuery();
|
||||||
|
while (rsp.next()) {
|
||||||
|
int source = deAlias(rsp.getInt(1));
|
||||||
|
int dest = deAlias(rsp.getInt(2));
|
||||||
|
|
||||||
|
tmpMap.computeIfAbsent(dest, this::createBitmapWithSelf).add(source);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpMap.entrySet().stream()
|
||||||
|
.filter(e -> isEligible(e.getValue()))
|
||||||
|
.forEach(e -> {
|
||||||
|
var val = of(e.getValue());
|
||||||
|
idsList.add(e.getKey());
|
||||||
|
itemsList.add(val);
|
||||||
|
dToSMap.put(e.getKey(), val);
|
||||||
|
});
|
||||||
|
weights = new float[1 + idsList.stream().mapToInt(i -> i).max().orElse(0)];
|
||||||
|
for (int i = 0; i < idsList.size(); i++) {
|
||||||
|
weights[idsList.get(i)] = getWeight(idsList.get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean isEligible(RoaringBitmap value) {
|
||||||
|
int cardinality = value.getCardinality();
|
||||||
|
|
||||||
|
return cardinality < 10000;
|
||||||
|
}
|
||||||
|
|
||||||
|
private int deAlias(int id) {
|
||||||
|
int val = aliasMap.get(id);
|
||||||
|
if (val < 0)
|
||||||
|
return id;
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
|
||||||
|
LinkedBlockingDeque<DomainSimilarities> similaritiesLinkedBlockingDeque = new LinkedBlockingDeque<>(10);
|
||||||
|
volatile boolean running;
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void tryDomains(String... domainName) {
|
||||||
|
var dataStoreDao = new EdgeDataStoreDaoImpl(dataSource);
|
||||||
|
|
||||||
|
System.out.println(Arrays.toString(domainName));
|
||||||
|
|
||||||
|
int[] domainIds = Arrays.stream(domainName).map(EdgeDomain::new)
|
||||||
|
.map(dataStoreDao::getDomainId)
|
||||||
|
.mapToInt(EdgeId::id)
|
||||||
|
.map(this::deAlias)
|
||||||
|
.toArray();
|
||||||
|
|
||||||
|
for (int domainId : domainIds) {
|
||||||
|
findAdjacentDtoS(domainId, similarities -> {
|
||||||
|
for (var similarity : similarities.similarities()) {
|
||||||
|
if (indexed.contains(similarity.domainId)) System.out.print("*");
|
||||||
|
System.out.println(dataStoreDao.getDomain(new EdgeId<>(similarity.domainId)).map(Object::toString).orElse("") + " " + prettyPercent(similarity.value));
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private String prettyPercent(double val) {
|
||||||
|
return String.format("%2.2f%%", 100. * val);
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
public void loadAll() {
|
||||||
|
running = true;
|
||||||
|
var thread = new Thread(this::insertThreadRun);
|
||||||
|
thread.start();
|
||||||
|
idsList.parallelStream()
|
||||||
|
.filter(id -> !aliasMap.containsKey(id))
|
||||||
|
.forEach(id -> findAdjacent(id, this::addToQueue));
|
||||||
|
running = false;
|
||||||
|
thread.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
void addToQueue(DomainSimilarities similarities) {
|
||||||
|
similaritiesLinkedBlockingDeque.putLast(similarities);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void insertThreadRun() {
|
||||||
|
try (var conn = dataSource.getConnection();
|
||||||
|
var stmt = conn.prepareStatement(
|
||||||
|
"""
|
||||||
|
INSERT INTO EC_DOMAIN_NEIGHBORS_2
|
||||||
|
(DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS)
|
||||||
|
VALUES (?, ?, ?)
|
||||||
|
ON DUPLICATE KEY UPDATE RELATEDNESS = GREATEST(EC_DOMAIN_NEIGHBORS_2.RELATEDNESS, VALUES(RELATEDNESS))
|
||||||
|
""")
|
||||||
|
) {
|
||||||
|
while (running || !similaritiesLinkedBlockingDeque.isEmpty()) {
|
||||||
|
var item = similaritiesLinkedBlockingDeque.pollFirst(60, TimeUnit.SECONDS);
|
||||||
|
if (item == null) continue;
|
||||||
|
|
||||||
|
for (var similarity : item.similarities) {
|
||||||
|
stmt.setInt(1, item.domainId);
|
||||||
|
stmt.setInt(2, similarity.domainId);
|
||||||
|
stmt.setDouble(3, similarity.value);
|
||||||
|
stmt.addBatch();
|
||||||
|
}
|
||||||
|
stmt.executeBatch();
|
||||||
|
}
|
||||||
|
} catch (SQLException | InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public RoaringBitmap createBitmapWithSelf(int val) {
|
||||||
|
var bm = new RoaringBitmap();
|
||||||
|
bm.add(val);
|
||||||
|
return bm;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void findAdjacent(int domainId, Consumer<DomainSimilarities> andThen) {
|
||||||
|
findAdjacentDtoS(domainId, andThen);
|
||||||
|
}
|
||||||
|
|
||||||
|
double cosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
|
||||||
|
double andCardinality = andCardinality(a, b);
|
||||||
|
andCardinality /= Math.sqrt(a.getCardinality());
|
||||||
|
andCardinality /= Math.sqrt(b.getCardinality());
|
||||||
|
return andCardinality;
|
||||||
|
}
|
||||||
|
|
||||||
|
double expensiveCosineSimilarity(AndCardIntSet a, AndCardIntSet b) {
|
||||||
|
return weightedProduct(weights, a, b) / Math.sqrt(a.mulAndSum(weights) * b.mulAndSum(weights));
|
||||||
|
}
|
||||||
|
|
||||||
|
float getWeight(int i) {
|
||||||
|
var vector = dToSMap.get(i);
|
||||||
|
|
||||||
|
if (vector == null) return 1.0f;
|
||||||
|
return 1.0f / (float) Math.log(2+vector.getCardinality());
|
||||||
|
}
|
||||||
|
|
||||||
|
record DomainSimilarities(int domainId, List<DomainSimilarity> similarities) {};
|
||||||
|
record DomainSimilarity(int domainId, double value) {};
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private void findAdjacentDtoS(int domainId, Consumer<DomainSimilarities> andThen) {
|
||||||
|
var vector = dToSMap.get(domainId);
|
||||||
|
if (vector == null || !vector.cardinalityExceeds(10)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println("DtoS " + domainId);
|
||||||
|
|
||||||
|
List<DomainSimilarity> similarities = new ArrayList<>(1000);
|
||||||
|
|
||||||
|
/** The minimum cardinality a vector can have so that
|
||||||
|
*
|
||||||
|
* a (x) b
|
||||||
|
* ------- < k is given by k^2
|
||||||
|
* |a||b|
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
int cardMin = Math.max(2, (int) (0.01 * vector.getCardinality()));
|
||||||
|
|
||||||
|
for (int i = 0; i < itemsList.size(); i++) {
|
||||||
|
|
||||||
|
int id = idsList.get(i);
|
||||||
|
if (id == domainId)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
var otherVec = itemsList.get(i);
|
||||||
|
if (otherVec.getCardinality() < cardMin)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
double similarity = cosineSimilarity(vector, otherVec);
|
||||||
|
if (similarity > 0.1) {
|
||||||
|
var recalculated = expensiveCosineSimilarity(vector, otherVec);
|
||||||
|
if (recalculated > 0.1) {
|
||||||
|
similarities.add(new DomainSimilarity(id, recalculated));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (similarities.size() > 128) {
|
||||||
|
similarities.sort(Comparator.comparing(DomainSimilarity::value));
|
||||||
|
similarities.subList(0, similarities.size() - 128).clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
andThen.accept(new DomainSimilarities(domainId, similarities));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// @SneakyThrows
|
||||||
|
// private void findAdjacentDtoS(Consumer<DomainSimilarities> andThen, int... domainIds) {
|
||||||
|
// var vectors = Arrays.stream(domainIds).mapToObj(dToSMap::get)
|
||||||
|
// .filter(Objects::nonNull)
|
||||||
|
// .filter(vec -> vec.cardinalityExceeds(10))
|
||||||
|
// .toArray(AndCardIntSet[]::new);
|
||||||
|
// Set<Integer> domainIdsSet = new HashSet<>(Arrays.stream(domainIds).boxed().toList());
|
||||||
|
//
|
||||||
|
// if (vectors.length != domainIds.length)
|
||||||
|
// return;
|
||||||
|
//
|
||||||
|
// List<DomainSimilarity> similarities = dToSMap.entrySet().parallelStream()
|
||||||
|
// .filter(e -> !domainIdsSet.contains(e.getKey()) && indexed.contains(e.getKey()))
|
||||||
|
// .flatMap(entry -> {
|
||||||
|
//
|
||||||
|
// double similarity = 0.;
|
||||||
|
// for (var vector : vectors) {
|
||||||
|
// similarity += cosineSimilarity(vector, entry.getValue());
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// if (similarity > 0.1 * vectors.length) {
|
||||||
|
// double recalculated = 0;
|
||||||
|
// for (var vector : vectors) {
|
||||||
|
// recalculated += expensiveCosineSimilarity(vector, entry.getValue());
|
||||||
|
// }
|
||||||
|
// if (recalculated > 0.1 * vectors.length) {
|
||||||
|
// return Stream.of(new DomainSimilarity(entry.getKey(), recalculated));
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
// return Stream.empty();
|
||||||
|
// }).sorted(Comparator.comparing(DomainSimilarity::value))
|
||||||
|
// .toList();
|
||||||
|
//
|
||||||
|
// andThen.accept(new DomainSimilarities(domainIds[0], similarities));
|
||||||
|
// }
|
||||||
|
|
||||||
|
|
||||||
|
public static void main(String[] args) throws SQLException {
|
||||||
|
DatabaseModule dm = new DatabaseModule();
|
||||||
|
|
||||||
|
var main = new EdgeDomainLinkConsineSimilarityMain(dm.provideConnection());
|
||||||
|
if (args.length == 0) {
|
||||||
|
main.loadAll();
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
main.tryDomains(args);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
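The similarity measure in the tool above treats each domain as the set of domains linking to it and scores a pair as |A∩B| / sqrt(|A|·|B|); the weighted variant additionally scales each linking domain by 1/log(2+cardinality). A minimal sketch of the unweighted form, using java.util.BitSet as a stand-in for AndCardIntSet:

import java.util.BitSet;

class LinkCosineSketch {
    /** Cosine similarity of two link sets: |A∩B| / sqrt(|A|*|B|). */
    static double cosine(BitSet a, BitSet b) {
        int cardA = a.cardinality();
        int cardB = b.cardinality();
        if (cardA == 0 || cardB == 0) return 0;

        BitSet and = (BitSet) a.clone();
        and.and(b);

        return and.cardinality() / Math.sqrt((double) cardA * cardB);
    }

    public static void main(String[] args) {
        BitSet a = new BitSet(); a.set(1); a.set(2); a.set(3);
        BitSet b = new BitSet(); b.set(2); b.set(3); b.set(4);
        System.out.println(cosine(a, b)); // 2 / sqrt(3*3) ≈ 0.67
    }
}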
@ -2,8 +2,14 @@ package nu.marginalia.wmsa.api.model;

import lombok.AllArgsConstructor;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.Getter;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

@AllArgsConstructor @Getter
@AllArgsConstructor @Getter
public class ApiSearchResult {
public class ApiSearchResult {
public String url;
public String url;
@ -11,10 +17,30 @@ public class ApiSearchResult {
public String description;
public String description;
public double quality;
public double quality;

public List<List<ApiSearchResultQueryDetails>> details = new ArrayList<>();

public ApiSearchResult(EdgeUrlDetails url) {
public ApiSearchResult(EdgeUrlDetails url) {
this.url = url.url.toString();
this.url = url.url.toString();
this.title = url.getTitle();
this.title = url.getTitle();
this.description = url.getDescription();
this.description = url.getDescription();
this.quality = url.getTermScore();
this.quality = url.getTermScore();

if (url.resultItem != null) {
var bySet = url.resultItem.scores.stream().collect(Collectors.groupingBy(EdgeSearchResultKeywordScore::set));

outer:
for (var entries : bySet.values()) {
List<ApiSearchResultQueryDetails> lst = new ArrayList<>();
for (var entry : entries) {
var metadata = entry.metadata();
if (metadata.isEmpty())
continue outer;

Set<String> flags = metadata.flags().stream().map(Object::toString).collect(Collectors.toSet());
lst.add(new ApiSearchResultQueryDetails(entry.keyword(), metadata.tfIdf(),metadata.count(), flags));
}
details.add(lst);
}
}
}
}
}
}
@ -0,0 +1,16 @@
package nu.marginalia.wmsa.api.model;

import lombok.AllArgsConstructor;
import lombok.Getter;

import java.util.Set;

@AllArgsConstructor @Getter
public class ApiSearchResultQueryDetails {

String keyword;
int tfIdf;
int count;

Set<String> flagsUnstableAPI;
}
@ -5,6 +5,7 @@ import nu.marginalia.wmsa.auth.AuthMain;
import nu.marginalia.wmsa.configuration.command.*;
import nu.marginalia.wmsa.configuration.command.*;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.explorer.ExplorerMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
import nu.marginalia.wmsa.edge.search.EdgeSearchMain;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
import nu.marginalia.wmsa.encyclopedia.EncyclopediaMain;
@ -37,6 +38,7 @@ public enum ServiceDescriptor {
ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),
ENCYCLOPEDIA("encyclopedia", 5040, EncyclopediaMain.class),

DATING("dating", 5070, DatingMain.class),
DATING("dating", 5070, DatingMain.class),
EXPLORER("explorer", 5071, ExplorerMain.class),

TEST_1("test-1", 0, null),
TEST_1("test-1", 0, null),
TEST_2("test-2", 0, null);
TEST_2("test-2", 0, null);
@ -77,7 +79,8 @@ public enum ServiceDescriptor {

public static void main(String... args) {
public static void main(String... args) {
MainMapLookup.setMainArguments(args);
MainMapLookup.setMainArguments(args);
Map<String, Command> functions = Stream.of(new ListCommand(),
Map<String, Command> functions = Stream.of(
new ListCommand(),
new StartCommand(),
new StartCommand(),
new ConvertCommand(),
new ConvertCommand(),
new CrawlCommand(),
new CrawlCommand(),
@ -12,6 +12,7 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import spark.Request;
|
import spark.Request;
|
||||||
import spark.Response;
|
import spark.Response;
|
||||||
|
import spark.Spark;
|
||||||
|
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
|
||||||
@ -85,6 +86,12 @@ public class ScreenshotService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private Object serveSvgPlaceholder(Response response, int id) {
|
private Object serveSvgPlaceholder(Response response, int id) {
|
||||||
|
|
||||||
|
var domainName = edgeDataStoreDao.getDomain(new EdgeId<>(id)).map(Object::toString);
|
||||||
|
if (domainName.isEmpty()) {
|
||||||
|
Spark.halt(404);
|
||||||
|
}
|
||||||
|
|
||||||
response.type("image/svg+xml");
|
response.type("image/svg+xml");
|
||||||
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
|
return String.format("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" +
|
||||||
"<svg\n" +
|
"<svg\n" +
|
||||||
@ -111,6 +118,6 @@ public class ScreenshotService {
|
|||||||
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
|
" style=\"font-size:32px;fill:#000000;font-family:monospace;\"\n" +
|
||||||
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
|
" x=\"320\" y=\"240\" dominant-baseline=\"middle\" text-anchor=\"middle\">%s</text>\n" +
|
||||||
" </g>\n" +
|
" </g>\n" +
|
||||||
"</svg>\n", edgeDataStoreDao.getDomain(new EdgeId<>(id)));
|
"</svg>\n", domainName.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -0,0 +1,69 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.ZstdOutputStream;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
|
||||||
|
import java.io.BufferedOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.time.LocalDateTime;
|
||||||
|
import java.time.ZoneOffset;
|
||||||
|
|
||||||
|
public class ConversionLog implements AutoCloseable, Interpreter {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private final PrintWriter writer;
|
||||||
|
|
||||||
|
public ConversionLog(Path rootDir) throws IOException {
|
||||||
|
String fileName = String.format("conversion-log-%s.zstd", LocalDateTime.now().toEpochSecond(ZoneOffset.UTC));
|
||||||
|
Path logFile = rootDir.resolve(fileName);
|
||||||
|
|
||||||
|
writer = new PrintWriter(new ZstdOutputStream(
|
||||||
|
new BufferedOutputStream(Files.newOutputStream(logFile, StandardOpenOption.WRITE, StandardOpenOption.CREATE))));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws IOException {
|
||||||
|
writer.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadUrl(EdgeUrl[] url) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadDomain(EdgeDomain[] domain) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadRssFeed(EdgeUrl[] rssFeed) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadDomainLink(DomainLink[] links) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public synchronized void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
|
||||||
|
writer.printf("%s\t%s\n", loadProcessedDocumentWithError.url(), loadProcessedDocumentWithError.reason());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void loadDomainRedirect(DomainLink link) {}
|
||||||
|
}
|
@ -54,5 +54,4 @@ public class ConvertedDomainReader {

return ret;
return ret;
}
}

}
}
@ -5,9 +5,9 @@ import com.google.inject.Guice;
|
|||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
import nu.marginalia.util.ParallelPipe;
|
import nu.marginalia.util.ParallelPipe;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.compiler.InstructionsCompiler;
|
||||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
import nu.marginalia.wmsa.edge.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.wmsa.edge.converting.processor.InstructionsCompiler;
|
|
||||||
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
|
||||||
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
import nu.marginalia.wmsa.edge.crawling.WorkLog;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
@ -47,11 +47,15 @@ public class ConverterMain {
|
|||||||
Gson gson
|
Gson gson
|
||||||
) throws Exception {
|
) throws Exception {
|
||||||
|
|
||||||
instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);
|
;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger.info("Starting pipe");
|
logger.info("Starting pipe");
|
||||||
|
|
||||||
try (WorkLog processLog = plan.createProcessWorkLog()) {
|
try (WorkLog processLog = plan.createProcessWorkLog();
|
||||||
|
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
||||||
|
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
||||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -24,10 +24,13 @@ import java.util.List;
|
|||||||
|
|
||||||
public class LoadInstructionWriter {
|
public class LoadInstructionWriter {
|
||||||
|
|
||||||
|
private ConversionLog log;
|
||||||
private final Path outputDir;
|
private final Path outputDir;
|
||||||
private final Gson gson;
|
private final Gson gson;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
|
private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
|
||||||
public LoadInstructionWriter(Path outputDir, Gson gson) {
|
|
||||||
|
public LoadInstructionWriter(ConversionLog log, Path outputDir, Gson gson) {
|
||||||
|
this.log = log;
|
||||||
this.outputDir = outputDir;
|
this.outputDir = outputDir;
|
||||||
this.gson = gson;
|
this.gson = gson;
|
||||||
|
|
||||||
@ -35,6 +38,7 @@ public class LoadInstructionWriter {
|
|||||||
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public String accept(String id, List<Instruction> instructionList) throws IOException {
|
public String accept(String id, List<Instruction> instructionList) throws IOException {
|
||||||
Path outputFile = getOutputFile(id);
|
Path outputFile = getOutputFile(id);
|
||||||
|
|
||||||
@ -48,6 +52,8 @@ public class LoadInstructionWriter {
|
|||||||
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
|
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
|
||||||
|
|
||||||
for (var instr : instructionList) {
|
for (var instr : instructionList) {
|
||||||
|
instr.apply(log);
|
||||||
|
|
||||||
outputStream.append(instr.tag().name());
|
outputStream.append(instr.tag().name());
|
||||||
outputStream.append(' ');
|
outputStream.append(' ');
|
||||||
gson.toJson(instr, outputStream);
|
gson.toJson(instr, outputStream);
|
||||||
@ -66,6 +72,7 @@ public class LoadInstructionWriter {
|
|||||||
if (!Files.exists(destDir)) {
|
if (!Files.exists(destDir)) {
|
||||||
Files.createDirectories(destDir);
|
Files.createDirectories(destDir);
|
||||||
}
|
}
|
||||||
|
|
||||||
return destDir.resolve(id + ".pzstd");
|
return destDir.resolve(id + ".pzstd");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -70,7 +70,11 @@ public class ReindexTriggerMain {
};
};

client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/repartition")).build()).execute();
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
if (!Boolean.getBoolean("no-preconvert")) {
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/preconvert")).build()).execute();
}

for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
for (int i = 0; i < DYNAMIC_BUCKET_LENGTH+1; i++) {
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
client.newCall(new Request.Builder().post(rb).url(new URL("http", args[0], ServiceDescriptor.EDGE_INDEX.port, "/ops/reindex/" + i)).build()).execute();
}
}
@ -0,0 +1,58 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadKeywords;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
|
||||||
|
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class DocumentsCompiler {
|
||||||
|
|
||||||
|
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||||
|
|
||||||
|
for (var doc : documents) {
|
||||||
|
compileDocumentDetails(ret, doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var doc : documents) {
|
||||||
|
compileWords(ret, doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
|
||||||
|
var details = doc.details;
|
||||||
|
|
||||||
|
if (details != null) {
|
||||||
|
ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state, doc.stateReason));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
|
||||||
|
var words = doc.words;
|
||||||
|
|
||||||
|
if (words != null) {
|
||||||
|
|
||||||
|
var wordsArray = words.values().stream()
|
||||||
|
.filter(this::filterNonTransients)
|
||||||
|
.map(DocumentKeywords::new)
|
||||||
|
.toArray(DocumentKeywords[]::new);
|
||||||
|
|
||||||
|
ret.add(new LoadKeywords(doc.url, wordsArray));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean filterNonTransients(EdgePageWords words) {
|
||||||
|
return words.block.type != IndexBlockType.TRANSIENT;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,23 @@
package nu.marginalia.wmsa.edge.converting.compiler;

import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadRssFeed;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

import java.util.List;
import java.util.Objects;

public class FeedsCompiler {

public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {

EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
.filter(Objects::nonNull)
.flatMap(dets -> dets.feedLinks.stream())
.distinct()
.toArray(EdgeUrl[]::new);

ret.add(new LoadRssFeed(feeds));
}
}
@ -0,0 +1,57 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class InstructionsCompiler {
|
||||||
|
private final UrlsCompiler urlsCompiler;
|
||||||
|
private final DocumentsCompiler documentsCompiler;
|
||||||
|
private final FeedsCompiler feedsCompiler;
|
||||||
|
private final LinksCompiler linksCompiler;
|
||||||
|
private final RedirectCompiler redirectCompiler;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public InstructionsCompiler(UrlsCompiler urlsCompiler,
|
||||||
|
DocumentsCompiler documentsCompiler,
|
||||||
|
FeedsCompiler feedsCompiler,
|
||||||
|
LinksCompiler linksCompiler,
|
||||||
|
RedirectCompiler redirectCompiler)
|
||||||
|
{
|
||||||
|
this.urlsCompiler = urlsCompiler;
|
||||||
|
this.documentsCompiler = documentsCompiler;
|
||||||
|
this.feedsCompiler = feedsCompiler;
|
||||||
|
this.linksCompiler = linksCompiler;
|
||||||
|
this.redirectCompiler = redirectCompiler;
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Instruction> compile(ProcessedDomain domain) {
|
||||||
|
List<Instruction> ret = new ArrayList<>(domain.size()*4);
|
||||||
|
|
||||||
|
ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
|
||||||
|
|
||||||
|
if (domain.documents != null) {
|
||||||
|
urlsCompiler.compile(ret, domain.documents);
|
||||||
|
documentsCompiler.compile(ret, domain.documents);
|
||||||
|
|
||||||
|
feedsCompiler.compile(ret, domain.documents);
|
||||||
|
|
||||||
|
linksCompiler.compile(ret, domain.domain, domain.documents);
|
||||||
|
}
|
||||||
|
if (domain.redirect != null) {
|
||||||
|
redirectCompiler.compile(ret, domain.domain, domain.redirect);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,26 @@
package nu.marginalia.wmsa.edge.converting.compiler;

import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.model.EdgeDomain;

import java.util.List;
import java.util.Objects;

public class LinksCompiler {

public void compile(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {

DomainLink[] links = documents.stream().map(doc -> doc.details)
.filter(Objects::nonNull)
.flatMap(dets -> dets.linksExternal.stream())
.map(link -> link.domain)
.distinct()
.map(domain -> new DomainLink(from, domain))
.toArray(DomainLink[]::new);

ret.add(new LoadDomainLink(links));
}
}
@ -0,0 +1,19 @@
package nu.marginalia.wmsa.edge.converting.compiler;

import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomainRedirect;
import nu.marginalia.wmsa.edge.model.EdgeDomain;

import java.util.List;

public class RedirectCompiler {

public void compile(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
ret.add(new LoadDomain(to));
ret.add(new LoadDomainLink(new DomainLink(from, to)));
ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
}
}
@ -0,0 +1,49 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.converting.compiler;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
|
public class UrlsCompiler {
|
||||||
|
|
||||||
|
private static final int MAX_INTERNAL_LINKS = 25;
|
||||||
|
|
||||||
|
public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
|
||||||
|
Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
|
||||||
|
Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
|
||||||
|
|
||||||
|
for (var doc : documents) {
|
||||||
|
seenUrls.add(doc.url);
|
||||||
|
|
||||||
|
if (doc.details != null) {
|
||||||
|
|
||||||
|
for (var url : doc.details.linksExternal) {
|
||||||
|
if (seenDomains.add(url.domain)) {
|
||||||
|
seenUrls.add(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (doc.isOk()) {
|
||||||
|
// Don't load more than a few from linksInternal, grows too big for no reason
|
||||||
|
var linksToAdd = new ArrayList<>(doc.details.linksInternal);
|
||||||
|
if (linksToAdd.size() > MAX_INTERNAL_LINKS) {
|
||||||
|
linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear();
|
||||||
|
}
|
||||||
|
seenUrls.addAll(linksToAdd);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
|
||||||
|
ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,17 +1,47 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
package nu.marginalia.wmsa.edge.converting.interpreter.instruction;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||||
|
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
|
||||||
public record DocumentKeywords(IndexBlock block, String... keywords) {
|
public record DocumentKeywords(IndexBlock block,
|
||||||
|
String[] keywords,
|
||||||
|
long[] metadata) {
|
||||||
|
|
||||||
public DocumentKeywords(EdgePageWords words) {
|
public DocumentKeywords(EdgePageWords words) {
|
||||||
this(words.block, words.words.toArray(String[]::new));
|
this(words.block,
|
||||||
|
words.words.toArray(String[]::new),
|
||||||
|
words.metadata.toArray());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return getClass().getSimpleName()+"["+block +", "+Arrays.toString(keywords)+"]";
|
StringBuilder sb = new StringBuilder();
|
||||||
|
sb.append(getClass().getSimpleName());
|
||||||
|
sb.append('[').append(block).append(", ");
|
||||||
|
for (int i = 0; i < keywords.length; i++) {
|
||||||
|
sb.append("\n\t ");
|
||||||
|
if (metadata[i] != 0) {
|
||||||
|
sb.append(keywords[i]).append("/").append(new EdgePageWordMetadata(metadata[i]));
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
sb.append(keywords[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.append("\n]").toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return keywords.length == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return keywords.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DocumentKeywords subList(int start, int end) {
|
||||||
|
return new DocumentKeywords(block, Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -8,7 +8,8 @@ import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

public record LoadProcessedDocumentWithError(EdgeUrl url,
public record LoadProcessedDocumentWithError(EdgeUrl url,
EdgeUrlState state) implements Instruction
EdgeUrlState state,
String reason) implements Instruction
{
{
@Override
@Override
public void apply(Interpreter interpreter) {
public void apply(Interpreter interpreter) {
@ -25,34 +25,13 @@ public class SqlLoadUrls {
|
|||||||
@Inject
|
@Inject
|
||||||
public SqlLoadUrls(HikariDataSource dataSource) {
|
public SqlLoadUrls(HikariDataSource dataSource) {
|
||||||
this.dataSource = dataSource;
|
this.dataSource = dataSource;
|
||||||
try (var conn = dataSource.getConnection()) {
|
|
||||||
try (var stmt = conn.createStatement()) {
|
|
||||||
stmt.execute("DROP PROCEDURE IF EXISTS INSERT_URL");
|
|
||||||
stmt.execute("""
|
|
||||||
CREATE PROCEDURE INSERT_URL (
|
|
||||||
IN PROTO VARCHAR(255),
|
|
||||||
IN DOMAIN VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci,
|
|
||||||
IN PORT INT,
|
|
||||||
IN PATH VARCHAR(255),
|
|
||||||
IN PARAM VARCHAR(255),
|
|
||||||
IN PATH_HASH BIGINT
|
|
||||||
)
|
|
||||||
BEGIN
|
|
||||||
INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) SELECT PROTO,ID,PORT,PATH,PARAM,PATH_HASH FROM EC_DOMAIN WHERE DOMAIN_NAME=DOMAIN;
|
|
||||||
END
|
|
||||||
""");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (SQLException ex) {
|
|
||||||
throw new RuntimeException("Failed to set up loader", ex);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||||
Set<EdgeDomain> affectedDomains = new HashSet<>();
|
Set<EdgeDomain> affectedDomains = new HashSet<>();
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
|
var insertCall = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)");
|
||||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
@ -67,7 +46,7 @@ public class SqlLoadUrls {
|
|||||||
affectedDomains.add(url.domain);
|
affectedDomains.add(url.domain);
|
||||||
|
|
||||||
insertCall.setString(1, url.proto);
|
insertCall.setString(1, url.proto);
|
||||||
insertCall.setString(2, url.domain.toString());
|
insertCall.setInt(2, data.getDomainId(url.domain));
|
||||||
if (url.port != null) {
|
if (url.port != null) {
|
||||||
insertCall.setInt(3, url.port);
|
insertCall.setInt(3, url.port);
|
||||||
}
|
}
|
||||||
@ -79,7 +58,7 @@ public class SqlLoadUrls {
|
|||||||
insertCall.setLong(6, hashPath(url.path, url.param));
|
insertCall.setLong(6, hashPath(url.path, url.param));
|
||||||
insertCall.addBatch();
|
insertCall.addBatch();
|
||||||
|
|
||||||
if (cnt++ == 250) {
|
if (cnt++ == 1000) {
|
||||||
var ret = insertCall.executeBatch();
|
var ret = insertCall.executeBatch();
|
||||||
conn.commit();
|
conn.commit();
|
||||||
|
|
||||||
|
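The SqlLoadUrls change above replaces the INSERT_URL stored procedure with a plain batched INSERT IGNORE keyed on the already-resolved domain id, flushed every 1000 rows. The general JDBC batching pattern it relies on looks roughly like this; the table and columns are hypothetical, not the repository's schema:

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.List;

class BatchInsertSketch {
    static void insert(Connection conn, List<long[]> rows) throws SQLException {
        conn.setAutoCommit(false);
        try (PreparedStatement stmt = conn.prepareStatement(
                "INSERT IGNORE INTO EXAMPLE_TABLE (ID, VALUE) VALUES (?, ?)")) {
            int cnt = 0;
            for (long[] row : rows) {
                stmt.setLong(1, row[0]);
                stmt.setLong(2, row[1]);
                stmt.addBatch();

                if (++cnt == 1000) {       // flush a full batch and commit
                    stmt.executeBatch();
                    conn.commit();
                    cnt = 0;
                }
            }
            stmt.executeBatch();           // flush the remainder
            conn.commit();
        }
    }
}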
@ -1,11 +1,18 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.model;
|
package nu.marginalia.wmsa.edge.converting.model;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||||
|
|
||||||
public class DisqualifiedException extends Exception {
|
public class DisqualifiedException extends Exception {
|
||||||
public final DisqualificationReason reason;
|
public final DisqualificationReason reason;
|
||||||
|
|
||||||
public DisqualifiedException(DisqualificationReason reason) {
|
public DisqualifiedException(DisqualificationReason reason) {
|
||||||
this.reason = reason;
|
this.reason = reason;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public DisqualifiedException(CrawlerDocumentStatus crawlerStatus) {
|
||||||
|
this.reason = DisqualificationReason.fromCrawlerStatus(crawlerStatus);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Throwable fillInStackTrace() {
|
public Throwable fillInStackTrace() {
|
||||||
return this;
|
return this;
|
||||||
@ -18,6 +25,22 @@ public class DisqualifiedException extends Exception {
|
|||||||
STATUS,
|
STATUS,
|
||||||
QUALITY,
|
QUALITY,
|
||||||
ACCEPTABLE_ADS,
|
ACCEPTABLE_ADS,
|
||||||
FORBIDDEN
|
FORBIDDEN,
|
||||||
|
SHORT_CIRCUIT,
|
||||||
|
|
||||||
|
PROCESSING_EXCEPTION,
|
||||||
|
|
||||||
|
BAD_CONTENT_TYPE,
|
||||||
|
BAD_CHARSET,
|
||||||
|
REDIRECT,
|
||||||
|
ROBOTS_TXT,
|
||||||
|
ERROR,
|
||||||
|
Timeout, // Don't you dare
|
||||||
|
BAD_CANONICAL
|
||||||
|
;
|
||||||
|
|
||||||
|
public static DisqualificationReason fromCrawlerStatus(CrawlerDocumentStatus crawlerStatus) {
|
||||||
|
return DisqualificationReason.valueOf(crawlerStatus.name());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
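fromCrawlerStatus works because the two enums share constant names, so valueOf(name()) can translate one into the other. A small sketch of that mirroring idea with made-up enums, not the project's types:

class EnumMirrorSketch {
    enum SourceStatus { OK, REDIRECT, ERROR }
    enum Reason { OK, REDIRECT, ERROR, QUALITY }

    // Safe only while every SourceStatus name also exists in Reason;
    // otherwise valueOf throws IllegalArgumentException at runtime.
    static Reason fromStatus(SourceStatus s) {
        return Reason.valueOf(s.name());
    }

    public static void main(String[] args) {
        System.out.println(fromStatus(SourceStatus.REDIRECT)); // prints REDIRECT
    }
}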
@@ -17,6 +17,10 @@ public class ProcessedDocument {
     public EdgeUrlState state;
     public String stateReason;

+    public boolean isOk() {
+        return EdgeUrlState.OK == state;
+    }
+
     public OptionalDouble quality() {
         if (details != null) {
             return OptionalDouble.of(details.quality);
@@ -7,6 +7,7 @@ import nu.marginalia.util.language.LanguageFilter;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
 import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException.DisqualificationReason;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
@@ -81,32 +82,12 @@ public class DocumentProcessor {

         return ret;
     }

     public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
         ProcessedDocument ret = new ProcessedDocument();

         try {
-            ret.url = getDocumentUrl(crawledDocument);
-            ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
-
-            if (ret.state == EdgeUrlState.OK) {
-
-                if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
-                    throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
-                }
-
-                if (isAcceptedContentType(crawledDocument)) {
-                    var detailsWords = createDetails(crawledDomain, crawledDocument);
-
-                    ret.details = detailsWords.details();
-                    ret.words = detailsWords.words();
-                }
-                else {
-                    throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
-                }
-            }
-            else {
-                throw new DisqualifiedException(DisqualificationReason.STATUS);
-            }
+            processDocument(crawledDocument, crawledDomain, ret);
         }
         catch (DisqualifiedException ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
@@ -115,6 +96,7 @@ public class DocumentProcessor {
         }
         catch (Exception ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.stateReason = DisqualificationReason.PROCESSING_EXCEPTION.toString();
             logger.info("Failed to convert " + crawledDocument.url, ex);
             ex.printStackTrace();
         }
@@ -122,6 +104,32 @@ public class DocumentProcessor {
         return ret;
     }

+    private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
+
+        var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
+        if (crawlerStatus != CrawlerDocumentStatus.OK) {
+            throw new DisqualifiedException(crawlerStatus);
+        }
+
+        if (AcceptableAds.hasAcceptableAdsHeader(crawledDocument)) {
+            throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
+        }
+
+        if (!isAcceptedContentType(crawledDocument)) {
+            throw new DisqualifiedException(DisqualificationReason.CONTENT_TYPE);
+        }
+
+
+        ret.url = getDocumentUrl(crawledDocument);
+        ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
+
+        var detailsWithWordsLinks = createDetails(crawledDomain, crawledDocument);
+
+        ret.details = detailsWithWordsLinks.details();
+        ret.words = detailsWithWordsLinks.words();
+    }
+
+
     private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
     throws URISyntaxException
     {
@@ -193,9 +201,11 @@ public class DocumentProcessor {
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);

-        ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
+        ret.quality = documentValuator.getQuality(crawledDocument, ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

+        KeywordMetadata keywordMetadata = new KeywordMetadata(ret.quality);
+
         EdgePageWordSet words;
         if (shouldDoSimpleProcessing(url, ret)) {
             /* Some documents we'll index, but only superficially. This is a compromise
@@ -203,12 +213,12 @@ public class DocumentProcessor {
                queries. This also saves a lot of processing power.
              */
             ret.features = Set.of(HtmlFeature.UNKNOWN);
-            words = keywordExtractor.extractKeywordsMinimal(dld);
+            words = keywordExtractor.extractKeywordsMinimal(dld, keywordMetadata);
             ret.description = "";
         }
         else {
             ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
-            words = keywordExtractor.extractKeywords(dld);
+            words = keywordExtractor.extractKeywords(dld, keywordMetadata);
             ret.description = getDescription(doc);
         }

@@ -239,6 +249,10 @@ public class DocumentProcessor {
             return true;
         }

+        // Annoying wordpress crap
+        if (url.path.startsWith("/tag/") && url.path.endsWith("/")) {
+            return true;
+        }
         return false;
     }

@@ -262,7 +276,7 @@ public class DocumentProcessor {

         ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

-        words.append(IndexBlock.Meta, tagWords);
+        words.appendWithNoMeta(IndexBlock.Meta, tagWords);
     }

     private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -296,14 +310,21 @@ public class DocumentProcessor {
                 .ifPresent(lp::acceptFeed);
         }

+        createLinkKeywords(words, lp);
+        createFileLinkKeywords(words, lp, domain);
+    }
+
+    private void createLinkKeywords(EdgePageWordSet words, LinkProcessor lp) {
         final Set<String> linkTerms = new HashSet<>();

         for (var fd : lp.getForeignDomains()) {
             linkTerms.add("links:"+fd.toString().toLowerCase());
             linkTerms.add("links:"+fd.getDomain().toLowerCase());
         }
-        words.append(IndexBlock.Meta, linkTerms);
+        words.appendWithNoMeta(IndexBlock.Meta, linkTerms);
+    }
+
+    private void createFileLinkKeywords(EdgePageWordSet words, LinkProcessor lp, EdgeDomain domain) {
         Set<String> fileKeywords = new HashSet<>(100);
         for (var link : lp.getNonIndexableUrls()) {

@@ -314,8 +335,8 @@ public class DocumentProcessor {
             synthesizeFilenameKeyword(fileKeywords, link);

         }
-        words.append(IndexBlock.Artifacts, fileKeywords);
+
+        words.appendWithNoMeta(IndexBlock.Artifacts, fileKeywords);
     }

     private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
@@ -364,5 +385,7 @@ public class DocumentProcessor {
         return doc.text().length();
     }

-    private record DetailsWithWords(ProcessedDocumentDetails details, EdgePageWordSet words) {}
+    private record DetailsWithWords(ProcessedDocumentDetails details,
+                                    EdgePageWordSet words) {}
+
 }
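The refactored process() pushes all validation into processDocument(), which bails out with a DisqualifiedException as soon as any check fails, keeping the happy path flat. A condensed sketch of that guard-clause pattern with placeholder checks rather than the real ones:

class GuardClauseSketch {
    static class DisqualifiedException extends Exception {
        final String reason;
        DisqualifiedException(String reason) { this.reason = reason; }
    }

    record Page(String status, String contentType, String body) {}

    // Each guard throws immediately; the caller maps the exception to a disqualified state.
    static String process(Page page) throws DisqualifiedException {
        if (!"OK".equals(page.status()))                 throw new DisqualifiedException("STATUS");
        if (!page.contentType().startsWith("text/html")) throw new DisqualifiedException("CONTENT_TYPE");
        if (page.body().isBlank())                       throw new DisqualifiedException("LENGTH");

        return "processed:" + page.body().length();
    }

    public static void main(String[] args) {
        try {
            System.out.println(process(new Page("OK", "text/html", "<html>hi</html>")));
        } catch (DisqualifiedException e) {
            System.out.println("disqualified: " + e.reason);
        }
    }
}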
@@ -3,17 +3,22 @@ package nu.marginalia.wmsa.edge.converting.processor;
 import com.google.common.base.Strings;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
 import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
+import nu.marginalia.wmsa.edge.converting.processor.logic.InternalLinkGraph;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.model.IndexBlockType;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;

 import java.util.*;
+import java.util.stream.Collectors;

 import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;

@@ -47,6 +52,8 @@ public class DomainProcessor {

             fixBadCanonicalTags(crawledDomain.doc);

+            InternalLinkGraph internalLinkGraph = new InternalLinkGraph();
+
             DocumentDisqualifier disqualifier = new DocumentDisqualifier();
             for (var doc : crawledDomain.doc) {
                 if (disqualifier.isQualified()) {
@@ -54,6 +61,9 @@ public class DomainProcessor {

                     if (processedDoc.url != null) {
                         ret.documents.add(processedDoc);
+
+                        internalLinkGraph.accept(processedDoc);
+
                         processedDoc.quality().ifPresent(disqualifier::offer);
                     }
                     else if ("LANGUAGE".equals(processedDoc.stateReason)) {
@@ -62,24 +72,16 @@ public class DomainProcessor {
                 }
                 else { // Short-circuit processing if quality is too low
                     var stub = documentProcessor.makeDisqualifiedStub(doc);
+                    stub.stateReason = DisqualifiedException.DisqualificationReason.SHORT_CIRCUIT.toString();
                     if (stub.url != null) {
                         ret.documents.add(stub);
                     }
                 }
             }

-            Set<String> commonSiteWords = new HashSet<>(10);
-
-            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
-            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
-
-            if (!commonSiteWords.isEmpty()) {
-                for (var doc : ret.documents) {
-                    if (doc.words != null) {
-                        doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
-                    }
-                }
-            }
+            flagCommonSiteWords(ret);
+            flagAdjacentSiteWords(internalLinkGraph, ret);
+
         }
         else {
             ret.documents = Collections.emptyList();
@@ -90,6 +92,70 @@ public class DomainProcessor {
         return ret;
     }

+    private void flagCommonSiteWords(ProcessedDomain processedDomain) {
+        Set<String> commonSiteWords = new HashSet<>(10);
+
+        commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Tfidf_High, IndexBlock.Subjects));
+        commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain, IndexBlock.Title));
+
+        if (commonSiteWords.isEmpty()) {
+            return;
+        }
+
+        for (var doc : processedDomain.documents) {
+            if (doc.words != null) {
+                for (var block : IndexBlock.values()) {
+                    if (block.type == IndexBlockType.PAGE_DATA) {
+                        doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.Site, commonSiteWords);
+                    }
+                }
+            }
+        }
+    }
+
+    private void flagAdjacentSiteWords(InternalLinkGraph internalLinkGraph, ProcessedDomain processedDomain) {
+        var invertedGraph = internalLinkGraph.trimAndInvert();
+
+        Map<EdgeUrl, Set<String>> linkedKeywords = new HashMap<>(100);
+
+        invertedGraph.forEach((url, linkingUrls) -> {
+            Map<String, Integer> keywords = new HashMap<>(100);
+
+            for (var linkingUrl : linkingUrls) {
+                for (var keyword : internalLinkGraph.getKeywords(linkingUrl)) {
+                    keywords.merge(keyword, 1, Integer::sum);
+                }
+            }
+
+            var words = keywords.entrySet().stream()
+                    .filter(e -> e.getValue() > 3)
+                    .map(Map.Entry::getKey)
+                    .filter(internalLinkGraph.getCandidateKeywords(url)::contains)
+                    .collect(Collectors.toSet());
+            if (!words.isEmpty()) {
+                linkedKeywords.put(url, words);
+            }
+        });
+
+        for (var doc : processedDomain.documents) {
+            if (doc.words == null)
+                continue;
+
+            final Set<String> keywords = linkedKeywords.get(doc.url);
+            if (keywords == null)
+                continue;
+
+            for (var block : IndexBlock.values()) {
+                if (block.type == IndexBlockType.PAGE_DATA) {
+                    doc.words.get(block).setFlagOnMetadataForWords(EdgePageWordFlags.SiteAdjacent, keywords);
+                }
+            }
+        }
+
+    }
+
+
     private void fixBadCanonicalTags(List<CrawledDocument> docs) {
         Map<String, Set<String>> seenCanonicals = new HashMap<>();
         Set<String> seenUrls = new HashSet<>();
@@ -162,7 +228,8 @@ public class DomainProcessor {
         }

         boolean isQualified() {
-            return count < 25 || goodCount*10 >= count;
+            return true;
+            // return count < 25 || goodCount*10 >= count;
         }
     }
 }
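flagAdjacentSiteWords counts how often a keyword appears on the pages that link to a URL, keeps the ones seen more than three times, and then intersects that set with the target page's own candidate keywords. The counting-and-threshold step in isolation, as a sketch with toy data:

import java.util.*;
import java.util.stream.Collectors;

class LinkedKeywordVoteSketch {
    // linkingPagesKeywords: keyword sets of the pages linking to one target URL.
    static Set<String> voteKeywords(List<Set<String>> linkingPagesKeywords, Set<String> candidates) {
        Map<String, Integer> votes = new HashMap<>();
        for (Set<String> keywords : linkingPagesKeywords) {
            for (String keyword : keywords) {
                votes.merge(keyword, 1, Integer::sum);
            }
        }
        return votes.entrySet().stream()
                .filter(e -> e.getValue() > 3)       // same ">3 votes" cut-off as above
                .map(Map.Entry::getKey)
                .filter(candidates::contains)        // only keep words the page itself could carry
                .collect(Collectors.toSet());
    }

    public static void main(String[] args) {
        List<Set<String>> linking = Collections.nCopies(5, Set.of("retro", "computing"));
        System.out.println(voteKeywords(linking, Set.of("retro"))); // prints [retro]
    }
}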
@@ -1,116 +0,0 @@
-package nu.marginalia.wmsa.edge.converting.processor;
-
-import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
-import nu.marginalia.wmsa.edge.converting.interpreter.instruction.*;
-import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
-import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
-import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
-import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-
-import java.util.*;
-
-public class InstructionsCompiler {
-
-    public List<Instruction> compile(ProcessedDomain domain) {
-        List<Instruction> ret = new ArrayList<>(domain.size()*4);
-
-        ret.add(new LoadProcessedDomain(domain.domain, domain.state, domain.ip));
-
-        if (domain.documents != null) {
-            compileUrls(ret, domain.documents);
-            compileDocuments(ret, domain.documents);
-            compileFeeds(ret, domain.documents);
-
-            compileLinks(ret, domain.domain, domain.documents);
-        }
-        if (domain.redirect != null) {
-            compileRedirect(ret, domain.domain, domain.redirect);
-        }
-
-        return ret;
-    }
-
-    private void compileRedirect(List<Instruction> ret, EdgeDomain from, EdgeDomain to) {
-        ret.add(new LoadDomain(to));
-        ret.add(new LoadDomainLink(new DomainLink(from, to)));
-        ret.add(new LoadDomainRedirect(new DomainLink(from, to)));
-    }
-
-    private void compileUrls(List<Instruction> ret, List<ProcessedDocument> documents) {
-        Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
-        Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
-
-        for (var doc : documents) {
-            seenUrls.add(doc.url);
-
-            if (doc.details != null) {
-                for (var url : doc.details.linksExternal) {
-                    seenDomains.add(url.domain);
-                }
-                seenUrls.addAll(doc.details.linksExternal);
-                seenUrls.addAll(doc.details.linksInternal);
-            }
-        }
-
-        ret.add(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new)));
-        ret.add(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new)));
-    }
-
-    private void compileLinks(List<Instruction> ret, EdgeDomain from, List<ProcessedDocument> documents) {
-        DomainLink[] links = documents.stream().map(doc -> doc.details)
-                .filter(Objects::nonNull)
-                .flatMap(dets -> dets.linksExternal.stream())
-                .map(link -> link.domain)
-                .distinct()
-                .map(domain -> new DomainLink(from, domain))
-                .toArray(DomainLink[]::new);
-
-        ret.add(new LoadDomainLink(links));
-    }
-
-    private void compileFeeds(List<Instruction> ret, List<ProcessedDocument> documents) {
-
-        EdgeUrl[] feeds = documents.stream().map(doc -> doc.details)
-                .filter(Objects::nonNull)
-                .flatMap(dets -> dets.feedLinks.stream())
-                .distinct()
-                .toArray(EdgeUrl[]::new);
-
-        ret.add(new LoadRssFeed(feeds));
-    }
-
-    private void compileDocuments(List<Instruction> ret, List<ProcessedDocument> documents) {
-
-        for (var doc : documents) {
-            compileDocumentDetails(ret, doc);
-        }
-
-        for (var doc : documents) {
-            compileWords(ret, doc);
-        }
-
-    }
-
-    private void compileDocumentDetails(List<Instruction> ret, ProcessedDocument doc) {
-        var details = doc.details;
-
-        if (details != null) {
-            ret.add(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard, details.length, details.hashCode, details.quality));
-        }
-        else {
-            ret.add(new LoadProcessedDocumentWithError(doc.url, doc.state));
-        }
-    }
-
-    private void compileWords(List<Instruction> ret, ProcessedDocument doc) {
-        var words = doc.words;
-        if (words != null) {
-            var wordsArray = words.values().stream()
-                    .map(DocumentKeywords::new)
-                    .toArray(DocumentKeywords[]::new);
-
-            ret.add(new LoadKeywords(doc.url, wordsArray));
-        }
-    }
-}
@@ -3,6 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
 import crawlercommons.utils.Strings;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeHtmlStandard;
 import org.jsoup.nodes.Document;

@@ -23,13 +24,12 @@ public class DocumentValuator {

     );

-    public double getQuality(EdgeHtmlStandard htmlStandard, Document doc, DocumentLanguageData dld) throws DisqualifiedException {
+    public double getQuality(CrawledDocument crawledDocument, EdgeHtmlStandard htmlStandard, Document parsedDocument, DocumentLanguageData dld) throws DisqualifiedException {
         double smutCoefficient = dld.streamLowerCase().filter(filthTable::contains).count();
-        double scriptPenalty = getScriptPenalty(doc);
+        double scriptPenalty = getScriptPenalty(parsedDocument);

-        int textBodyLength = doc.text().length();
-        int rawLength = doc.html().length();
+        int textBodyLength = parsedDocument.text().length();
+        int rawLength = crawledDocument.documentBody.length();

         if (textBodyLength == 0) {
             throw new DisqualifiedException(LENGTH);
@@ -3,10 +3,7 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
-import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.*;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -43,13 +40,15 @@ public class FeatureExtractor {
     private final RecipeDetector recipeDetector;
     private final TextileCraftDetector textileCraftDetector;
     private final WoodworkingDetector woodworkingDetector;
+    private final GoogleAnwersSpamDetector googleAnwersSpamDetector;

     @Inject
-    public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
+    public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector, GoogleAnwersSpamDetector googleAnwersSpamDetector) {
         this.adblockSimulator = adblockSimulator;
         this.recipeDetector = recipeDetector;
         this.textileCraftDetector = textileCraftDetector;
         this.woodworkingDetector = woodworkingDetector;
+        this.googleAnwersSpamDetector = googleAnwersSpamDetector;
     }

     public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
@@ -57,6 +56,10 @@ public class FeatureExtractor {

         final Elements scriptTags = doc.getElementsByTag("script");

+        if (googleAnwersSpamDetector.testP(doc) > 0.5) {
+            features.add(HtmlFeature.GA_SPAM);
+        }
+
         for (var scriptTag : scriptTags) {
             if (isJavascriptTag(scriptTag)) {
                 features.add(HtmlFeature.JS);
@@ -7,14 +7,14 @@ public enum HtmlFeature {
     JS("special:scripts"),
     AFFILIATE_LINK( "special:affiliate"),
     TRACKING("special:tracking"),
-
     COOKIES("special:cookies"),
-
     CATEGORY_FOOD("category:food"),

     ADVERTISEMENT("special:ads"),

     CATEGORY_CRAFTS("category:crafts"),

+    GA_SPAM("special:gaspam"),
+
     UNKNOWN("special:uncategorized")
     ;
@@ -0,0 +1,54 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+
+import java.util.*;
+
+public class InternalLinkGraph {
+    private final Map<EdgeUrl, Set<EdgeUrl>> internalLinkGraph = new HashMap<>(1000);
+    private final Set<EdgeUrl> goodUrls = new HashSet<>(1000);
+    private final Map<EdgeUrl, Set<String>> topKeywordsByUrl = new HashMap<>(1000);
+    private final Map<EdgeUrl, Set<String>> candidateKeywordsByUrl = new HashMap<>(1000);
+
+    public void accept(ProcessedDocument doc) {
+        if (doc.details == null || doc.details.linksInternal == null)
+            return;
+
+        goodUrls.add(doc.url);
+        internalLinkGraph.put(doc.url, new HashSet<>(doc.details.linksInternal));
+
+        Set<String> topKeywords = new HashSet<>(doc.words.get(IndexBlock.Tfidf_High).words);
+        topKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
+        topKeywordsByUrl.put(doc.url, topKeywords);
+
+        Set<String> candidateKeywords = new HashSet<>(topKeywords);
+        candidateKeywords.addAll(doc.words.get(IndexBlock.Tfidf_High).words);
+        candidateKeywords.addAll(doc.words.get(IndexBlock.Subjects).words);
+        candidateKeywordsByUrl.put(doc.url, candidateKeywords);
+    }
+
+    public Map<EdgeUrl, Set<EdgeUrl>> trimAndInvert() {
+        internalLinkGraph.values().forEach(dest -> dest.retainAll(goodUrls));
+
+        Map<EdgeUrl, Set<EdgeUrl>> inverted = new HashMap<>(goodUrls.size());
+
+        internalLinkGraph.forEach((source, dests) -> {
+            dests.forEach(dest -> inverted.computeIfAbsent(dest,
+                    d->new HashSet<>(25))
+                    .add(source));
+        });
+
+        internalLinkGraph.clear();
+
+        return inverted;
+    }
+
+    public Set<String> getKeywords(EdgeUrl url) {
+        return topKeywordsByUrl.getOrDefault(url, Collections.emptySet());
+    }
+    public Set<String> getCandidateKeywords(EdgeUrl url) {
+        return candidateKeywordsByUrl.getOrDefault(url, Collections.emptySet());
+    }
+}
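trimAndInvert() above turns the forward link map (page -> pages it links to) into an inverted one so the domain processor can ask which internal pages link to a given URL. The inversion step on its own, with plain strings standing in for EdgeUrl:

import java.util.*;

class InvertGraphSketch {
    static Map<String, Set<String>> invert(Map<String, Set<String>> forward) {
        Map<String, Set<String>> inverted = new HashMap<>();
        forward.forEach((source, dests) ->
                dests.forEach(dest ->
                        inverted.computeIfAbsent(dest, d -> new HashSet<>()).add(source)));
        return inverted;
    }

    public static void main(String[] args) {
        Map<String, Set<String>> fwd = Map.of(
                "/a", Set.of("/b", "/c"),
                "/b", Set.of("/c"));
        // Prints something like {/b=[/a], /c=[/a, /b]}
        System.out.println(invert(fwd));
    }
}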
@@ -5,7 +5,6 @@ import com.google.common.base.Strings;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jetbrains.annotations.Contract;
-import org.jetbrains.annotations.Nullable;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.slf4j.Logger;
@@ -202,7 +201,6 @@ public class LinkParser {
         return binarySuffixList.stream().anyMatch(str::endsWith);
     }

-    @Nullable
     public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
         var baseTags = parsed.getElementsByTag("base");

@@ -1,9 +1,13 @@
 package nu.marginalia.wmsa.edge.converting.processor.logic;

+import org.apache.commons.lang3.StringUtils;
+
 import javax.annotation.Nullable;
-import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+import java.util.StringJoiner;
 import java.util.regex.Pattern;
-import java.util.stream.Collectors;

 public class QueryParams {

@@ -15,10 +19,28 @@ public class QueryParams {
             return null;
         }

-        var ret = Arrays.stream(paramSplitterPattern.split(queryParams))
-                .filter(param -> QueryParams.isPermittedParam(path, param))
-                .sorted()
-                .collect(Collectors.joining("&"));
+        String ret;
+        if (queryParams.indexOf('&') >= 0) {
+
+            List<String> parts = new ArrayList<>();
+            for (var part : StringUtils.split(queryParams, '&')) {
+                if (QueryParams.isPermittedParam(path, part)) {
+                    parts.add(part);
+                }
+            }
+            if (parts.size() > 1) {
+                parts.sort(Comparator.naturalOrder());
+            }
+            StringJoiner retJoiner = new StringJoiner("&");
+            parts.forEach(retJoiner::add);
+            ret = retJoiner.toString();
+        }
+        else if (isPermittedParam(path, queryParams)) {
+            ret = queryParams;
+        }
+        else {
+            return null;
+        }

         if (ret.isBlank())
             return null;
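The rewrite avoids the stream pipeline and only allocates when the query string actually contains several parameters. A standalone rendering of the keep-sort-rejoin step; isPermitted is a stand-in here, and String.split is used instead of StringUtils so the sketch has no extra dependency:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.StringJoiner;

class QueryParamSketch {
    // Stand-in for the real permit-list; pretend only "page" and "id" matter.
    static boolean isPermitted(String param) {
        return param.startsWith("page=") || param.startsWith("id=");
    }

    static String sanitize(String queryParams) {
        List<String> parts = new ArrayList<>();
        for (String part : queryParams.split("&")) {
            if (isPermitted(part)) parts.add(part);
        }
        if (parts.isEmpty()) return null;
        parts.sort(Comparator.naturalOrder());   // canonical order gives a stable URL identity
        StringJoiner joiner = new StringJoiner("&");
        parts.forEach(joiner::add);
        return joiner.toString();
    }

    public static void main(String[] args) {
        System.out.println(sanitize("utm_source=x&page=2&id=5")); // id=5&page=2
    }
}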
@@ -0,0 +1,36 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
+
+import org.jsoup.nodes.Document;
+
+import java.util.List;
+
+public class GoogleAnwersSpamDetector {
+
+    private final List<String> prefixes = List.of("What", "Why", "How", "When", "Is");
+
+    public double testP(Document doc) {
+        if (trialTag(doc, "h1")) return 1;
+        if (trialTag(doc, "h2")) return 1;
+        if (trialTag(doc, "h3")) return 1;
+
+        return 0;
+    }
+
+    private boolean trialTag(Document doc, String tagName) {
+        int positive = 0;
+        int total = 0;
+
+        for (var elem : doc.getElementsByTag(tagName)) {
+            String text = elem.text();
+            for (var prefix : prefixes) {
+                if (text.startsWith(prefix)) {
+                    positive++;
+                    break;
+                }
+            }
+            total ++;
+        }
+
+        return positive > 4 && positive / (double) total > 0.5;
+    }
+}
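The detector flags pages whose headings are dominated by question-prefixed titles, which is characteristic of answer-farm spam. A usage sketch with jsoup, which is already used throughout the converter; the HTML here is synthetic:

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class SpamDetectorUsageSketch {
    public static void main(String[] args) {
        StringBuilder html = new StringBuilder("<html><body>");
        for (int i = 0; i < 6; i++) {
            html.append("<h2>What is widget ").append(i).append("?</h2>");
        }
        html.append("</body></html>");

        Document doc = Jsoup.parse(html.toString());

        // Six "What ..." h2 headings: more than four hits and more than half of the
        // headings match, so testP(doc) should come back as 1.0 here.
        GoogleAnwersSpamDetector detector = new GoogleAnwersSpamDetector();
        System.out.println(detector.testP(doc));
    }
}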
@@ -29,7 +29,7 @@ public class CrawlJobExtractorMain {
             """
                 SELECT ID
                 FROM EC_DOMAIN
-                WHERE URL_PART=?
+                WHERE DOMAIN_NAME=?
             """;

     private static final String domainsSql =
@@ -11,6 +11,17 @@ import java.util.regex.Pattern;
 public class UrlBlocklist {
     private final List<Predicate<String>> patterns = new ArrayList<>();

+    private record UrlPatternContains(String contains, Pattern pattern) implements Predicate<String> {
+        public boolean test(String s) {
+            return s.contains(contains) && pattern.matcher(s).find();
+        }
+    }
+    private record UrlPatternMinLength(int minLength, Pattern pattern) implements Predicate<String> {
+        public boolean test(String s) {
+            return s.length() >= minLength && pattern.matcher(s).find();
+        }
+    }
+
     // domains that have a lot of links but we know we don't want to crawl
     private final Set<String> badDomains = Set.of("t.co", "facebook.com",
             "instagram.com", "youtube.com",
@@ -18,18 +29,24 @@ public class UrlBlocklist {

     public UrlBlocklist() {
         // Don't deep-crawl git repos
-        patterns.add(Pattern.compile("\\.git/.+").asPredicate());
-        patterns.add(Pattern.compile("wp-content/upload").asPredicate());
+        patterns.add(s -> s.contains(".git/"));
+        patterns.add(s -> s.contains("wp-content/upload"));
+        patterns.add(s -> s.contains("-download-free"));

         // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
-        patterns.add(Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)").asPredicate());
+        patterns.add(new UrlPatternMinLength(48, Pattern.compile(".*/[^/]*[a-f0-9]{32,}(/|$)")));

         // link farms &c
-        patterns.add(Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$").asPredicate());
-        patterns.add(Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$").asPredicate());
-        patterns.add(Pattern.compile("(webrx3|lib|pdf|book|720p).*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$").asPredicate());
-        patterns.add(Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$").asPredicate());
-        patterns.add(Pattern.compile(".*-download-free$").asPredicate());
+        patterns.add(new UrlPatternContains("/download", Pattern.compile("/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$")));
+        patterns.add(new UrlPatternContains("/permalink", Pattern.compile("/permalink/[a-z]+(-([A-Za-z]+|[0-9]+)){3,}\\.(htm|html|php)$")));
+        patterns.add(new UrlPatternContains("/webrx", Pattern.compile("webrx3.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+        patterns.add(new UrlPatternContains("/lib", Pattern.compile("lib.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+        patterns.add(new UrlPatternContains("/pdf", Pattern.compile("pdf.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+        patterns.add(new UrlPatternContains("/book", Pattern.compile("book.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+        patterns.add(new UrlPatternContains("/720p", Pattern.compile("720p.*/[A-Za-z]+(-([A-Za-z]+|[0-9]+)){3,}((-[0-9]+)?/|\\.(php|htm|html))$")));
+        patterns.add(new UrlPatternContains("/node", Pattern.compile("/node/.*/[a-z]+(-[a-z0-9]+)+.htm$")));
+
     }

     public boolean isUrlBlocked(EdgeUrl url) {
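The point of UrlPatternContains and UrlPatternMinLength is to run a cheap String check before the comparatively expensive regex match. A small sketch of that guard idea with a generic predicate; the pattern used here is illustrative, not one of the blocklist's real patterns:

import java.util.function.Predicate;
import java.util.regex.Pattern;

class CheapPrecheckSketch {
    // Only fall through to the regex when the cheap substring test already matched.
    record ContainsThenPattern(String contains, Pattern pattern) implements Predicate<String> {
        public boolean test(String s) {
            return s.contains(contains) && pattern.matcher(s).find();
        }
    }

    public static void main(String[] args) {
        var blocked = new ContainsThenPattern("/download",
                Pattern.compile("/download(-\\w+){4,}\\.html$"));

        System.out.println(blocked.test("/download-free-movie-now-fast.html")); // true
        System.out.println(blocked.test("/blog/post.html"));                    // false; the regex never runs
    }
}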
@@ -31,6 +31,8 @@ public class CrawlerRetreiver {
     private static final long DEFAULT_CRAWL_DELAY_MIN_MS = Long.getLong("defaultCrawlDelay", 500);
     private static final long DEFAULT_CRAWL_DELAY_MAX_MS = Long.getLong("defaultCrawlDelaySlow", 2500);

+    private static final int MAX_ERRORS = 10;
+
     private final LinkedList<EdgeUrl> queue = new LinkedList<>();
     private final HttpFetcher fetcher;

@@ -50,6 +52,8 @@ public class CrawlerRetreiver {
     private static final IpBlockList ipBlocklist;
     private static final UrlBlocklist urlBlocklist = new UrlBlocklist();

+    int errorCount = 0;
+
     static {
         try {
             ipBlocklist = new IpBlockList(new GeoIpBlocklist());
@@ -137,7 +141,7 @@ public class CrawlerRetreiver {

         int fetchedCount = 0;

-        while (!queue.isEmpty() && visited.size() < depth) {
+        while (!queue.isEmpty() && visited.size() < depth && errorCount < MAX_ERRORS ) {
             var top = queue.removeFirst();

             if (!robotsRules.isAllowed(top.toString())) {
@@ -179,6 +183,10 @@ public class CrawlerRetreiver {
                 EdgeUrl.parse(d.url).map(EdgeUrl::toString).ifPresent(visited::add);
             }

+            if ("ERROR".equals(d.crawlerStatus)) {
+                errorCount++;
+            }
+
         }

         long crawledTime = System.currentTimeMillis() - startTime;
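The crawler now keeps an error budget: the fetch loop stops once MAX_ERRORS failures have been seen for a domain instead of grinding through the whole queue. A stripped-down sketch of that loop shape; the queue items and fetcher are placeholders, not the project's types:

import java.util.ArrayDeque;
import java.util.Deque;

class ErrorBudgetSketch {
    private static final int MAX_ERRORS = 10;

    interface Fetcher { boolean fetch(String url); } // true means the fetch succeeded

    static int crawl(Deque<String> queue, int depth, Fetcher fetcher) {
        int visited = 0;
        int errorCount = 0;
        while (!queue.isEmpty() && visited < depth && errorCount < MAX_ERRORS) {
            String top = queue.removeFirst();
            visited++;
            if (!fetcher.fetch(top)) {
                errorCount++;   // a run of failures eventually aborts the whole domain
            }
        }
        return visited;
    }

    public static void main(String[] args) {
        Deque<String> queue = new ArrayDeque<>();
        for (int i = 0; i < 100; i++) queue.add("/page-" + i);
        System.out.println(crawl(queue, 50, url -> false)); // stops after 10 failed fetches
    }
}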
@@ -10,6 +10,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
 import nu.marginalia.wmsa.edge.search.model.BrowseResult;

 import java.util.List;
+import java.util.Optional;

 @ImplementedBy(EdgeDataStoreDaoImpl.class)
 public interface EdgeDataStoreDao {
@@ -23,7 +24,7 @@ public interface EdgeDataStoreDao {

     List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);

-    EdgeDomain getDomain(EdgeId<EdgeDomain> id);
+    Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id);


 }
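Returning Optional pushes the missing-row case onto the caller instead of surfacing it as a NoSuchElementException. A hedged example of the calling pattern, with the DAO faked by a map lookup rather than the real implementation:

import java.util.Map;
import java.util.Optional;

class OptionalDaoSketch {
    // Stand-in for the real DAO: domain id -> name lookups that may miss.
    private static final Map<Integer, String> DOMAINS = Map.of(1, "example.com");

    static Optional<String> getDomain(int id) {
        return Optional.ofNullable(DOMAINS.get(id));
    }

    public static void main(String[] args) {
        // The caller decides what "not found" means instead of catching an exception.
        String name = getDomain(2).orElse("(unknown domain)");
        System.out.println(name);
    }
}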
@@ -93,7 +93,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                                 WORDS_TOTAL, FORMAT, FEATURES,
                                 IP, DOMAIN_STATE,
                                 DATA_HASH
-                             FROM EC_URL_VIEW WHERE ID IN
+                             FROM EC_URL_VIEW
+                             WHERE TITLE IS NOT NULL
+                             AND ID IN
                              """ + idString)) {
                 stmt.setFetchSize(ids.size());

@@ -113,7 +115,9 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
                             EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
                             Integer.MAX_VALUE, // rankingId
                             Double.MAX_VALUE, // termScore
-                            1 // resultsFromSameDomain
+                            1, // resultsFromSameDomain
+                            "", // positions
+                            null // result item
                     );
                     if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
                             && Strings.isNullOrEmpty(val.description)
@@ -309,18 +313,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {

     @Override
     @SneakyThrows
-    public EdgeDomain getDomain(EdgeId<EdgeDomain> id) {
+    public Optional<EdgeDomain> getDomain(EdgeId<EdgeDomain> id) {
         try (var connection = dataSource.getConnection()) {

             try (var stmt = connection.prepareStatement("SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID=?")) {
                 stmt.setInt(1, id.id());
                 var rsp = stmt.executeQuery();
                 if (rsp.next()) {
-                    return new EdgeDomain(rsp.getString(1));
+                    return Optional.of(new EdgeDomain(rsp.getString(1)));
                 }
-                throw new NoSuchElementException();
+                return Optional.empty();
             }
         }
     }

 }
@@ -4,7 +4,6 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.zaxxer.hikari.HikariDataSource;
 import gnu.trove.set.hash.TIntHashSet;
-import io.prometheus.client.Counter;
 import io.reactivex.rxjava3.schedulers.Schedulers;
 import lombok.SneakyThrows;
 import org.slf4j.Logger;
@@ -18,8 +17,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
     private final HikariDataSource dataSource;
     private final Logger logger = LoggerFactory.getLogger(getClass());

-    private static final Counter wmsa_blacklist_intercept = Counter.build("wmsa_blacklist_intercept",
-            "wmsa_blacklist_intercept").register();
     @Inject
     public EdgeDomainBlacklistImpl(HikariDataSource dataSource) {
         this.dataSource = dataSource;
@@ -65,7 +62,6 @@ public class EdgeDomainBlacklistImpl implements EdgeDomainBlacklist {
     @Override
     public boolean isBlacklisted(int domainId) {
         if (spamDomainSet.contains(domainId)) {
-            wmsa_blacklist_intercept.inc();
             return true;
         }

@@ -0,0 +1,34 @@
+package nu.marginalia.wmsa.edge.explorer;
+
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import com.google.inject.Injector;
+import nu.marginalia.wmsa.configuration.MainClass;
+import nu.marginalia.wmsa.configuration.ServiceDescriptor;
+import nu.marginalia.wmsa.configuration.module.ConfigurationModule;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.configuration.server.Initialization;
+import spark.Spark;
+
+public class ExplorerMain extends MainClass {
+    final ExplorerService service;
+
+    @Inject
+    public ExplorerMain(ExplorerService service) {
+        this.service = service;
+    }
+
+    public static void main(String... args) {
+        init(ServiceDescriptor.EXPLORER, args);
+
+        Spark.staticFileLocation("/static/explore/");
+
+        Injector injector = Guice.createInjector(
+                new ConfigurationModule(),
+                new DatabaseModule()
+        );
+
+        injector.getInstance(ExplorerMain.class);
+        injector.getInstance(Initialization.class).setReady();
+    }
+}
@@ -0,0 +1,253 @@
+package nu.marginalia.wmsa.edge.explorer;
+
+import com.google.inject.Inject;
+import com.google.inject.name.Named;
+import com.zaxxer.hikari.HikariDataSource;
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.configuration.server.Initialization;
+import nu.marginalia.wmsa.configuration.server.MetricsServer;
+import nu.marginalia.wmsa.configuration.server.Service;
+import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
+import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
+import nu.marginalia.wmsa.resource_store.StaticResources;
+import org.jetbrains.annotations.NotNull;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+import java.sql.SQLException;
+import java.util.*;
+
+public class ExplorerService extends Service {
+
+    private final MustacheRenderer<Object> renderer;
+    private final HikariDataSource dataSource;
+    private final StaticResources staticResources;
+
+    record SearchResult(
+            String domain,
+            String url,
+            double relatedness,
+            boolean hasMore,
+            boolean active,
+            boolean indexed) implements Comparable<SearchResult> {
+
+        @Override
+        public int compareTo(@NotNull SearchResult o) {
+            return (int)(o.relatedness - relatedness);
+        }
+    }
+
+    record SearchResults(String query, String message, String aliasDomain, List<SearchResult> resultList) { }
+
+    @SneakyThrows
+    @Inject
+    public ExplorerService(@Named("service-host") String ip,
+                           @Named("service-port") Integer port,
+                           Initialization initialization,
+                           MetricsServer metricsServer,
+                           RendererFactory rendererFactory,
+                           HikariDataSource dataSource,
+                           StaticResources staticResources
+                           ) {
+
+        super(ip, port, initialization, metricsServer);
+
+        renderer = rendererFactory.renderer("explorer/explorer");
+        this.dataSource = dataSource;
+        this.staticResources = staticResources;
+        Spark.get("/public/", this::serveIndex, this::render);
+        Spark.get("/public/search", this::search, this::render);
+        Spark.get("/public/:resource", this::serveStatic);
+
+    }
+
+
+    private Object serveStatic(Request request, Response response) {
+        String resource = request.params("resource");
+        staticResources.serveStatic("explore", resource, request, response);
+        return "";
+    }
+
+    public String render(Object results) {
+        return renderer.render(results);
+    }
+
+    private SearchResults serveIndex(Request request, Response response) {
+
+        return new SearchResults("", "", null, Collections.emptyList());
+    }
+
+
+    private SearchResults search(Request request, Response response) throws SQLException {
+        String query = request.queryParams("domain");
+
+        query = trimUrlJunk(query);
+
+        DomainIdInformation domainId = getDomainId(query);
+        if (!domainId.isPresent()) {
+            return new SearchResults(query,
+                    "Could not find such a domain (maybe try adding/removing www?)",
+                    null, Collections.emptyList());
+        }
+
+        var relatedDomains = getRelatedDomains(domainId);
+
+        if (relatedDomains.isEmpty()) {
+            String message = """
+                    I've got nothing. This may either be due to the website being far out in the periphery of Marginalia's
+                    search engine index, or it may be due to the website being too big.
+                    A few hundred of the biggest websites are excluded for performance reasons. They are usually
+                    not very interesting to look at either as everyone links to them and there's no real pattern to discern.
+                    """;
+
+            return new SearchResults(query, message, domainId.alias, relatedDomains);
+        }
+
+        return new SearchResults(query, "", domainId.alias, relatedDomains);
+    }
+
+    private List<SearchResult> getRelatedDomains(DomainIdInformation domainIdInformation) throws SQLException {
+        List<SearchResult> ret = new ArrayList<>();
+        Set<String> seen = new HashSet<>();
+
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     SELECT
+                         NV.NEIGHBOR_NAME,
+                         NV.RELATEDNESS,
+                         (LV.DOMAIN_ID IS NOT NULL),
+                         (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
+                         INDEXED > 0
+                     FROM EC_NEIGHBORS_VIEW NV
+                     LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.NEIGHBOR_ID=LV.DOMAIN_ID)
+                     INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.NEIGHBOR_ID
+                     WHERE NV.DOMAIN_ID=?
+                     GROUP BY NV.NEIGHBOR_ID
+                     ORDER BY NV.RELATEDNESS DESC
+                     """);
+             var stmtRev = conn.prepareStatement("""
+                     SELECT
+                         NV.DOMAIN_NAME,
+                         NV.RELATEDNESS,
+                         (LV.NEIGHBOR_ID IS NOT NULL),
+                         (STATE = 'ACTIVE' OR STATE='SOCIAL_MEDIA'),
+                         INDEXED > 0
+                     FROM EC_NEIGHBORS_VIEW NV
+                     LEFT JOIN EC_NEIGHBORS_VIEW LV ON (NV.DOMAIN_ID=LV.NEIGHBOR_ID)
+                     INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=NV.DOMAIN_ID
+                     WHERE NV.NEIGHBOR_ID=?
+                     GROUP BY NV.DOMAIN_ID
+                     ORDER BY NV.RELATEDNESS DESC
+                     """
+             );
+
+        ) {
+
+            stmt.setInt(1, domainIdInformation.domainId);
+            var rsp = stmt.executeQuery();
+            while (rsp.next()) {
+
+                String domainName = rsp.getString(1);
+                double relatedness = rsp.getDouble(2);
+                boolean hasMore = rsp.getBoolean(3);
+                boolean active = rsp.getBoolean(4);
+                boolean indexed = rsp.getBoolean(5);
+
+                seen.add(domainName);
+
+                String url = "http://" + domainName + "/";
+
+
+                if (domainName.length() < 48 && domainName.contains(".")) {
+                    ret.add(new SearchResult(
+                            domainName,
+                            url,
+                            relatedness,
+                            hasMore,
+                            active,
+                            indexed
+                    ));
+                }
+            }
+
+            stmtRev.setInt(1, domainIdInformation.domainId);
+            rsp = stmtRev.executeQuery();
+            while (rsp.next()) {
+
+                String domainName = rsp.getString(1);
+                double relatedness = rsp.getDouble(2);
+                boolean hasMore = rsp.getBoolean(3);
+                boolean active = rsp.getBoolean(4);
+                boolean indexed = rsp.getBoolean(5);
+
+                String url = "http://" + domainName + "/";
+
+                if (!seen.add(domainName))
+                    continue;
+
+                if (domainName.length() < 48 && domainName.contains(".")) {
+                    ret.add(new SearchResult(
+                            domainName,
+                            url,
+                            relatedness,
+                            hasMore,
+                            active,
+                            indexed
+                    ));
+                }
+            }
+        }
+
+        Comparator<SearchResult> comp = SearchResult::compareTo;
+        comp = comp.thenComparing(SearchResult::domain);
+        ret.sort(comp);
+
+        return ret;
+
+    }
+
+    private DomainIdInformation getDomainId(String query) throws SQLException {
+
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     SELECT IFNULL(ALIAS.ID, DOMAIN.ID), DOMAIN.INDEXED>0 OR ALIAS.INDEXED>0, ALIAS.DOMAIN_NAME
+                     FROM EC_DOMAIN DOMAIN
+                     LEFT JOIN EC_DOMAIN ALIAS ON DOMAIN.DOMAIN_ALIAS=ALIAS.ID
+                     WHERE DOMAIN.DOMAIN_NAME=?
+                     """)) {
+            stmt.setString(1, query);
+            var rsp = stmt.executeQuery();
+            if (rsp.next()) {
+                return new DomainIdInformation(
+                        rsp.getInt(1),
+                        rsp.getBoolean(2),
+                        rsp.getString(3)
+                );
+            }
+        }
+        return new DomainIdInformation(-1, false, null);
+    }
+
+    private String trimUrlJunk(String query) {
+        if (query.startsWith("http://")) {
+            query = query.substring(7);
+        }
+        if (query.startsWith("https://")) {
+            query = query.substring(8);
+        }
+
+        int lastSlash = query.indexOf('/');
+        if (lastSlash > 0) {
+            query = query.substring(0, lastSlash);
+        }
+
+        return query;
+    }
+
+    record DomainIdInformation(int domainId, boolean indexed, String alias) {
+        boolean isPresent() {
+            return domainId >= 0;
+        }
+    }
+}
|
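For illustration only (not part of the commit), a standalone sketch of how the trimUrlJunk() helper above normalizes a site query before getDomainId() looks it up; the domain names are made-up examples:

class TrimUrlJunkExample {
    public static void main(String[] args) {
        // Mirrors the logic of trimUrlJunk() above: strip the scheme, then cut at the first slash.
        System.out.println(trim("https://www.example.com/some/page.html")); // -> www.example.com
        System.out.println(trim("http://memex.marginalia.nu"));             // -> memex.marginalia.nu
        System.out.println(trim("example.org/index.php"));                  // -> example.org
    }

    static String trim(String query) {
        if (query.startsWith("http://"))  query = query.substring(7);
        if (query.startsWith("https://")) query = query.substring(8);
        int lastSlash = query.indexOf('/');
        if (lastSlash > 0) query = query.substring(0, lastSlash);
        return query;
    }
}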
@ -1,20 +1,19 @@
 package nu.marginalia.wmsa.edge.index;

 import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
-import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
 import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
 import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
 import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
 import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryRankLimitingFilter;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.util.Collections;
-import java.util.Comparator;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
@ -103,54 +102,65 @@ public class EdgeIndexBucket {
         return indexReader != null;
     }

-    public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
+    public IndexQuery getQuery(LongPredicate filter, IndexQueryParams params) {

         if (null == indexReader) {
-            logger.warn("Index reader not neady {}", block);
+            logger.warn("Index reader not neady {}", params.block());
             return new IndexQuery(Collections.emptyList());
         }

-        final int[] orderedIncludes = searchTerms.includes
-                .stream()
-                .sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
-                .distinct()
-                .mapToInt(Integer::intValue)
-                .toArray();
+        final int[] orderedIncludes = params.searchTerms()
+                .sortedDistinctIncludes((a, b) -> compareKeywords(params.block(), a, b));

-        IndexQueryFactory.IndexQueryBuilder query;
+        IndexQueryFactory.IndexQueryBuilder query = createQueryBuilder(orderedIncludes[0], params);

-        query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
         if (query == null) {
             return new IndexQuery(Collections.emptyList());
         }

-        query.filter(filter);
+        query.addInclusionFilter(new QueryFilterStepFromPredicate(filter));
+        if (params.rankLimit() != null) {
+            query.addInclusionFilter(new QueryRankLimitingFilter(params.rankLimit()));
+        }

         for (int i = 1; i < orderedIncludes.length; i++) {
             query = query.also(orderedIncludes[i]);
         }

-        for (int term : searchTerms.excludes) {
+        for (int term : params.searchTerms().excludes()) {
             query = query.not(term);
         }

         return query.build();
     }

+    private IndexQueryFactory.IndexQueryBuilder createQueryBuilder(int firstKeyword, IndexQueryParams params) {
+        if (params.targetDomains() != null && !params.targetDomains().isEmpty()) {
+            return indexReader.findWordForDomainList(params.block(), params.targetDomains(), firstKeyword);
+        }
+        return indexReader.findWord(params.block(), params.qualityLimit(), firstKeyword);
+    }
+
+    private int compareKeywords(IndexBlock block, int a, int b) {
+        return Long.compare(
+                indexReader.numHits(block, a),
+                indexReader.numHits(block, b)
+        );
+    }
+
-    public IndexQuery getDomainQuery(IndexQueryCachePool pool, int wordId, ResultDomainDeduplicator localFilter) {
-        var query = indexReader.findDomain(pool, wordId);
+    public IndexQuery getDomainQuery(int wordId, ResultDomainDeduplicator localFilter) {
+        var query = indexReader.findDomain(wordId);

         query.addInclusionFilter(new QueryFilterStepFromPredicate(localFilter::filterRawValue));

         return query;
     }

-    public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
-        return indexReader.getBlockForResult(cachePool, termId, urlId);
-    }
-
-    public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
-        return indexReader.isTermInBucket(cachePool, block, termId, urlId);
-    }
+    /** Replaces the values of ids with their associated metadata, or 0L if absent */
+    public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
+        return indexReader.getMetadata(block, termId, ids);
+    }

 }
@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.index;


 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.conversion.ConversionUnnecessaryException;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;

 import java.io.IOException;

@ -18,9 +18,6 @@ public class EdgeIndexControl {
     }

     public void regenerateIndex(int id) {
-        System.runFinalization();
-        System.gc();
-
         for (IndexBlock block : IndexBlock.values()) {
             try {
                 servicesFactory.convertIndex(id, block);
@ -9,6 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.configuration.server.MetricsServer;
 import nu.marginalia.wmsa.configuration.server.Service;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexDomainQueryService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
@ -39,7 +40,9 @@ public class EdgeIndexService extends Service {

                            EdgeIndexOpsService opsService,
                            EdgeIndexLexiconService lexiconService,
-                           EdgeIndexQueryService indexQueryService)
+                           EdgeIndexQueryService indexQueryService,
+                           EdgeIndexDomainQueryService domainQueryService
+                           )
     {
         super(ip, port, init, metricsServer);

@ -51,7 +54,7 @@ public class EdgeIndexService extends Service {
         Spark.post("/words/", lexiconService::putWords);

         Spark.post("/search/", indexQueryService::search, gson::toJson);
-        Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
+        Spark.post("/search-domain/", domainQueryService::searchDomain, gson::toJson);

         Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);

@ -103,9 +103,9 @@ public class IndexServicesFactory {

     public void convertIndex(int id, IndexBlock block) throws ConversionUnnecessaryException, IOException {
         var converter = new SearchIndexConverter(block, id, tmpFileDir,
-                preconverterOutputFile.get(id, block.ordinal()),
-                indexWriteWordsFile.get(id, block.id),
-                indexWriteUrlsFile.get(id, block.id),
+                preconverterOutputFile.get(id, block),
+                indexWriteWordsFile.get(id, block),
+                indexWriteUrlsFile.get(id, block),
                 partitioner,
                 domainBlacklist
         );
@ -118,7 +118,7 @@ public class IndexServicesFactory {

        for (int index = 0; index < (DYNAMIC_BUCKET_LENGTH + 1); index++) {
            for (IndexBlock block : IndexBlock.values()) {
-               shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block.ordinal()));
+               shards.put(new SearchIndexPreconverter.Shard(index, block.ordinal()), getPreconverterOutputFile(index, block));
            }
        }

@ -129,7 +129,7 @@ public class IndexServicesFactory {
        );
    }

-   private File getPreconverterOutputFile(int index, int block) {
+   private File getPreconverterOutputFile(int index, IndexBlock block) {
        return preconverterOutputFile.get(index, block);
    }

@ -141,7 +141,7 @@ public class IndexServicesFactory {
                indexMap.put(block, createSearchIndex(id, block));
            }
            catch (Exception ex) {
-               logger.error("Could not create index {}-{}", id, block);
+               logger.error("Could not create index {}-{} ({})", id, block, ex.getMessage());
            }
        }
        return new SearchIndexReader(indexMap);
@ -150,8 +150,8 @@ public class IndexServicesFactory {
    private SearchIndex createSearchIndex(int bucketId, IndexBlock block) {
        try {
            return new SearchIndex("IndexReader"+bucketId+":"+ block.name(),
-                   indexReadUrlsFile.get(bucketId, block.id),
-                   indexReadWordsFile.get(bucketId, block.id));
+                   indexReadUrlsFile.get(bucketId, block),
+                   indexReadWordsFile.get(bucketId, block));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
@ -159,9 +159,10 @@ public class IndexServicesFactory {

    public Callable<Boolean> switchFilesJob(int id) {
        return () -> {
-           for (int block = 0; block < IndexBlock.values().length; block++) {
+
+           for (var block : IndexBlock.values()) {
                if (Files.exists(indexWriteWordsFile.get(id, block).toPath()) &&
                    Files.exists(indexWriteUrlsFile.get(id, block).toPath())) {
                    Files.move(
                            indexWriteWordsFile.get(id, block).toPath(),
                            indexReadWordsFile.get(id, block).toPath(),
@ -172,6 +173,7 @@ public class IndexServicesFactory {
                            StandardCopyOption.REPLACE_EXISTING);
                }
            }
+
            return true;
        };
    }
@ -205,8 +207,8 @@ class PartitionedDataFile {
        this.pattern = pattern;
    }

-   public File get(int id) {
-       Path partitionDir = partition.resolve(Integer.toString(id));
+   public File get(Object id) {
+       Path partitionDir = partition.resolve(id.toString());
        if (!partitionDir.toFile().exists()) {
            partitionDir.toFile().mkdir();
        }
@ -223,13 +225,13 @@ class DoublePartitionedDataFile {
        this.pattern = pattern;
    }

-   public File get(int id, int id2) {
-       Path partitionDir = partition.resolve(Integer.toString(id));
+   public File get(Object id, Object id2) {
+       Path partitionDir = partition.resolve(id.toString());

        if (!partitionDir.toFile().exists()) {
            partitionDir.toFile().mkdir();
        }
-       partitionDir = partitionDir.resolve(Integer.toString(id2));
+       partitionDir = partitionDir.resolve(id2.toString());
        if (!partitionDir.toFile().exists()) {
            partitionDir.toFile().mkdir();
        }
@ -47,6 +47,9 @@ public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
             var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
             wordSetBuilder.setIndex(wordSet.block().ordinal());
             wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
+            for (var meta : wordSet.metadata()) {
+                wordSetBuilder.addMeta(meta);
+            }
             keywordBuilder.addWordSet(wordSetBuilder.build());

         var req = keywordBuilder.build();
@ -21,7 +21,6 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.util.Arrays;
-import java.util.List;

 @Singleton
 public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@ -53,9 +52,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
             return;
         }

-        for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {
+        for (var chunk : ListChunker.chopList(wordSet, SearchIndexJournalEntry.MAX_LENGTH)) {

-            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
+            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
             var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());

             indexWriter.put(header, entry);
@ -63,19 +62,22 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {

     }

-    private long[] getOrInsertWordIds(List<String> words) {
-        long[] ids = new long[words.size()];
-        int putId = 0;
+    private long[] getOrInsertWordIds(String[] words, long[] meta) {
+        long[] ids = new long[words.length*2];
+        int putIdx = 0;

-        for (String word : words) {
+        for (int i = 0; i < words.length; i++) {
+            String word = words[i];
+
             long id = lexicon.getOrInsert(word);
             if (id != DictionaryHashMap.NO_VALUE) {
-                ids[putId++] = id;
+                ids[putIdx++] = id;
+                ids[putIdx++] = meta[i];
             }
         }

-        if (putId != words.size()) {
-            ids = Arrays.copyOf(ids, putId);
+        if (putIdx != words.length*2) {
+            ids = Arrays.copyOf(ids, putIdx);
         }
         return ids;
     }
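The change above interleaves a metadata long after every keyword id, so the journal entry array now holds (id, metadata) pairs rather than bare ids. A minimal, self-contained sketch of that layout; the class and helper names below are illustrative, not from the commit:

import java.util.Arrays;

class InterleavedEntrySketch {
    // Hypothetical stand-in for lexicon.getOrInsert(); returns a stable id per word.
    static long wordId(String word) {
        return Math.abs(word.hashCode());
    }

    // Builds the same [id, meta, id, meta, ...] shape that getOrInsertWordIds() produces.
    static long[] interleave(String[] words, long[] meta) {
        long[] ids = new long[words.length * 2];
        int putIdx = 0;
        for (int i = 0; i < words.length; i++) {
            ids[putIdx++] = wordId(words[i]);
            ids[putIdx++] = meta[i];
        }
        return ids;
    }

    public static void main(String[] args) {
        long[] entry = interleave(new String[] {"marginalia", "search"}, new long[] {0b101L, 0b1L});
        // Even slots hold word ids, odd slots hold the word metadata that travels with them.
        System.out.println(Arrays.toString(entry));
    }
}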
@ -20,12 +20,14 @@ import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;

-import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;

 public class SearchIndexConverter {
-    public static final BTreeContext urlsBTreeContext = new BTreeContext(5, 1, ~0, 8);
-    private final long[] tmpWordsBuffer = new long[MAX_LENGTH];
+    public static final int ENTRY_URL_OFFSET = 0;
+    public static final int ENTRY_METADATA_OFFSET = 1;
+    public static final int ENTRY_SIZE = 2;
+
+    public static final BTreeContext urlsBTreeContext = new BTreeContext(5, ENTRY_SIZE, ~0, 8);
+
+    private final long[] tmpWordsBuffer = SearchIndexJournalReader.createAdequateTempBuffer();

     private final Path tmpFileDir;

@ -72,7 +74,7 @@ public class SearchIndexConverter {
             return;
         }

-        logger.info("Converting {} ({}) {} {}", block.id, block, inputFile, journalReader.fileHeader);
+        logger.info("Converting {} ({}) {} {}", block.ordinal(), block, inputFile, journalReader.fileHeader);

         var lock = partitioner.getReadLock();
         try {
@ -80,10 +82,10 @@ public class SearchIndexConverter {

             var tmpUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");

-            logger.info("Creating word index table {} for block {} ({})", outputFileWords, block.id, block);
+            logger.info("Creating word index table {} for block {}", outputFileWords, block.ordinal());
             WordIndexOffsetsTable wordIndexTable = createWordIndexTable(journalReader, outputFileWords);

-            logger.info("Creating word urls table {} for block {} ({})", outputFileUrls, block.id, block);
+            logger.info("Creating word urls table {} for block {}", outputFileUrls, block.ordinal());
             createUrlTable(journalReader, tmpUrlsFile, wordIndexTable);

             Files.delete(tmpUrlsFile);
@ -111,10 +113,10 @@ public class SearchIndexConverter {

             final SearchIndexJournalEntry entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);

-            for (int i = 0; i < entryData.size(); i++) {
-                int wordId = (int) entryData.get(i);
+            for (var record : entryData) {
+                int wordId = record.wordId();
                 if (wordId < 0 || wordId >= topWord) {
-                    logger.warn("Bad wordId {}", wordId);
+                    logger.warn("Bad word {}", record);
                 }
                 wordsTableWriter.acceptWord(wordId);
             }
@ -138,7 +140,7 @@ public class SearchIndexConverter {
        try (RandomAccessFile urlsTmpFileRAF = new RandomAccessFile(tmpUrlsFile.toFile(), "rw");
             FileChannel urlsTmpFileChannel = urlsTmpFileRAF.getChannel()) {

-            try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, numberOfWordsTotal, 10_000_000)) {
+            try (RandomWriteFunnel rwf = new RandomWriteFunnel(tmpFileDir, ENTRY_SIZE * numberOfWordsTotal, 10_000_000)) {
                 int[] wordWriteOffset = new int[wordOffsetsTable.length()];

                 for (var entry : journalReader) {
@ -146,21 +148,29 @@ public class SearchIndexConverter {

                     var entryData = entry.readEntryUsingBuffer(tmpWordsBuffer);

-                    for (int i = 0; i < entryData.size(); i++) {
-                        int wordId = (int) entryData.get(i);
+                    for (var record : entryData) {
+                        int wordId = record.wordId();
+                        long metadata = record.metadata();

-                        if (wordId >= wordWriteOffset.length)
+                        if (wordId >= wordWriteOffset.length) {
+                            logger.warn("Overflowing wordId {}", wordId);
                             continue;
+                        }

                         if (wordId < 0) {
                             logger.warn("Negative wordId {}", wordId);
                         }

                         final long urlInternal = translateUrl(entry.docId());
-                        if (wordId > 0) {
-                            rwf.put(wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId]++, urlInternal);
-                        } else {
-                            rwf.put(wordWriteOffset[wordId]++, urlInternal);
-                        }
+
+                        long offset;
+                        if (wordId > 0) offset = wordOffsetsTable.get(wordId - 1) + wordWriteOffset[wordId];
+                        else offset = wordWriteOffset[wordId];
+
+                        rwf.put(offset + ENTRY_URL_OFFSET, urlInternal);
+                        rwf.put(offset + ENTRY_METADATA_OFFSET, metadata);
+
+                        wordWriteOffset[wordId] += ENTRY_SIZE;
                     }
                 }

@ -171,9 +181,9 @@ public class SearchIndexConverter {

             try (var urlsTmpFileMap = MultimapFileLong.forOutput(tmpUrlsFile, numberOfWordsTotal)) {
                 if (wordOffsetsTable.length() > 0) {
-                    var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit);
+                    var urlTmpFileSorter = urlsTmpFileMap.createSorter(tmpFileDir, internalSortLimit, ENTRY_SIZE);

-                    wordOffsetsTable.forEachRange(urlTmpFileSorter::sort);
+                    wordOffsetsTable.forEachRange(urlTmpFileSorter::sortRange);

                     urlsTmpFileMap.force();
                 } else {
@ -187,7 +197,7 @@ public class SearchIndexConverter {
             wordOffsetsTable.foldRanges((accumulatorIdx, start, length) -> {
                 // Note: The return value is accumulated into accumulatorIdx!

-                return writer.write(accumulatorIdx, length,
+                return writer.write(accumulatorIdx, length/ENTRY_SIZE,
                         slice -> slice.transferFromFileChannel(urlsTmpFileChannel, 0, start, start + length));
             });

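After this change each posting in the url table is two longs wide: the url id at ENTRY_URL_OFFSET and the packed word metadata at ENTRY_METADATA_OFFSET. A small sketch of the resulting layout and the offset arithmetic, written against a plain long[] rather than the RandomWriteFunnel used above (values and names are made up):

class UrlTableLayoutSketch {
    static final int ENTRY_URL_OFFSET = 0;
    static final int ENTRY_METADATA_OFFSET = 1;
    static final int ENTRY_SIZE = 2;

    public static void main(String[] args) {
        // A word's posting list occupies slots [base, base + n*ENTRY_SIZE) of the table.
        long[] table = new long[8];
        int base = 2;          // where this word's range starts (from the offsets table)
        int writeOffset = 0;   // per-word write cursor, advanced by ENTRY_SIZE per posting

        long[][] postings = { {101L, 0xCAFEL}, {207L, 0xBEEFL} }; // (urlId, metadata) pairs
        for (long[] p : postings) {
            long offset = base + writeOffset;
            table[(int) (offset + ENTRY_URL_OFFSET)] = p[0];
            table[(int) (offset + ENTRY_METADATA_OFFSET)] = p[1];
            writeOffset += ENTRY_SIZE;
        }

        // table now reads: [0, 0, 101, 51966, 207, 48879, 0, 0]
        for (long v : table) System.out.print(v + " ");
    }
}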
@ -9,7 +9,6 @@ import gnu.trove.set.hash.TIntHashSet;
 import lombok.SneakyThrows;
 import nu.marginalia.util.ranking.BetterReversePageRank;
 import nu.marginalia.util.ranking.BetterStandardPageRank;
-import nu.marginalia.util.ranking.BuggyStandardPageRank;
 import nu.marginalia.util.ranking.RankingDomainFetcher;
 import nu.marginalia.wmsa.edge.index.model.RankingSettings;
 import org.slf4j.Logger;
@ -87,8 +86,25 @@ public class SearchIndexDao {

     @SneakyThrows
     public TIntList getStandardDomains() {
-        var spr = new BuggyStandardPageRank(rankingDomains,rankingSettings.standard.toArray(String[]::new));
-        return spr.pageRankWithPeripheralNodes(spr.size()/2);
+        TIntArrayList results = new TIntArrayList();
+
+        try (var connection = dataSource.getConnection();
+             var stmt = connection.prepareStatement(
+                     """
+                     SELECT ID FROM EC_DOMAIN
+                     WHERE INDEXED>0
+                     AND STATE='ACTIVE'
+                     AND DOMAIN_ALIAS IS NULL
+                     ORDER BY ID ASC
+                     """);
+        ) {
+            var rs = stmt.executeQuery();
+            while (rs.next()) {
+                results.add(rs.getInt(1));
+            }
+        }
+        return results;
+
     }

     @SneakyThrows
@ -110,11 +110,12 @@ public class SearchIndexPartitioner {
             return true;
         if (academiaRanking.hasBucket(bucketId, domainId))
             return true;
-        if (standardRanking.hasBucket(bucketId, domainId))
-            return true;
         if (specialDomainRanking.hasBucket(bucketId, domainId))
             return true;

+        if (standardRanking.hasBucket(bucketId, domainId))
+            return true;
+
         return DYNAMIC_BUCKET_LENGTH == bucketId;
     }

@ -148,15 +149,15 @@ public class SearchIndexPartitioner {
         if (academiaRanking != null && academiaRanking.ownsBucket(bucketId)) {
             return academiaRanking.translateId(id);
         }
-        if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
-            return standardRanking.translateId(id);
-        }
         if (specialDomainRanking != null && specialDomainRanking.ownsBucket(bucketId)) {
             return specialDomainRanking.translateId(id);
         }
-        if (retroRanking != null) {
-            return retroRanking.translateId(id);
+
+        // standard gets passed traight through
+        if (standardRanking != null && standardRanking.ownsBucket(bucketId)) {
+            return id;
         }

         return id;
     }

@ -52,7 +52,7 @@ public class SearchIndexPreconverter {
         var lock = partitioner.getReadLock();
         try {
             lock.lock();
-            ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
+            ByteBuffer buffer = ByteBuffer.allocateDirect(65536);
             for (var entry : indexJournalReader) {
                 if (!partitioner.isGoodUrl(entry.urlId())
                     || spamDomains.contains(entry.domainId())) {
@ -93,7 +93,7 @@ public class SearchIndexPreconverter {
         }

         public boolean shouldWrite(SearchIndexPartitioner partitioner, SearchIndexJournalReader.JournalEntry entry) {
-            return shard.block == entry.header.block().id
+            return shard.block == entry.header.block().ordinal()
                     && partitioner.filterUnsafe(entry.domainId(), shard.bucket);
         }

@ -23,10 +23,10 @@ public class WordIndexOffsetsTable {

         for (int i = 1; i < table.length; i++) {
             long start = table[i-1];
-            int length = (int) (table[i] - start);
+            long end = table[i];

-            if (length != 0) {
-                o.accept(start, length);
+            if (start != end) {
+                o.accept(start, end);
             }
         }
     }
@ -58,7 +58,7 @@ public class WordIndexOffsetsTable {
     }

     public interface OffsetTableEntryConsumer {
-        void accept(long start, int length) throws IOException;
+        void accept(long start, long end) throws IOException;
     }

     public interface OffsetTableEntryFoldConsumer {
@ -8,8 +8,10 @@ import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.io.*;
+import java.io.File;
+import java.io.IOException;

+import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.ENTRY_SIZE;
 import static nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter.urlsBTreeContext;

 public class WordsTableWriter {
@ -23,7 +25,9 @@ public class WordsTableWriter {
     }

     public void acceptWord(int wordId) {
-        table.lengths().increment(wordId);
+        for (int i = 0; i < ENTRY_SIZE; i++) {
+            table.lengths().increment(wordId);
+        }
     }

     public WordIndexOffsetsTable getTable() {
@ -58,7 +62,7 @@ public class WordsTableWriter {
             mapSlice.put(idx++, (long)length<<32);
             mapSlice.put(idx++, 0);

-            urlFileOffset += (urlsBTreeContext.calculateSize(length));
+            urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
         }

         for (int i = 1; i < offsetTable.length; i++) {
@ -68,7 +72,7 @@ public class WordsTableWriter {
             mapSlice.put(idx++, (long)length << 32 | i);
             mapSlice.put(idx++, urlFileOffset);

-            urlFileOffset += (urlsBTreeContext.calculateSize(length));
+            urlFileOffset += (urlsBTreeContext.calculateSize(length / ENTRY_SIZE));
         }
     }
 }
@ -12,6 +12,8 @@ import org.jetbrains.annotations.NotNull;
 import java.nio.ByteBuffer;
 import java.util.Iterator;

+import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.ENTRY_SIZE;
+import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry.MAX_LENGTH;
 import static nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader.HEADER_SIZE_LONGS;

 public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
@ -23,6 +25,10 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
     private final MultimapFileLongSlice map;
     private final long committedSize;

+    public static long[] createAdequateTempBuffer() {
+        return new long[MAX_LENGTH*ENTRY_SIZE];
+    }
+
     public SearchIndexJournalReader(MultimapFileLong map) {
         fileHeader = new SearchIndexJournalFileHeader(map.get(0), map.get(1));
         committedSize = map.get(0) / 8 - FILE_HEADER_SIZE_LONGS;
@ -92,7 +98,7 @@ public class SearchIndexJournalReader implements Iterable<SearchIndexJournalReader.JournalEntry> {
         public IndexBlock block() {
             return header.block();
         }
-        public int wordCount() { return header.entrySize(); }
+        public int wordCount() { return header.entrySize() / ENTRY_SIZE; }

         public SearchIndexJournalEntry readEntry() {
             long[] dest = new long[header.entrySize()];
@ -26,7 +26,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
     private RandomAccessFile raf;
     private FileChannel channel;

-    public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*32*8*4;
+    public static final int MAX_BLOCK_SIZE = SearchIndexJournalEntry.MAX_LENGTH*128*8*4;
     private final ByteBuffer byteBuffer;
     private long pos;

@ -83,7 +83,7 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
             byteBuffer.clear();

             byteBuffer.putInt(entryData.size());
-            byteBuffer.putInt(header.block().id);
+            byteBuffer.putInt(header.block().ordinal());
             byteBuffer.putLong(header.documentId());

             entryData.write(byteBuffer);
@ -2,12 +2,14 @@ package nu.marginalia.wmsa.edge.index.journal.model;

 import java.nio.ByteBuffer;
 import java.util.Arrays;
+import java.util.Iterator;

-public class SearchIndexJournalEntry {
+public class SearchIndexJournalEntry implements Iterable<SearchIndexJournalEntry.Record> {
     private final int size;
     private final long[] underlyingArray;

     public static final int MAX_LENGTH = 1000;
+    public static final int ENTRY_SIZE = 2;

     public SearchIndexJournalEntry(long[] underlyingArray) {
         this.size = underlyingArray.length;
@ -46,4 +48,24 @@ public class SearchIndexJournalEntry {
         return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray()));
     }

+    public Iterator<Record> iterator() {
+        return new EntryIterator();
+    }
+
+    private class EntryIterator implements Iterator<Record> {
+        int pos = -ENTRY_SIZE;
+
+        public boolean hasNext() {
+            return pos + ENTRY_SIZE < size;
+        }
+
+        @Override
+        public Record next() {
+            pos+=ENTRY_SIZE;
+
+            return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
+        }
+    }
+
+    public record Record(int wordId, long metadata) {}
 }
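The EntryIterator above walks the backing long[] two slots at a time, yielding (wordId, metadata) records. A minimal standalone illustration of that stride-2 access pattern, not the class itself (the sample values are invented):

class StrideTwoIterationSketch {
    record Record(int wordId, long metadata) {}

    public static void main(String[] args) {
        // Same shape the journal entry stores: [wordId, metadata, wordId, metadata, ...]
        long[] underlying = { 42L, 0x0101L, 77L, 0x0004L, 99L, 0x8000L };
        final int ENTRY_SIZE = 2;

        for (int pos = 0; pos + ENTRY_SIZE <= underlying.length; pos += ENTRY_SIZE) {
            Record r = new Record((int) underlying[pos], underlying[pos + 1]);
            System.out.println(r);
        }
    }
}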
@ -5,6 +5,7 @@ import com.google.common.hash.Hashing;
 import io.prometheus.client.Gauge;
 import lombok.SneakyThrows;
 import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.util.dict.DictionaryMap;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -16,7 +17,7 @@ import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;

 public class KeywordLexicon implements AutoCloseable {
-    private final DictionaryHashMap reverseIndex;
+    private final DictionaryMap reverseIndex;

     private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
     private final Logger logger = LoggerFactory.getLogger(getClass());
@ -30,7 +31,7 @@ public class KeywordLexicon implements AutoCloseable {
     private final KeywordLexiconJournal journal;

     @SneakyThrows
-    public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryHashMap reverseIndexHashMap) {
+    public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal, DictionaryMap reverseIndexHashMap) {

         journal = keywordLexiconJournal;
         reverseIndex = reverseIndexHashMap;
@ -1,16 +0,0 @@
-package nu.marginalia.wmsa.edge.index.model;
-
-import lombok.AllArgsConstructor;
-
-import java.util.ArrayList;
-import java.util.List;
-
-@AllArgsConstructor
-public class EdgeIndexSearchTerms {
-    public List<Integer> includes = new ArrayList<>();
-    public List<Integer> excludes = new ArrayList<>();
-
-    public boolean isEmpty() {
-        return includes.isEmpty();
-    }
-}
@ -0,0 +1,32 @@
+package nu.marginalia.wmsa.edge.index.model;
+
+import java.util.EnumSet;
+
+public enum EdgePageWordFlags {
+    Title,
+    Subjects,
+    NamesWords,
+    Site,
+    SiteAdjacent,
+    Simple;
+
+    public int asBit() {
+        return 1 << ordinal();
+    }
+
+    public boolean isPresent(long value) {
+        return (asBit() & value) > 0;
+    }
+
+    public static EnumSet<EdgePageWordFlags> decode(long encodedValue) {
+        EnumSet<EdgePageWordFlags> ret = EnumSet.noneOf(EdgePageWordFlags.class);
+
+        for (EdgePageWordFlags f : values()) {
+            if ((encodedValue & f.asBit()) > 0) {
+                ret.add(f);
+            }
+        }
+
+        return ret;
+    }
+}
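Each flag occupies one bit, indexed by the enum's ordinal, so a set of flags round-trips through a plain long. A small worked example of that encode/decode cycle, written with a local copy of the same bit logic so it runs on its own (the flag set chosen is arbitrary):

import java.util.EnumSet;

class WordFlagsSketch {
    // Mirror of the enum above, so the example compiles standalone.
    enum Flag { Title, Subjects, NamesWords, Site, SiteAdjacent, Simple }

    static long encode(EnumSet<Flag> flags) {
        long v = 0;
        for (Flag f : flags) v |= 1L << f.ordinal();   // same idea as asBit()
        return v;
    }

    static EnumSet<Flag> decode(long v) {
        EnumSet<Flag> ret = EnumSet.noneOf(Flag.class);
        for (Flag f : Flag.values()) {
            if ((v & (1L << f.ordinal())) != 0) ret.add(f);
        }
        return ret;
    }

    public static void main(String[] args) {
        long encoded = encode(EnumSet.of(Flag.Title, Flag.Site));
        // Title is bit 0 and Site is bit 3, so encoded == 0b1001 == 9
        System.out.println(encoded + " -> " + decode(encoded));
    }
}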
@ -0,0 +1,90 @@
+package nu.marginalia.wmsa.edge.index.model;
+
+import nu.marginalia.util.BrailleBlockPunchCards;
+
+import java.util.EnumSet;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public record EdgePageWordMetadata(int tfIdf,
+                                   int positions,
+                                   int quality,
+                                   int count,
+                                   EnumSet<EdgePageWordFlags> flags) {
+
+    // If flags are moved from the least significant end of
+    // this struct, then EntrySourceFromBTree will break.
+
+    public static final long COUNT_MASK = 0xFL;
+    public static final int COUNT_SHIFT = 8;
+
+    public static final long QUALITY_MASK = 0xFL;
+    public static final int QUALITY_SHIFT = 12;
+
+    public static final long TF_IDF_MASK = 0xFFFFL;
+    public static final int TF_IDF_SHIFT = 16;
+
+    public static final int POSITIONS_SHIFT = 32;
+
+    public EdgePageWordMetadata(long value) {
+        this(
+                (int)((value >>> TF_IDF_SHIFT) & TF_IDF_MASK),
+                (int)(value >>> POSITIONS_SHIFT),
+                (int)((value >>> QUALITY_SHIFT) & QUALITY_MASK),
+                (int)((value >>> COUNT_SHIFT) & COUNT_MASK),
+                EdgePageWordFlags.decode(value)
+        );
+    }
+
+    public static int decodeQuality(long encoded) {
+        return (int)((encoded >>> QUALITY_SHIFT) & QUALITY_MASK);
+    }
+
+    public static boolean hasFlags(long encoded, long metadataBitMask) {
+        return (encoded & metadataBitMask) == encoded;
+    }
+
+    public String toString() {
+        StringBuilder sb = new StringBuilder(getClass().getSimpleName());
+        sb.append('[')
+                .append("tfidf=").append(tfIdf).append(", ")
+                .append("quality=").append(quality).append(", ")
+                .append("count=").append(count).append(", ")
+                .append("positions=[").append(BrailleBlockPunchCards.printBits(positions, 32)).append(']');
+        sb.append(", flags=").append(flags).append(']');
+        return sb.toString();
+    }
+
+    /* Encoded in a 64 bit long as
+         0-8 flags
+        8-12 count,
+       12-16 quality,
+       16-32 tf-idf [0, 65536]
+       32-64 position mask
+     */
+    public long encode() {
+        long ret = 0;
+
+        for (var flag : flags) {
+            ret |= flag.asBit();
+        }
+
+        ret |= min(TF_IDF_MASK, max(0, tfIdf)) << TF_IDF_SHIFT;
+        ret |= min(COUNT_MASK, max(0, count)) << COUNT_SHIFT;
+        ret |= min(QUALITY_MASK, max(0, quality)) << QUALITY_SHIFT;
+        ret |= ((long)(positions)) << POSITIONS_SHIFT;
+
+        return ret;
+    }
+
+    public boolean isEmpty() {
+        return count == 0 && positions == 0 && flags.isEmpty() && tfIdf == 0;
+    }
+
+    public static long emptyValue() {
+        return 0L;
+    }
+
+}
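The comment above fixes the layout of the packed metadata long: flags in bits 0-7, count in 8-11, quality in 12-15, tf-idf in 16-31, and the position mask in the upper 32 bits. A worked example of packing and unpacking one value with the same shifts and masks, standalone and with made-up field values:

class WordMetadataPackingSketch {
    public static void main(String[] args) {
        long flags = 0b1001L;     // e.g. Title (bit 0) and Site (bit 3)
        long count = 3;           // 4 bits, 8-11
        long quality = 5;         // 4 bits, 12-15
        long tfIdf = 1234;        // 16 bits, 16-31
        long positions = 0b1011L; // 32 bits, 32-63

        long packed = flags
                | (count << 8)
                | (quality << 12)
                | (tfIdf << 16)
                | (positions << 32);

        // Unpacking reverses the shifts and masks off the neighbouring fields.
        System.out.println("flags     = " + (packed & 0xFF));           // 9
        System.out.println("count     = " + ((packed >>> 8) & 0xF));    // 3
        System.out.println("quality   = " + ((packed >>> 12) & 0xF));   // 5
        System.out.println("tfIdf     = " + ((packed >>> 16) & 0xFFFF));// 1234
        System.out.println("positions = " + (packed >>> 32));           // 11
    }
}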
@ -1,20 +0,0 @@
-package nu.marginalia.wmsa.edge.index.model;
-
-import lombok.AllArgsConstructor;
-import lombok.Getter;
-import lombok.ToString;
-import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
-import nu.marginalia.wmsa.edge.model.id.EdgeId;
-
-@AllArgsConstructor @Getter
-@ToString
-public class EdgePutWordsRequest {
-    public EdgeId<EdgeDomain> domainId;
-    public EdgeId<EdgeUrl> urlId;
-    public double quality;
-
-    public EdgePageWordSet wordSet;
-    private int index = 0;
-}
@ -1,47 +1,35 @@
 package nu.marginalia.wmsa.edge.index.model;

 public enum IndexBlock {
-    TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
-    Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),
-
-    Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),
-
-    Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
-    NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),
-
-    Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
-    Meta(IndexBlockType.PAGE_DATA, 6, 7),
-
-    Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
-    Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
-    Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),
-
-    Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
-    Words_2(IndexBlockType.PAGE_DATA,11, 3.5),
-    Words_4(IndexBlockType.PAGE_DATA,12, 4.0),
-    Words_8(IndexBlockType.PAGE_DATA,13, 4.5),
-    Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0),
-
-    Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
+    Title(IndexBlockType.PAGE_DATA),
+    Meta(IndexBlockType.PAGE_DATA),
+
+    Words_1(IndexBlockType.PAGE_DATA),
+    Words_2(IndexBlockType.PAGE_DATA),
+    Words_4(IndexBlockType.PAGE_DATA),
+    Words_8(IndexBlockType.PAGE_DATA),
+    Words_16Plus(IndexBlockType.PAGE_DATA),
+
+    Link(IndexBlockType.QUALITY_SIGNAL),
+    Site(IndexBlockType.QUALITY_SIGNAL),
+
+    Artifacts(IndexBlockType.PAGE_DATA),
+
+    Tfidf_High(IndexBlockType.TRANSIENT),
+    Subjects(IndexBlockType.TRANSIENT)
     ;

     public final IndexBlockType type;
-    public final int id;
-    public final double sortOrder;

-    IndexBlock(IndexBlockType type, int id, double sortOrder) {
+    IndexBlock(IndexBlockType type) {
         this.type = type;
-        this.sortOrder = sortOrder;
-        this.id = id;
     }

+    // This is kind of a hot method, and Enum.values() allocates a new
+    // array each call.
+    private static final IndexBlock[] values = IndexBlock.values();
     public static IndexBlock byId(int id) {
-        for (IndexBlock block : values()) {
-            if (id == block.id) {
-                return block;
-            }
-        }
-        throw new IllegalArgumentException("Bad block id");
+        return values[id];
     }
 }
@ -1,7 +1,10 @@
 package nu.marginalia.wmsa.edge.index.model;

 public enum IndexBlockType {
+    /** This block is only used for joins */
     QUALITY_SIGNAL,
-    TF_IDF,
-    PAGE_DATA
+    /** This block contains page keywords */
+    PAGE_DATA,
+    /** This block is only used for generation */
+    TRANSIENT
 }
@ -2,7 +2,6 @@ package nu.marginalia.wmsa.edge.index.reader;

 import com.upserve.uppend.blobs.NativeIO;
 import nu.marginalia.util.btree.BTreeReader;
-import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -17,7 +16,6 @@ import static nu.marginalia.wmsa.edge.index.conversion.words.WordsTableWriter.wordsBTreeContext;
 public class IndexWordsTable implements AutoCloseable {
     protected final MultimapFileLong words;
     protected final BTreeReader reader;
-    protected final BTreeHeader header;
     protected final int HEADER_OFFSET = 1;
     final Logger logger = LoggerFactory.getLogger(getClass());

@ -26,8 +24,7 @@ public class IndexWordsTable implements AutoCloseable {
     public IndexWordsTable(MultimapFileLong words) {
         this.words = words;

-        reader = new BTreeReader(words, wordsBTreeContext);
-        header = reader.getHeader(HEADER_OFFSET);
+        reader = new BTreeReader(words, wordsBTreeContext, HEADER_OFFSET);

         madvise();
     }
@ -49,7 +46,7 @@ public class IndexWordsTable implements AutoCloseable {
     }

     public long positionForWord(int wordId) {
-        long offset = reader.findEntry(header, wordId);
+        long offset = reader.findEntry(wordId);

         if (offset < 0) {
             return -1L;
@ -60,7 +57,7 @@ public class IndexWordsTable implements AutoCloseable {

     public int wordLength(int wordId) {

-        long offset = reader.findEntry(header, wordId);
+        long offset = reader.findEntry(wordId);
         if (offset < 0) {
             return -1;
         }
@ -72,7 +69,7 @@ public class IndexWordsTable implements AutoCloseable {
         words.advice(NativeIO.Advice.Random);
         words.advice0(NativeIO.Advice.WillNeed);

-        var h = reader.getHeader(HEADER_OFFSET);
+        var h = reader.getHeader();
         int length = (int)(h.dataOffsetLongs() - h.indexOffsetLongs());

         words.adviceRange(NativeIO.Advice.WillNeed, h.indexOffsetLongs(), length);
@ -80,8 +77,8 @@ public class IndexWordsTable implements AutoCloseable {
     }

     public void forEachWordsOffset(LongConsumer offsetConsumer) {
-        int n = header.numEntries();
-        long offset = header.dataOffsetLongs();
+        int n = reader.numEntries();
+        long offset = reader.getHeader().dataOffsetLongs();

         for (int i = 0; i < n; i++) {
             try {
@@ -5,21 +5,13 @@ import com.google.inject.name.Named;
import com.upserve.uppend.blobs.NativeIO;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.util.btree.BTreeReader;
-import nu.marginalia.util.btree.CachingBTreeReader;
-import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
-import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
-import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
-import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
-import java.util.Arrays;
-import java.util.stream.LongStream;

public class SearchIndex implements AutoCloseable {

@@ -27,8 +19,6 @@ public class SearchIndex implements AutoCloseable {
private final IndexWordsTable words;
public final String name;
private final RandomAccessFile wordsFile;
-private final BTreeReader bTreeReader;
-private final CachingBTreeReader cachingBTreeReader;

private final Logger logger;

@@ -49,16 +39,13 @@ public class SearchIndex implements AutoCloseable {
urls = MultimapFileLong.forReading(inUrls.toPath());
words = IndexWordsTable.ofFile(wordsFile);

-bTreeReader = new BTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
-cachingBTreeReader = new CachingBTreeReader(urls, SearchIndexConverter.urlsBTreeContext);
-
-Schedulers.io().scheduleDirect(() -> madvise(urls, bTreeReader));
+Schedulers.io().scheduleDirect(() -> madvise(urls));
}

-private void madvise(MultimapFileLong urls, BTreeReader reader) {
+private void madvise(MultimapFileLong urls) {

words.forEachWordsOffset(offset -> {
-var h = reader.getHeader(offset);
+var h = BTreeReader.createHeader(urls, offset);
long length = h.dataOffsetLongs() - h.indexOffsetLongs();

urls.adviceRange(NativeIO.Advice.WillNeed, offset, 512);
@@ -70,174 +57,16 @@ public class SearchIndex implements AutoCloseable {
}


-public long numUrls(IndexQueryCachePool pool, int wordId) {
+public long numUrls(int wordId) {
int length = words.wordLength(wordId);
if (length < 0) return 0;
if (length > 0) return length;

-return rangeForWord(pool, wordId).numEntries();
+return rangeForWord(wordId).numEntries();
}

-public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
-IndexBTreeRange range = pool.getRange(words, wordId);
-
-if (range == null) {
-range = new IndexBTreeRange(words.positionForWord(wordId));
-pool.cacheRange(words, wordId, range);
-}
-
-return range;
-}
-
-public IndexBTreeRange rangeForWord(int wordId) {
-return new IndexBTreeRange(words.positionForWord(wordId));
-}
-
-public class IndexBTreeRange {
-public final long dataOffset;
-private BTreeHeader header;
-public IndexBTreeRange(long dataOffset) {
-this.dataOffset = dataOffset;
-}
-
-public LongStream stream(int bufferSize) {
-if (dataOffset < 0) {
-return LongStream.empty();
-}
-if (header == null) {
-header = bTreeReader.getHeader(dataOffset);
-}
-
-long urlOffset = header.dataOffsetLongs();
-long endOffset = header.dataOffsetLongs() + header.numEntries();
-int stepSize = Math.min(bufferSize, header.numEntries());
-
-long[] buffer = new long[stepSize];
-
-return LongStream
-.iterate(urlOffset, i -> i< endOffset, i->i+stepSize)
-.flatMap(pos -> {
-int sz = (int)(Math.min(pos+stepSize, endOffset) - pos);
-urls.read(buffer, sz, pos);
-return Arrays.stream(buffer, 0, sz);
-});
-}
-
-public EntrySource asEntrySource() {
-return new AsEntrySource();
-}
-
-public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
-return new AsExcludeQueryFilterStep(pool);
-}
-
-
-public LongStream stream() {
-return stream(1024);
-}
-
-public boolean isPresent() {
-return dataOffset >= 0;
-}
-
-public long numEntries() {
-if (header != null) {
-return header.numEntries();
-}
-else if (dataOffset < 0) return 0L;
-else {
-header = bTreeReader.getHeader(dataOffset);
-return header.numEntries();
-}
-}
-
-public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
-if (dataOffset < 0) return false;
-
-return cachingBTreeReader.findEntry(cache, url) >= 0;
-}
-
-public boolean hasUrl(IndexQueryCachePool pool, long url) {
-if (dataOffset < 0)
-return false;
-
-CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
-
-return cachingBTreeReader.findEntry(cache, url) >= 0;
-}
-
-public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
-if (dataOffset < 0)
-return null;
-
-if (header == null) {
-header = cachingBTreeReader.getHeader(dataOffset);
-}
-
-return cachingBTreeReader.prepareCache(header);
-}
-
-class AsEntrySource implements EntrySource {
-long pos;
-final long endOffset;
-
-public SearchIndex getIndex() {
-return SearchIndex.this;
-};
-
-public AsEntrySource() {
-if (dataOffset <= 0) {
-pos = -1;
-endOffset = -1;
-return;
-}
-
-if (header == null) {
-header = bTreeReader.getHeader(dataOffset);
-}
-
-pos = header.dataOffsetLongs();
-endOffset = header.dataOffsetLongs() + header.numEntries();
-}
-
-
-@Override
-public int read(long[] buffer, int n) {
-if (pos >= endOffset) {
-return 0;
-}
-
-int rb = Math.min(n, (int)(endOffset - pos));
-urls.read(buffer, rb, pos);
-pos += rb;
-return rb;
-}
-}
-
-class AsExcludeQueryFilterStep implements QueryFilterStepIf {
-private final CachingBTreeReader.BTreeCachedIndex cache;
-
-public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
-cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
-}
-
-public SearchIndex getIndex() {
-return SearchIndex.this;
-};
-public double cost() {
-return cache.getIndexedDataSize();
-}
-
-@Override
-public boolean test(long value) {
-return !hasUrl(cache, value);
-}
-
-public String describe() {
-return "Exclude["+name+"]";
-}
-}
-
+public SearchIndexURLRange rangeForWord(int wordId) {
+return new SearchIndexURLRange(urls, words.positionForWord(wordId));
}

@Override
@@ -5,7 +5,6 @@ import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.svc.query.IndexDomainQueryFactory;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -22,31 +21,14 @@ public class SearchIndexReader implements AutoCloseable {
private final IndexDomainQueryFactory domainQueryFactory;
private final Logger logger = LoggerFactory.getLogger(getClass());

-private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
-IndexBlock.Title,
-IndexBlock.Tfidf_Top,
-IndexBlock.Tfidf_Middle,
-IndexBlock.Tfidf_Lower,
-IndexBlock.Words_1,
-IndexBlock.Words_2,
-IndexBlock.Words_4,
-IndexBlock.Words_8,
-IndexBlock.Words_16Plus,
-};
-
@Inject
public SearchIndexReader(
EnumMap<IndexBlock, SearchIndex> indices) {
this.indices = indices;

-var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
-var midIndex = indices.get(IndexBlock.Tfidf_Middle);
-var topIndex = indices.get(IndexBlock.Tfidf_Top);
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
-var siteIndex = indices.get(IndexBlock.Site);
var metaIndex = indices.get(IndexBlock.Meta);
-var topicIndex = indices.get(IndexBlock.Subjects);

var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
@@ -57,7 +39,7 @@ public class SearchIndexReader implements AutoCloseable {

queryBuilders = new EnumMap<>(IndexBlock.class);

-List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
+List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, words1, words2, words4, words8, words16);

queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices));
@@ -66,7 +48,7 @@ public class SearchIndexReader implements AutoCloseable {
queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices));

-domainQueryFactory = new IndexDomainQueryFactory(siteIndex, listOfNonNulls(topicIndex));
+domainQueryFactory = new IndexDomainQueryFactory(indices.get(IndexBlock.Words_1));
}

@SafeVarargs
@@ -75,17 +57,31 @@ public class SearchIndexReader implements AutoCloseable {
}


-public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
+public IndexQueryFactory.IndexQueryBuilder findWord(IndexBlock block, Integer quality, int wordId) {
var builder = queryBuilders.get(block);

if (builder == null)
return null;

-return builder.buildQuery(cachePool, wordId);
+if (quality == null) {
+return builder.buildQuery(wordId);
+}
+else {
+return builder.buildQuery(quality, wordId);
+}
}

-public IndexQuery findDomain(IndexQueryCachePool cachePool, int wordId) {
-return domainQueryFactory.buildQuery(cachePool, wordId);
+public IndexQueryFactory.IndexQueryBuilder findWordForDomainList(IndexBlock block, List<Integer> domains, int wordId) {
+var builder = queryBuilders.get(block);
+
+if (builder == null)
+return null;
+
+return builder.buildQuery(domains, wordId);
+}
+
+public IndexQuery findDomain(int wordId) {
+return domainQueryFactory.buildQuery(wordId);
}

@Override
@@ -96,7 +92,7 @@ public class SearchIndexReader implements AutoCloseable {
}

@SneakyThrows
-public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
+public long numHits(IndexBlock block, int word) {
IndexQueryFactory builder = queryBuilders.get(block);

if (builder == null)
@@ -104,31 +100,18 @@ public class SearchIndexReader implements AutoCloseable {

long hits = 0;
for (var index : builder.getIndicies()) {
-hits += index.numUrls(pool, word);
+hits += index.numUrls(word);
}
return hits;
}

-public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
-for (var block : indicesBySearchOrder) {
-var index = indices.get(block);
-
-if (null == index) {
-continue;
-}
-
-if (cachePool.isUrlPresent(index, searchTerm, urlId))
-return block;
-
+public long[] getMetadata(IndexBlock block, int termId, long[] ids) {
+final var index = indices.get(block);
+if (null == index) {
+return new long[ids.length];
}

-return IndexBlock.Words_16Plus;
-}
-
-public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
-final var index = indices.get(block);
-if (null == index) return false;
-
-return cachePool.isUrlPresent(index, searchTerm, urlId);
+return indices.get(block).rangeForWord(termId).getMetadata(ids);
}
}
@@ -0,0 +1,100 @@
+package nu.marginalia.wmsa.edge.index.reader;
+
+import it.unimi.dsi.fastutil.longs.LongLongImmutablePair;
+import nu.marginalia.util.btree.BTreeQueryBuffer;
+import nu.marginalia.util.btree.BTreeReader;
+import nu.marginalia.util.multimap.MultimapFileLong;
+import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EmptyEntrySource;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromBTree;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySourceFromMapRange;
+
+import javax.annotation.Nullable;
+
+import static nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags.*;
+
+public class SearchIndexURLRange {
+public final long dataOffset;
+private final MultimapFileLong urlsFile;
+
+@Nullable
+private final BTreeReader reader;
+
+public SearchIndexURLRange(MultimapFileLong urlsFile, long dataOffset) {
+this.dataOffset = dataOffset;
+this.urlsFile = urlsFile;
+
+if (dataOffset >= 0) {
+this.reader = new BTreeReader(urlsFile, SearchIndexConverter.urlsBTreeContext, dataOffset);
+} else {
+this.reader = null;
+}
+}
+
+public EntrySource asPrefixSource(long prefix, long prefixNext) {
+if (reader == null)
+return new EmptyEntrySource();
+
+LongLongImmutablePair startAndEnd = reader.getRangeForPrefix(prefix, prefixNext);
+
+if (startAndEnd.firstLong() == startAndEnd.secondLong()) {
+return new EmptyEntrySource();
+}
+
+return new EntrySourceFromMapRange(urlsFile, startAndEnd.firstLong(), startAndEnd.secondLong());
+}
+
+public EntrySource asEntrySource() {
+return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, null);
+}
+public EntrySource asQualityLimitingEntrySource(int limit) {
+return new EntrySourceFromBTree(reader, EntrySourceFromBTree.NO_MASKING, limit);
+}
+public EntrySource asDomainEntrySource() {
+return new EntrySourceFromBTree(reader, Subjects.asBit() | Site.asBit() | Title.asBit(), null);
+}
+
+public boolean isPresent() {
+return dataOffset >= 0;
+}
+
+public long numEntries() {
+if (reader == null)
+return 0L;
+
+return reader.numEntries();
+}
+
+public void retainUrls(BTreeQueryBuffer buffer) {
+if (reader != null)
+reader.retainEntries(buffer);
+}
+
+public void rejectUrls(BTreeQueryBuffer buffer) {
+if (reader != null)
+reader.rejectEntries(buffer);
+}
+
+public boolean hasUrl(long url) {
+if (reader == null)
+return false;
+
+return reader.findEntry(url) >= 0;
+}
+
+
+public long[] getMetadata(long[] urls) {
+if (reader == null) {
+return new long[urls.length];
+}
+
+return reader.queryData(urls, 1);
+}
+
+@Override
+public String toString() {
+return String.format("BTreeRange(@" + dataOffset + ", size = " + numEntries() + ")");
+}
+
+}
@@ -0,0 +1,111 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import io.prometheus.client.Histogram;
+import nu.marginalia.util.btree.BTreeQueryBuffer;
+import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.client.GsonFactory;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
+import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
+import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
+import org.apache.http.HttpStatus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import spark.HaltException;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+import java.util.OptionalInt;
+
+import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
+import static spark.Spark.halt;
+
+@Singleton
+public class EdgeIndexDomainQueryService {
+
+private final Logger logger = LoggerFactory.getLogger(getClass());
+
+private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();
+
+private final Gson gson = GsonFactory.get();
+
+private final SearchIndexes indexes;
+
+@Inject
+public EdgeIndexDomainQueryService(SearchIndexes indexes) {
+this.indexes = indexes;
+}
+
+public Object searchDomain(Request request, Response response) {
+if (indexes.getLexiconReader() == null) {
+logger.warn("Dictionary reader not yet initialized");
+halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
+}
+
+String json = request.body();
+EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
+
+try {
+return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
+}
+catch (HaltException ex) {
+logger.warn("Halt", ex);
+throw ex;
+}
+catch (Exception ex) {
+logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
+logger.info("Error", ex);
+Spark.halt(500, "Error");
+return null;
+}
+}
+
+public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
+
+final OptionalInt wordId = lookUpWord(specsSet.keyword);
+final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();
+
+final IndexSearchBudget budget = new IndexSearchBudget(50);
+
+if (wordId.isEmpty()) {
+return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
+}
+
+BTreeQueryBuffer buffer = new BTreeQueryBuffer(512);
+
+for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {
+
+final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);
+var query = indexes.getBucket(bucket).getDomainQuery(wordId.getAsInt(), localFilter);
+
+while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
+query.getMoreResults(buffer);
+
+for (int i = 0; i < buffer.end && urlIds.size() < specsSet.maxResults; i++) {
+long result = buffer.data[i];
+if (localFilter.test(result)) {
+urlIds.add((int) (result & 0xFFFF_FFFFL));
+}
+}
+}
+}
+
+return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
+}
+
+private OptionalInt lookUpWord(String s) {
+int ret = indexes.getLexiconReader().get(s);
+if (ret == DictionaryHashMap.NO_VALUE) {
+return OptionalInt.empty();
+}
+return OptionalInt.of(ret);
+}
+
+}
@@ -5,6 +5,7 @@ import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
@@ -21,7 +22,6 @@ import spark.Request;
import spark.Response;

import java.util.Arrays;
-import java.util.List;

@Singleton
public class EdgeIndexLexiconService {
@@ -35,6 +35,11 @@ public class EdgeIndexLexiconService {
this.keywordLexicon = servicesFactory.getKeywordLexicon();
}

+public EdgeIndexLexiconService(SearchIndexes indexes, KeywordLexicon lexicon) {
+this.indexes = indexes;
+this.keywordLexicon = lexicon;
+}
+
public Object getWordId(Request request, Response response) {
final String word = request.splat()[0];

@@ -73,31 +78,37 @@ public class EdgeIndexLexiconService {
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
IndexPutKeywordsReq.WordSet words, int idx
) {
-SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);

+SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
IndexBlock block = IndexBlock.values()[words.getIndex()];

-for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
-var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
+var wordArray = words.getWordsList().toArray(String[]::new);
+var metaArray = words.getMetaList().stream().mapToLong(Long::valueOf).toArray();
+
+DocumentKeywords documentKeywords = new DocumentKeywords(block, wordArray, metaArray);
+for (var chunk : ListChunker.chopList(documentKeywords, SearchIndexJournalEntry.MAX_LENGTH)) {
+var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk.keywords(), chunk.metadata()));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

indexWriter.put(header, entry);
}
}

-private long[] getOrInsertWordIds(List<String> words) {
-long[] ids = new long[words.size()];
+private long[] getOrInsertWordIds(String[] words, long[] meta) {
+long[] ids = new long[words.length*2];
int putIdx = 0;

-for (String word : words) {
+for (int i = 0; i < words.length; i++) {
+String word = words[i];
+
long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
+ids[putIdx++] = meta[i];
}
}

-if (putIdx != words.size()) {
+if (putIdx != words.length*2) {
ids = Arrays.copyOf(ids, putIdx);
}
return ids;
@@ -7,22 +7,23 @@ import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram;
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+import it.unimi.dsi.fastutil.ints.IntComparator;
+import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.longs.LongAVLTreeSet;
+import nu.marginalia.util.btree.BTreeQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
-import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
-import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
+import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
-import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.*;
-import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
-import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -36,7 +37,6 @@ import java.util.function.LongPredicate;
import java.util.stream.Collectors;

import static java.util.Comparator.comparing;
-import static nu.marginalia.wmsa.edge.index.EdgeIndexService.DYNAMIC_BUCKET_LENGTH;
import static spark.Spark.halt;

@Singleton
@@ -50,7 +50,6 @@ public class EdgeIndexQueryService {

private static final Gauge wmsa_edge_index_query_cost = Gauge.build().name("wmsa_edge_index_query_cost").help("-").register();
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();
-private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(25/1000., 25/1000., 15).help("-").register();

private final Gson gson = GsonFactory.get();

@@ -61,30 +60,6 @@ public class EdgeIndexQueryService {
this.indexes = indexes;
}

-public Object searchDomain(Request request, Response response) {
-if (indexes.getLexiconReader() == null) {
-logger.warn("Dictionary reader not yet initialized");
-halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
-}
-
-String json = request.body();
-EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
-
-try {
-return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
-}
-catch (HaltException ex) {
-logger.warn("Halt", ex);
-throw ex;
-}
-catch (Exception ex) {
-logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
-logger.info("Error", ex);
-Spark.halt(500, "Error");
-return null;
-}
-}
-
public Object search(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
@@ -94,6 +69,7 @@ public class EdgeIndexQueryService {
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);

+
try {
return wmsa_edge_index_query_time.time(() -> query(specsSet));
}
@@ -117,51 +93,20 @@ public class EdgeIndexQueryService {

wmsa_edge_index_query_cost.set(searchQuery.getDataCost());

+if (!searchQuery.hasTimeLeft()) {
+wmsa_edge_index_query_timeouts.inc();
+}
+
return new EdgeSearchResultSet(results);
}

-public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
-
-final OptionalInt wordId = lookUpWord(specsSet.keyword);
-
-final EdgeIdList<EdgeUrl> urlIds = new EdgeIdList<>();
-
-final IndexQueryCachePool pool = new IndexQueryCachePool();
-final IndexSearchBudget budget = new IndexSearchBudget(50);
-
-if (wordId.isEmpty()) {
-
-return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
-}
-
-for (int bucket = 0; budget.hasTimeLeft() && bucket < DYNAMIC_BUCKET_LENGTH+1; bucket++) {
-
-final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(1);
-
-var query = indexes.getBucket(bucket).getDomainQuery(pool, wordId.getAsInt(), localFilter);
-long[] buffer = new long[512];
-
-while (query.hasMore() && urlIds.size() < specsSet.maxResults) {
-int cnt = query.getMoreResults(buffer, budget);
-for (int i = 0; i < cnt && urlIds.size() < specsSet.maxResults; i++) {
-long result = buffer[i];
-if (localFilter.test(result)) {
-urlIds.add((int) (result & 0xFFFF_FFFFL));
-}
-}
-}
-}
-
-return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
-}
-
private class SearchQuery {
private final int fetchSize;
private final TIntHashSet seenResults;
private final EdgeSearchSpecification specsSet;
private final IndexSearchBudget budget;
-private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
+private final Integer qualityLimit;
+private final Integer rankLimit;
private long dataCost = 0;

public SearchQuery(EdgeSearchSpecification specsSet) {
@@ -169,6 +114,8 @@ public class EdgeIndexQueryService {
this.budget = new IndexSearchBudget(specsSet.timeoutMs);
this.fetchSize = specsSet.fetchSize;
this.seenResults = new TIntHashSet(fetchSize, 0.5f);
+this.qualityLimit = specsSet.quality;
+this.rankLimit = specsSet.rank;
}

private List<EdgeSearchResultItem> execute() {
@@ -178,32 +125,31 @@ public class EdgeIndexQueryService {
results.addAll(performSearch(sq));
}

+final SearchTermEvaluator evaluator = new SearchTermEvaluator(specsSet, results);
for (var result : results) {
-addResultScores(result);
+evaluator.addResultScores(result);
}

-if (!budget.hasTimeLeft()) {
-wmsa_edge_index_query_timeouts.inc();
-}
+return createResultList(results);
+}
+
+private List<EdgeSearchResultItem> createResultList(Set<EdgeSearchResultItem> results) {

var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);

-if (WmsaHome.isDebug()) {
-cachePool.printSummary(logger);
-}
-cachePool.clear();
-
List<EdgeSearchResultItem> resultList = results.stream()
.sorted(
comparing(EdgeSearchResultItem::getScore)
.thenComparing(EdgeSearchResultItem::getRanking)
.thenComparing(EdgeSearchResultItem::getUrlIdInt)
)
.filter(domainCountFilter::test)
.collect(Collectors.toList());

if (resultList.size() > specsSet.getLimitTotal()) {
+// This can't be made a stream limit() operation because we need domainCountFilter
+// to run over the entire list to provide accurate statistics
+
resultList.subList(specsSet.getLimitTotal(), resultList.size()).clear();
}

@@ -219,16 +165,20 @@ public class EdgeIndexQueryService {
{

final List<EdgeSearchResultItem> results = new ArrayList<>(fetchSize);
-final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
+final SearchTerms searchTerms = getSearchTerms(sq);

-if (searchTerms.isEmpty())
+if (searchTerms.isEmpty()) {
return Collections.emptyList();
+}
+
+final BTreeQueryBuffer buffer = new BTreeQueryBuffer(fetchSize);

for (int indexBucket : specsSet.buckets) {
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);

if (!budget.hasTimeLeft()) {
-logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}", indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
+logger.info("Query timed out, omitting {}:{} for query {}, ({}), -{}",
+indexBucket, sq.block, sq.searchTermsInclude, sq.searchTermsAdvice, sq.searchTermsExclude);
continue;

}
@@ -237,20 +187,22 @@ public class EdgeIndexQueryService {
break;
}

-IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
-long[] buf = new long[fetchSize];
+IndexQueryParams queryParams = new IndexQueryParams(sq.block, searchTerms, qualityLimit, rankLimit, specsSet.domains);
+IndexQuery query = getQuery(indexBucket, localFilter::filterRawValue, queryParams);

while (query.hasMore() && results.size() < fetchSize && budget.hasTimeLeft()) {
-int cnt = query.getMoreResults(buf, budget);
+buffer.reset();
+query.getMoreResults(buffer);

-for (int i = 0; i < cnt && results.size() < fetchSize; i++) {
-final long id = buf[i];
+for (int i = 0; i < buffer.size() && results.size() < fetchSize; i++) {
+final long id = buffer.data[i];

if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
continue;
}

-results.add(new EdgeSearchResultItem(indexBucket, id));
+results.add(new EdgeSearchResultItem(indexBucket, sq.block, id));
}
}

@@ -261,40 +213,127 @@ public class EdgeIndexQueryService {
return results;
}

-private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
-LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
+private IndexQuery getQuery(int bucket, LongPredicate filter, IndexQueryParams params) {

if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return new IndexQuery(Collections.emptyList());
}

-return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
+return indexes.getBucket(bucket).getQuery(filter, params);
}

-private void addResultScores(EdgeSearchResultItem searchResult) {
+public boolean hasTimeLeft() {
+return budget.hasTimeLeft();
+}
+
+private record IndexAndBucket(IndexBlock block, int bucket) {}
+
+public long getDataCost() {
+return dataCost;
+}
+
+record ResultTerm (int bucket, int termId, long combinedUrlId) {}
+}
+
+public class SearchTermEvaluator {
+private static final EdgePageWordMetadata blankMetadata = new EdgePageWordMetadata(EdgePageWordMetadata.emptyValue());
+
+private final Map<SearchQuery.ResultTerm, EdgePageWordMetadata> termData = new HashMap<>(16);
+
+private final List<List<String>> searchTermVariants;
+
+public SearchTermEvaluator(EdgeSearchSpecification specsSet, Set<EdgeSearchResultItem> results) {
+this.searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+
+final int[] termIdsAll = getIncludeTermIds(specsSet);
+
+Map<SearchQuery.IndexAndBucket, LongAVLTreeSet> resultIdsByBucket = new HashMap<>(7);
+
+for (int termId : termIdsAll) {
+
+for (var result: results) {
+resultIdsByBucket
+.computeIfAbsent(new SearchQuery.IndexAndBucket(result.block, result.bucketId),
+id -> new LongAVLTreeSet())
+.add(result.combinedId);
+}
+
+resultIdsByBucket.forEach((indexAndBucket, resultIds) ->
+loadMetadata(termId, indexAndBucket.bucket, indexAndBucket.block, resultIds));
+
+resultIdsByBucket.clear();
+}
+}
+
+private int[] getIncludeTermIds(EdgeSearchSpecification specsSet) {
+
final var reader = Objects.requireNonNull(indexes.getLexiconReader());

-List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
-
-// Memoize calls to getTermData, as they're somewhat expensive and highly redundant
-Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
+final List<String> terms = specsSet.allIncludeSearchTerms();
+final IntList ret = new IntArrayList(terms.size());
+
+for (var term : terms) {
+int id = reader.get(term);
+
+if (id >= 0)
+ret.add(id);
+}
+
+return ret.toIntArray();
+}
+
+private void loadMetadata(int termId, int bucket, IndexBlock indexBlock,
+LongAVLTreeSet docIdsMissingMetadata)
+{
+EdgeIndexBucket index = indexes.getBucket(bucket);
+
+if (docIdsMissingMetadata.isEmpty())
+return;
+
+
+long[] ids = docIdsMissingMetadata.toLongArray();
+long[] metadata = index.getMetadata(indexBlock, termId, ids);
+
+for (int i = 0; i < metadata.length; i++) {
+if (metadata[i] == 0L)
+continue;
+
+termData.put(
+new SearchQuery.ResultTerm(bucket, termId, ids[i]),
+new EdgePageWordMetadata(metadata[i])
+);
+
+docIdsMissingMetadata.remove(ids[i]);
+}
+}
+
+public void addResultScores(EdgeSearchResultItem searchResult) {
+final var reader = Objects.requireNonNull(indexes.getLexiconReader());
+
double bestScore = 0;

for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
double setScore = 0;
int setSize = 0;
-for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
+var termList = searchTermVariants.get(searchTermListIdx);
+
+for (int termIdx = 0; termIdx < termList.size(); termIdx++) {
+String searchTerm = termList.get(termIdx);
+
final int termId = reader.get(searchTerm);

-ResultTermData data = termMetadata.computeIfAbsent(
-new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
-
-var score = data.asScore(searchTermListIdx, searchTerm);
+var key = new SearchQuery.ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId());
+var metadata = termData.getOrDefault(key, blankMetadata);
+
+EdgeSearchResultKeywordScore score = new EdgeSearchResultKeywordScore(searchTermListIdx, searchTerm, metadata);
+
searchResult.scores.add(score);
-setScore += score.value();
+setScore += score.termValue();
+if (termIdx == 0) {
+setScore += score.documentValue();
+}
+
setSize++;
}
bestScore = Math.min(bestScore, setScore/setSize);
@@ -303,64 +342,27 @@ public class EdgeIndexQueryService {
searchResult.setScore(bestScore);
}

-private ResultTermData getTermData(ResultTerm resultTerm) {
-final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
-final int termId = resultTerm.termId;
-final long combinedUrlId = resultTerm.combinedUrlId;
-
-return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
-bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
-);
-}
-
-public long getDataCost() {
-return dataCost;
-}
-
-record ResultTerm (int bucket, int termId, long combinedUrlId) {}
-record ResultTermData (IndexBlock index,
-boolean title,
-boolean link,
-boolean site,
-boolean subject,
-boolean name,
-boolean high,
-boolean mid,
-boolean low
-) {
-public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
-return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
-}
-}
}

-private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
-final List<Integer> excludes = new ArrayList<>();
-final List<Integer> includes = new ArrayList<>();
+private SearchTerms getSearchTerms(EdgeSearchSubquery request) {
+final IntList excludes = new IntArrayList();
+final IntList includes = new IntArrayList();

for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
-return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
+return new SearchTerms();
}
includes.add(word.getAsInt());
}


for (var advice : request.searchTermsAdvice) {
var word = lookUpWord(advice);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + advice);
-return new EdgeIndexSearchTerms(Collections.emptyList(), Collections.emptyList());
+return new SearchTerms();
}
includes.add(word.getAsInt());
}
@@ -369,7 +371,26 @@ public class EdgeIndexQueryService {
lookUpWord(exclude).ifPresent(excludes::add);
}

-return new EdgeIndexSearchTerms(includes, excludes);
+return new SearchTerms(includes, excludes);
+}
+
+public record SearchTerms(IntList includes, IntList excludes) {
+public SearchTerms() {
+this(IntList.of(), IntList.of());
+}
+
+public boolean isEmpty() {
+return includes.isEmpty();
+}
+
+public int[] sortedDistinctIncludes(IntComparator comparator) {
+if (includes.isEmpty())
+return includes.toIntArray();
+
+IntList list = new IntArrayList(new IntOpenHashSet(includes));
+list.sort(comparator);
+return list.toIntArray();
+}
}
Some files were not shown because too many files have changed in this diff.