Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

A tiny release between crawls (#138)

Bringing online new ranking changes

Co-authored-by: Viktor Lofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/138

parent 467bf566a9
commit fa9b4e4352
@@ -106,8 +106,9 @@ dependencies {

 implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'

-implementation 'com.syncthemall:boilerpipe:1.2.2'
 implementation 'com.github.luben:zstd-jni:1.5.2-2'
+implementation 'org.lz4:lz4-java:1.8.0'

 implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
 implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'

@@ -126,7 +127,6 @@ dependencies {
 implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'

 implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'

 implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'

 testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
@@ -2,11 +2,13 @@ package nu.marginalia.util;

 public class BrailleBlockPunchCards {

+private static final char brailleBlockBase = '\u2800';
+
 public static String printBits(int val, int bits) {
 StringBuilder builder = new StringBuilder();

 for (int b = 0; b < bits; b+=8, val>>>=8) {
-builder.append((char)('\u2800'+bin2brail(val)));
+builder.append((char)(brailleBlockBase + bin2brail(val)));
 }

 return builder.toString();
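For illustration only (not part of the commit), a minimal sketch of how the helper above renders a bit pattern; the input value is an arbitrary example:

    // printBits maps each group of 8 bits onto a braille block glyph (U+2800..U+28FF).
    int exampleBits = 0b1010_0001;                                  // arbitrary example value
    String glyphs = BrailleBlockPunchCards.printBits(exampleBits, 8);
    System.out.println(glyphs);                                     // one braille character per 8 bits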
@@ -42,7 +42,7 @@ public abstract class ParallelPipe<INPUT,INTERMEDIATE> {
 @SneakyThrows
 private void runProcessThread() {
 while (expectingInput || !inputs.isEmpty()) {
-var in = inputs.poll(1, TimeUnit.SECONDS);
+var in = inputs.poll(10, TimeUnit.SECONDS);

 if (in != null) {
 try {
@@ -108,7 +108,6 @@ public class RandomWriteFunnel implements AutoCloseable {

 private void eval(ByteBuffer dest) throws IOException {
 flushBuffer();
-channel.force(false);

 channel.position(0);
 buffer.clear();
@@ -1,20 +1,33 @@
 package nu.marginalia.util;

+import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
+
+import java.util.Arrays;
 import java.util.HashMap;

 public class StringPool {
-private final HashMap<String, String> words;

-public StringPool() {
-this.words = new HashMap<>(1000);
-}
+private final HashMap<String, String> words;
+private final Object2LongOpenHashMap<String> ages;
+private final int maxCap;
+
+long idx;
+
+private StringPool(int capacity, int maxCap) {
+this.ages = new Object2LongOpenHashMap<>(capacity);
+this.words = new HashMap<>(capacity);
+this.maxCap = maxCap;
+}

-public StringPool(int capacity) {
-words = new HashMap<>(capacity);
+public static StringPool create(int capacity) {
+return new StringPool(capacity, capacity * 10);
 }

 public String internalize(String str) {
+prune();
+
 final String ret = words.putIfAbsent(str, str);
+ages.put(ret, idx++);

 if (null == ret)
 return str;
@@ -22,6 +35,37 @@ public class StringPool {
 return ret;
 }

+public String[] internalize(String[] str) {
+
+for (int i = 0; i < str.length; i++) {
+str[i] = internalize(str[i]);
+}
+
+return str;
+}
+
+public void prune() {
+
+if (words.size() < maxCap)
+return;
+
+long[] ageValues = ages.values().toLongArray();
+Arrays.sort(ageValues);
+
+long cutoff = ageValues[ageValues.length - maxCap / 10];
+
+words.clear();
+ages.forEach((word, cnt) -> {
+if (cnt >= cutoff) {
+words.put(word, word);
+}
+});
+ages.clear();
+words.forEach((w,w2) -> {
+ages.put(w, idx);
+});
+}
+
 public void flush() {
 words.clear();
 }
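For orientation, a minimal usage sketch of the reworked StringPool (not part of the commit; the constructor is now private, create() sets maxCap to ten times the capacity, and internalize() prunes rarely-seen entries once the pool exceeds maxCap; the capacity figure below is arbitrary):

    StringPool pool = StringPool.create(10_000);
    String a = pool.internalize("example");   // stored and returned
    String b = pool.internalize("example");   // returns the pooled instance
    assert a.equals(b);
    pool.flush();                             // drops all pooled strings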
@@ -18,22 +18,14 @@ public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
 }
 }

-default void increment(long pos) {
-set(pos, get(pos) + 1);
-}
-
 default void swap(long pos1, long pos2) {
 int tmp = get(pos1);
 set(pos1, get(pos2));
 set(pos2, tmp);
 }

-default void swapn(int n, long pos1, long pos2) {
-for (int i = 0; i < n; i++) {
-int tmp = get(pos1+i);
-set(pos1+i, get(pos2+i));
-set(pos2+i, tmp);
-}
+default void increment(long pos) {
+set(pos, get(pos) + 1);
 }

 default int getAndIncrement(long pos) {
@@ -47,6 +39,7 @@ public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
 set(start+i, buffer.get(i + bufferStart));
 }
 }

 default void get(long start, long end, IntBuffer buffer, int bufferStart) {
 for (int i = 0; i < (end-start); i++) {
 buffer.put(i + bufferStart, get(start + i));
@@ -28,6 +28,7 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
 set(pos2, tmp);
 }

+/** Behavior not defined for overlapping ranges */
 default void swapn(int n, long pos1, long pos2) {
 for (int i = 0; i < n; i++) {
 long tmp = get(pos1+i);
@@ -2,6 +2,7 @@ package nu.marginalia.util.array.delegate;

 import com.upserve.uppend.blobs.NativeIO;
 import nu.marginalia.util.array.IntArray;
+import nu.marginalia.util.array.algo.SortingContext;
 import nu.marginalia.util.array.buffer.IntQueryBuffer;
 import nu.marginalia.util.array.functional.IntBinaryIOOperation;
 import nu.marginalia.util.array.functional.IntIOTransformer;
@@ -61,6 +62,16 @@ public class ShiftedIntArray implements IntArray {
 delegate.get(shift+start, shift+end, buffer);
 }

+@Override
+public int getAndIncrement(long pos) {
+return delegate.getAndIncrement(shift + pos);
+}
+
+@Override
+public void fill(long start, long end, int val) {
+delegate.fill(start + shift, end + shift, val);
+}
+
 @Override
 public long size() {
 return size;
@@ -97,6 +108,12 @@ public class ShiftedIntArray implements IntArray {
 return delegate.isSorted(shift + start, shift + end);
 }

+
+public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
+delegate.sortLargeSpan(ctx, start, end);
+}
+
+
 public long search(int key) {
 if (size < 128) {
 return linearSearch(key);
@@ -3,6 +3,7 @@ package nu.marginalia.util.array.delegate;
 import com.upserve.uppend.blobs.NativeIO;
 import nu.marginalia.util.array.LongArray;
 import nu.marginalia.util.array.algo.LongArraySearch;
+import nu.marginalia.util.array.algo.SortingContext;
 import nu.marginalia.util.array.buffer.LongQueryBuffer;
 import nu.marginalia.util.array.functional.LongBinaryIOOperation;
 import nu.marginalia.util.array.functional.LongIOTransformer;
@@ -62,6 +63,16 @@ public class ShiftedLongArray implements LongArray {
 delegate.get(shift+start, shift+end, buffer);
 }

+@Override
+public long getAndIncrement(long pos) {
+return delegate.getAndIncrement(shift + pos);
+}
+
+@Override
+public void fill(long start, long end, long val) {
+delegate.fill(start + shift, end + shift, val);
+}
+
 @Override
 public long size() {
 return size;
@@ -106,6 +117,14 @@ public class ShiftedLongArray implements LongArray {
 return delegate.isSortedN(sz, shift + start, shift + end);
 }

+public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
+delegate.sortLargeSpanN(ctx, sz, start, end);
+}
+
+public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
+delegate.sortLargeSpan(ctx, start, end);
+}
+
 public long searchN(int sz, long key) {
 if (size < 128) {
 return linearSearchN(sz, key);
@@ -2,6 +2,7 @@ package nu.marginalia.util.array.page;

 import com.upserve.uppend.blobs.NativeIO;
 import nu.marginalia.util.array.IntArray;
+import nu.marginalia.util.array.algo.SortingContext;
 import nu.marginalia.util.array.buffer.IntQueryBuffer;
 import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate;
 import nu.marginalia.util.array.functional.IntBinaryIOOperation;
@@ -113,6 +114,11 @@ public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer>
 }
 }

+@Override
+public int getAndIncrement(long pos) {
+return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos));
+}
+
 @Override
 public void get(long start, long end, int[] buffer) {
 if (partitioningScheme.isSamePage(start, end)) {
@@ -272,6 +278,22 @@ public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer>
 }
 }

+
+public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
+if (partitioningScheme.isSamePage(start, end)) {
+int sOff = partitioningScheme.getOffset(start);
+int eOff = partitioningScheme.getEndOffset(start, end);
+
+if (eOff > sOff) {
+pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff);
+}
+}
+else {
+defaults.sortLargeSpan(ctx, start, end);
+}
+}
+
+
 public void write(Path fileName) throws IOException {
 try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
 for (int i = 0; i < pages.length; i++) {
@@ -2,6 +2,7 @@ package nu.marginalia.util.array.page;

 import com.upserve.uppend.blobs.NativeIO;
 import nu.marginalia.util.array.LongArray;
+import nu.marginalia.util.array.algo.SortingContext;
 import nu.marginalia.util.array.buffer.LongQueryBuffer;
 import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate;
 import nu.marginalia.util.array.functional.LongBinaryIOOperation;
@@ -118,6 +119,11 @@ public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuff
 }
 }

+@Override
+public long getAndIncrement(long pos) {
+return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos));
+}
+
 @Override
 public void set(long pos, long value) {
 int page = partitioningScheme.getPage(pos);
@@ -439,6 +445,33 @@ public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuff
 defaults.mergeSortN(sz, start, end, tempDir);
 }
 }
+public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
+if (partitioningScheme.isSamePage(start, end)) {
+int sOff = partitioningScheme.getOffset(start);
+int eOff = partitioningScheme.getEndOffset(start, end);
+
+if (eOff > sOff) {
+pages[partitioningScheme.getPage(start)].sortLargeSpanN(ctx, sz, sOff, eOff);
+}
+}
+else {
+defaults.sortLargeSpanN(ctx, sz, start, end);
+}
+}
+
+public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
+if (partitioningScheme.isSamePage(start, end)) {
+int sOff = partitioningScheme.getOffset(start);
+int eOff = partitioningScheme.getEndOffset(start, end);
+
+if (eOff > sOff) {
+pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff);
+}
+}
+else {
+defaults.sortLargeSpan(ctx, start, end);
+}
+}

 public void write(Path fileName) throws IOException {
 try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
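For context, a hedged sketch (not from the commit) of why getAndIncrement() and fill() are threaded through the shifted and paged array delegates above: they allow histogram-style counting without a read-then-write pair at the call site. The allocate() factory name below is an assumption; check the IntArray interface for the actual constructor in use:

    IntArray counts = IntArray.allocate(1024);    // assumed factory method, illustrative only
    counts.fill(0, counts.size(), 0);             // zero the whole range first
    long bucket = 17;                             // arbitrary example bucket
    int before = counts.getAndIncrement(bucket);  // returns the value prior to the increment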
@@ -0,0 +1,17 @@
+package nu.marginalia.util.bigstring;
+
+public interface BigString {
+static BigString encode(String stringValue) {
+if (stringValue.length() > 64) {
+return new CompressedBigString(stringValue);
+}
+else {
+return new PlainBigString(stringValue);
+}
+}
+String decode();
+
+byte[] getBytes();
+
+int length();
+}
@@ -0,0 +1,39 @@
+package nu.marginalia.util.bigstring;
+
+import net.jpountz.lz4.LZ4Compressor;
+import net.jpountz.lz4.LZ4Factory;
+import net.jpountz.lz4.LZ4FastDecompressor;
+
+import java.nio.charset.StandardCharsets;
+
+public class CompressedBigString implements BigString {
+private final int originalSize;
+private final int length;
+private final byte[] encoded;
+
+private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();;
+private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
+private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();
+
+public CompressedBigString(String stringValue) {
+byte[] byteValue = stringValue.getBytes(StandardCharsets.UTF_16);
+originalSize = byteValue.length;
+encoded = compressor.compress(byteValue);
+length = stringValue.length();
+}
+
+@Override
+public String decode() {
+return new String(getBytes(), StandardCharsets.UTF_16);
+}
+
+@Override
+public byte[] getBytes() {
+return decompressor.decompress(encoded, originalSize);
+}
+
+@Override
+public int length() {
+return length;
+}
+}
@@ -0,0 +1,26 @@
+package nu.marginalia.util.bigstring;
+
+import java.nio.charset.StandardCharsets;
+
+public class PlainBigString implements BigString {
+private final String value;
+
+public PlainBigString(String value) {
+this.value = value;
+}
+
+@Override
+public String decode() {
+return value;
+}
+
+@Override
+public byte[] getBytes() {
+return value.getBytes(StandardCharsets.UTF_8);
+}
+
+@Override
+public int length() {
+return value.length();
+}
+}
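A small usage sketch of the new BigString types added above (not part of the commit; the sample text is arbitrary). BigString.encode() picks PlainBigString for short strings and CompressedBigString, which LZ4-compresses the UTF-16 bytes, for anything longer than 64 characters:

    String documentBody = "some crawled document text ...";   // arbitrary example
    BigString stored = BigString.encode(documentBody);
    int chars = stored.length();            // original length, cheap for both variants
    String roundTripped = stored.decode();  // decompresses on demand for the compressed variant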
@@ -1,8 +1,5 @@
 package nu.marginalia.util.dict;

-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
 import java.nio.ByteBuffer;
 import java.nio.LongBuffer;
 import java.util.ArrayList;
@@ -10,7 +7,6 @@ import java.util.ArrayList;
 public class DictionaryData {

 private final int DICTIONARY_BANK_SIZE;
-private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);

 private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);

@@ -1,6 +1,17 @@
 package nu.marginalia.util.dict;

 public interface DictionaryMap {
+int NO_VALUE = Integer.MIN_VALUE;
+
+static DictionaryMap create() {
+if (Boolean.getBoolean("small-ram")) {
+return new OnHeapDictionaryMap();
+}
+else {
+return new OffHeapDictionaryHashMap(1L << 31);
+}
+}
+
 int size();

 int put(long key);
@@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
 * Spiritually influenced by GNU Trove's hash maps
 * LGPL 2.1
 */
-public class DictionaryHashMap implements DictionaryMap {
-private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
+public class OffHeapDictionaryHashMap implements DictionaryMap {
+private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class);
 private static final Gauge probe_count_metrics
 = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
 .register();

 private final int bufferCount;
 private final IntBuffer[] buffers;
-public static final int NO_VALUE = Integer.MIN_VALUE;

 private final DictionaryData dictionaryData;

@@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap {

 private final AtomicInteger sz = new AtomicInteger(0);

-public DictionaryHashMap(long sizeMemory) {
+public OffHeapDictionaryHashMap(long sizeMemory) {
 final int intSize = 4;

 bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));
@@ -0,0 +1,23 @@
+package nu.marginalia.util.dict;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+
+public class OnHeapDictionaryMap implements DictionaryMap {
+private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f);
+
+@Override
+public int size() {
+return entries.size();
+}
+
+@Override
+public int put(long key) {
+entries.putIfAbsent(key, entries.size());
+return get(key);
+}
+
+@Override
+public int get(long key) {
+return entries.getOrDefault(key, NO_VALUE);
+}
+}
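A hedged sketch of how the new factory is meant to be used (not part of the commit; the key is an arbitrary example). DictionaryMap.create() returns the on-heap map when the JVM is started with -Dsmall-ram=true, and the 2^31-slot off-heap hash map otherwise:

    DictionaryMap map = DictionaryMap.create();
    int id = map.put(0xCAFEBABEL);          // arbitrary example key
    int sameId = map.get(0xCAFEBABEL);      // NO_VALUE (Integer.MIN_VALUE) if absent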
@@ -19,9 +19,6 @@ public class GuardedRegexFactory {
 public static GuardedRegex contains(String substring, @Language("RegExp") String regex) {
 return new GuardedRegexContains(substring, regex);
 }
-public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) {
-return new GuardedRegexMinLength(minLength, regex);
-}

 private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex {
 public GuardedRegexContains(String contains, String pattern) {
@@ -32,15 +29,6 @@ public class GuardedRegexFactory {
 return s.contains(contains) && pattern.matcher(s).find();
 }
 }
-private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex {
-public GuardedRegexMinLength(int minLength, String pattern) {
-this(minLength, Pattern.compile(pattern));
-}
-
-public boolean test(String s) {
-return s.length() >= minLength && pattern.matcher(s).find();
-}
-}
 private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex {
 public GuardedRegexStartsWith(String start, String pattern) {
 this(start, Pattern.compile(pattern));
@@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.KeywordCounter;
 import nu.marginalia.util.language.processing.KeywordExtractor;
 import nu.marginalia.util.language.processing.NameCounter;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
@@ -68,9 +68,6 @@ public class DocumentDebugger {

 Set<String> reps = new HashSet<>();

-// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
-// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
-
 try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

 for (var sent : languageData.titleSentences) {
@@ -1,5 +1,7 @@
 package nu.marginalia.util.language;

+import org.apache.commons.lang3.StringUtils;
+
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -123,14 +125,24 @@ public class WordPatterns {
 if (!filter(s)) {
 return true;
 }
-if (isTopWord(s)) {
+
+String sLc;
+if (StringUtils.isAllLowerCase(s)) {
+sLc = s;
+}
+else {
+sLc = s.toLowerCase();
+}
+
+if (isTopWord(sLc)) {
 return true;
 }

 return false;
 }

-public static boolean isTopWord(String s) {
-return topWords.contains(s.toLowerCase());
+public static boolean isTopWord(String strLowerCase) {
+return topWords.contains(strLowerCase);
 }

 }
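The isTopWord() change above shifts the lowercasing burden to the caller; a brief sketch of the new contract (illustrative only, the example word is arbitrary), mirroring the caller shown in this hunk:

    String word = "Linux";                                  // arbitrary example
    String lc = StringUtils.isAllLowerCase(word) ? word : word.toLowerCase();
    boolean top = WordPatterns.isTopWord(lc);               // argument must already be lower case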
@@ -35,9 +35,7 @@ public class DocumentKeywordExtractor {

 List<WordRep> titleWords = extractTitleWords(documentLanguageData);
 List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
-List<WordRep> subjects = subjectCounter.count(documentLanguageData);
+List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);

-tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
-
 for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
 for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
@@ -59,11 +57,12 @@ public class DocumentKeywordExtractor {

 getWordPositions(keywordMetadata, documentLanguageData);

-List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
-List<WordRep> subjects = subjectCounter.count(documentLanguageData);

 List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);

+List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
+List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
+
 for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
 for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
 for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
@@ -94,7 +93,7 @@ public class DocumentKeywordExtractor {
 ret.merge(word.stemmed(), posBit, this::bitwiseOr);
 }

-for (var span : keywordExtractor.getNames(sent)) {
+for (var span : keywordExtractor.getProperNames(sent)) {
 ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
 }
 }
@@ -108,7 +107,7 @@ public class DocumentKeywordExtractor {
 ret.merge(word.stemmed(), posBit, this::bitwiseOr);
 }

-for (var span : keywordExtractor.getNames(sent)) {
+for (var span : keywordExtractor.getProperNames(sent)) {
 ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
 }

@@ -155,16 +154,16 @@ public class DocumentKeywordExtractor {
 if (!word.isStopWord()) {
 String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
 if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
-wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
+wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
 }
 }
 }

-for (var names : keywordExtractor.getNames(sent)) {
+for (var names : keywordExtractor.getProperNames(sent)) {
 var rep = new WordRep(sent, names);
 String w = AsciiFlattener.flattenUnicode(rep.word);

-wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
+wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
 }
 }

@@ -218,7 +217,7 @@ public class DocumentKeywordExtractor {
 continue;
 }

-wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
+wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
 }
 }

@@ -43,8 +43,8 @@ public class KeywordCounter {

 counts.mergeInt(rep.stemmed, 1, Integer::sum);

-var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
-if (instanceSet.size() < 250) {
+var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(16));
+if (instanceSet.size() < 4) {
 instanceSet.add(rep);
 }
 }
@@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator;

 import java.lang.ref.SoftReference;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
-import java.util.Set;

 public class KeywordExtractor {

-public WordSpan[] getNames(DocumentSentence sentence) {
-List<WordSpan> spans = new ArrayList<>(sentence.length());
+public WordSpan[] getProperNames(DocumentSentence sentence) {
+List<WordSpan> spans = new ArrayList<>(2 * sentence.length());

 for (int i = 0; i < sentence.length(); i++) {
 if (isProperNoun(i, sentence))
@@ -57,27 +55,73 @@ public class KeywordExtractor {
 return spans.toArray(WordSpan[]::new);
 }

-public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
-if (sentence.keywords != null) {
-return sentence.keywords.get();
-}
-List<WordSpan> spans = new ArrayList<>(sentence.length());
-
-Set<String> topWords = Collections.emptySet();
+public WordSpan[] getNouns(DocumentSentence sentence) {
+List<WordSpan> spans = new ArrayList<>(2 * sentence.length());

 for (int i = 0; i < sentence.length(); i++) {
-if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
+if (isNoun(i, sentence))
 spans.add(new WordSpan(i, i+1));
 }

 for (int i = 1; i < sentence.length(); i++) {
 if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }

-if (isName(i, sentence, topWords)) {
-if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
+if (isNoun(i, sentence)
+&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
+spans.add(new WordSpan(i - 1, i + 1));
+}
+}
+
+for (int i = 2; i < sentence.length(); i++) {
+if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
+if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
+
+if ((isNoun(i, sentence))
+&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
+&& (isNoun(i-2, sentence)) || "JJ".equals(sentence.posTags[i-2]))
+spans.add(new WordSpan(i-2, i+1));
+}
+
+for (int i = 3; i < sentence.length(); i++) {
+if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
+if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
+if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
+
+if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
+if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
+spans.add(new WordSpan(i-3, i+1));
+else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
+spans.add(new WordSpan(i-3, i+1));
+else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence))
+&& (isJoiner(sentence, i-2)||isNoun(i-2, sentence)))
+spans.add(new WordSpan(i-3, i+1));
+}
+}
+
+return spans.toArray(WordSpan[]::new);
+}
+
+
+public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
+if (sentence.keywords != null) {
+return sentence.keywords.get();
+}
+List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
+
+for (int i = 0; i < sentence.length(); i++) {
+if (isName(i, sentence) || isTopAdj(i, sentence))
+spans.add(new WordSpan(i, i+1));
+}
+
+for (int i = 1; i < sentence.length(); i++) {
+if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
+
+if (isName(i, sentence)) {
+if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
 spans.add(new WordSpan(i - 1, i + 1));
 }
-if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
+if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) {
 spans.add(new WordSpan(i - 1, i + 1));
 }
 }
@@ -86,16 +130,16 @@
 if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
 if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }

-if (isName(i, sentence, topWords)) {
-if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
-&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
+if (isName(i, sentence)) {
+if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
+&& (isName(i-2, sentence) || isTopAdj(i-2, sentence))) {
 spans.add(new WordSpan(i - 2, i + 1));
 }
 else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
 spans.add(new WordSpan(i - 2, i + 1));
 }
 }
-else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
+else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) {
 spans.add(new WordSpan(i - 2, i + 1));
 }
 }
@@ -105,10 +149,10 @@
 if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
 if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }

-if (isName(i, sentence, topWords) &&
-(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
-(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
-(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
+if (isName(i, sentence) &&
+(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
+(isName(i-2, sentence) || isTopAdj(i-2, sentence)) &&
+(isName(i-3, sentence) || isTopAdj(i-3, sentence))) {
 spans.add(new WordSpan(i - 3, i + 1));
 }
 else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
@@ -134,7 +178,9 @@
 public boolean isProperNoun(int i, DocumentSentence sent) {
 return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]);
 }
+public boolean isNoun(int i, DocumentSentence sent) {
+return sent.posTags[i].startsWith("NN");
+}
 public boolean isJoiner(DocumentSentence sent, int i) {
 if(sent.posTags[i].equals("IN")) {
 return true;
@@ -183,21 +229,13 @@
 return true;
 }

-private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
-if (!topWords.isEmpty()) {
-String posTag = sentence.posTags[i];
-String word = sentence.stemmedWords[i];
-
-return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i));
-}
-
-
+private boolean isName(int i, DocumentSentence sentence) {
 String posTag = sentence.posTags[i];

-return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
+return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i);
 }

-private boolean isTopAdj(int i, DocumentSentence sentence, Set<String> topWords) {
+private boolean isTopAdj(int i, DocumentSentence sentence) {
 String posTag = sentence.posTags[i];

 return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
@@ -20,7 +20,7 @@ public class NameCounter {

 for (int i = 0; i < dld.sentences.length; i++) {
 DocumentSentence sent = dld.sentences[i];
-var keywords = keywordExtractor.getNames(sent);
+var keywords = keywordExtractor.getProperNames(sent);
 for (var span : keywords) {
 if (span.size() <= 1)
 continue;
@@ -1,9 +1,11 @@
 package nu.marginalia.util.language.processing;

 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
+import org.apache.commons.lang3.StringUtils;

 import java.util.*;
 import java.util.stream.Collectors;
@@ -23,13 +25,13 @@ public class SubjectCounter {
 // Greeks bearing gifts -> Greeks
 // Steve McQueen drove fast | cars -> Steve McQueen

-public List<WordRep> count(DocumentLanguageData dld) {
+public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {

 Map<String, Integer> counts = new HashMap<>();
 Map<String, Set<WordRep>> instances = new HashMap<>();

 for (var sentence : dld.sentences) {
-for (WordSpan kw : keywordExtractor.getNames(sentence)) {
+for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
 if (kw.end + 2 >= sentence.length()) {
 continue;
 }
@@ -46,20 +48,46 @@ public class SubjectCounter {

 String stemmed = rep.stemmed;

-counts.merge(stemmed, -1, Integer::sum);
 instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
 }
 }
 }

-int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
+Map<String, Integer> scores = new HashMap<>(instances.size());
+for (String stemmed : instances.keySet()) {
+scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
+}

-return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
+return scores.entrySet().stream()
+.filter(e -> e.getValue() >= 150)
 .flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
 .collect(Collectors.toList());
 }

+private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
+if (stemmed.contains("_")) {
+int sum = 0;
+String[] parts = StringUtils.split(stemmed, '_');
+
+if (parts.length == 0) {
+return 0;
+}
+
+for (String part : parts) {
+sum += getTermTfIdf(keywordMetadata, part);
+}
+
+return sum / parts.length;
+}
+
+var meta = keywordMetadata.wordsTfIdf().get(stemmed);
+if (meta != null) {
+return meta.tfIdfNormalized();
+}
+
+return 0;
+}
+
 private boolean isDetOrAdverbOrVerb(String posTag) {
 return "DT".equals(posTag) // determinant
 || "RB".equals(posTag) // adverb
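The subject filter above now scores candidates by term TF-IDF instead of the old occurrence-count heuristic; a hedged sketch of the call shape after this change (the 150 threshold is the one in the diff, the other inputs are placeholders, and the no-arg KeywordMetadata constructor is assumed from the KeywordMetadata hunk later in this commit):

    KeywordMetadata metadata = new KeywordMetadata();                              // assumed no-arg constructor
    DocumentLanguageData dld = sentenceExtractor.extractSentences(text, title);   // placeholders for the caller's inputs
    List<WordRep> subjects = subjectCounter.count(metadata, dld);                  // only terms with normalized TF-IDF >= 150 survive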
@@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model;

 import gnu.trove.map.hash.TObjectIntHashMap;
 import lombok.AllArgsConstructor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
+
 import java.util.Arrays;
 import java.util.stream.Stream;

 /**
- * @see nu.marginalia.util.language.processing.SentenceExtractor
+ * @see SentenceExtractor
 */
 @AllArgsConstructor
 public class DocumentLanguageData {
@@ -17,9 +17,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
 )
 {

-private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
-private static final int TF_IDF_HIGH_LIMIT = 64;
-
 public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
 this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
 new HashMap<>(15_000),
@@ -31,7 +28,8 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
 this(EnumSet.noneOf(EdgePageWordFlags.class));
 }

-public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
+private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
+public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {

 KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
 EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
|
|||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int compareTo(@NotNull WordRep o) {
|
public int compareTo(@NotNull WordRep o) {
|
||||||
return stemmed.compareTo(o.stemmed);
|
return word.compareTo(o.word);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -1,16 +1,14 @@
|
|||||||
package nu.marginalia.util.language.processing;
|
package nu.marginalia.util.language.processing.sentence;
|
||||||
|
|
||||||
import com.github.datquocnguyen.RDRPOSTagger;
|
import com.github.datquocnguyen.RDRPOSTagger;
|
||||||
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||||
import gnu.trove.list.array.TIntArrayList;
|
|
||||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.util.StringPool;
|
||||||
import nu.marginalia.util.language.conf.LanguageModels;
|
import nu.marginalia.util.language.conf.LanguageModels;
|
||||||
|
import nu.marginalia.util.language.processing.HtmlTagCleaner;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
||||||
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
|
||||||
import opennlp.tools.sentdetect.SentenceDetectorME;
|
import opennlp.tools.sentdetect.SentenceDetectorME;
|
||||||
import opennlp.tools.sentdetect.SentenceModel;
|
import opennlp.tools.sentdetect.SentenceModel;
|
||||||
import opennlp.tools.stemmer.PorterStemmer;
|
import opennlp.tools.stemmer.PorterStemmer;
|
||||||
@@ -24,25 +22,22 @@ import javax.inject.Inject;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
-import java.util.Optional;
+import java.util.*;
 import java.util.regex.Pattern;

-import static nu.marginalia.util.language.WordPatterns.*;
-
 public class SentenceExtractor {

 private SentenceDetectorME sentenceDetector;
 private final RDRPOSTagger rdrposTagger;

 private final PorterStemmer porterStemmer = new PorterStemmer();
-private boolean legacyMode = false;
 private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);

 private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner();

+private final ThreadLocal<StringPool> stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000));
+
+
 @SneakyThrows @Inject
 public SentenceExtractor(LanguageModels models) {
 try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
@@ -66,6 +61,22 @@ public class SentenceExtractor {
 final String text = asText(doc);
 final DocumentSentence[] textSentences = extractSentencesFromString(text);

+String title = getTitle(doc, textSentences);
+
+TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
+var titleSentences = extractSentencesFromString(title.toLowerCase());
+return new DocumentLanguageData(textSentences, titleSentences, counts);
+}
+
+public DocumentLanguageData extractSentences(String text, String title) {
+final DocumentSentence[] textSentences = extractSentencesFromString(text);
+
+TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
+
+return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
+}
+
+private String getTitle(Document doc, DocumentSentence[] textSentences) {
 String title = doc.getElementsByTag("title").text() + " . " +
 Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");

@@ -82,34 +93,7 @@ public class SentenceExtractor {
             }
         }
 
-        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
-        var titleSentences = extractSentencesFromString(title.toLowerCase());
-        return new DocumentLanguageData(textSentences, titleSentences, counts);
-    }
-
-    public DocumentLanguageData extractSentences(String text) {
-        final DocumentSentence[] textSentences = extractSentencesFromString(text);
-
-        String title = "";
-        for (DocumentSentence textSentence : textSentences) {
-            if (textSentence.length() > 0) {
-                title = textSentence.originalSentence.toLowerCase();
-                break;
-            }
-        }
-
-        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
-
-        return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
-    }
-
-
-    public DocumentLanguageData extractSentences(String text, String title) {
-        final DocumentSentence[] textSentences = extractSentencesFromString(text);
-
-        TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
-
-        return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
-    }
+        return title;
     }
 
 
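Note: the two hunks above drop the old extractSentences(String) fallback, which guessed a title from the first non-empty sentence, in favour of a getTitle helper that reads the <title> tag plus the first <h1>, and they add a two-argument extractSentences(String text, String title) overload. A hedged sketch of calling the new overload (the strings are illustrative; languageModels stands in for a LanguageModels instance, which this diff does not construct):

    SentenceExtractor se = new SentenceExtractor(languageModels);
    DocumentLanguageData dld = se.extractSentences(
            "The quick brown fox jumps over the lazy dog.",  // body text
            "An Example Title");                             // title, lower-cased before extraction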
@ -125,79 +109,95 @@ public class SentenceExtractor {
|
|||||||
return counts;
|
return counts;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
|
||||||
|
|
||||||
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
|
||||||
|
|
||||||
private boolean isBadChar(char c) {
|
|
||||||
if (c >= 'a' && c <= 'z') return false;
|
|
||||||
if (c >= 'A' && c <= 'Z') return false;
|
|
||||||
if (c >= '0' && c <= '9') return false;
|
|
||||||
if ("_#@.".indexOf(c) >= 0) return false;
|
|
||||||
if (c >= '\u00C0' && c <= '\u00D6') return false;
|
|
||||||
if (c >= '\u00D8' && c <= '\u00F6') return false;
|
|
||||||
if (c >= '\u00F8' && c <= '\u00FF') return false;
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
private String sanitizeString(String s) {
|
|
||||||
char[] newChars = new char[s.length()];
|
|
||||||
int pi = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < newChars.length; i++) {
|
|
||||||
char c = s.charAt(i);
|
|
||||||
if (!isBadChar(c)) {
|
|
||||||
newChars[pi++] = c;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
newChars[pi++] = ' ';
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
s = new String(newChars, 0, pi);
|
|
||||||
|
|
||||||
if (s.startsWith(".")) {
|
|
||||||
s = s.substring(1);
|
|
||||||
if (s.isBlank())
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public DocumentSentence extractSentence(String text) {
|
public DocumentSentence extractSentence(String text) {
|
||||||
var wordsAndSeps = splitSegment(text);
|
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);
|
||||||
|
|
||||||
var words = wordsAndSeps.words;
|
var words = wordsAndSeps.words;
|
||||||
var seps = wordsAndSeps.separators;
|
var seps = wordsAndSeps.separators;
|
||||||
var lc = toLc(wordsAndSeps.words);
|
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
|
||||||
|
|
||||||
return new DocumentSentence(
|
return new DocumentSentence(
|
||||||
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String normalizeSpaces(String s) {
|
|
||||||
if (s.indexOf('\t') >= 0) {
|
|
||||||
s = s.replace('\t', ' ');
|
|
||||||
}
|
|
||||||
if (s.indexOf('\n') >= 0) {
|
|
||||||
s = s.replace('\n', ' ');
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
public DocumentSentence[] extractSentencesFromString(String text) {
|
public DocumentSentence[] extractSentencesFromString(String text) {
|
||||||
String[] sentences;
|
String[] sentences;
|
||||||
|
|
||||||
String textNormalizedSpaces = normalizeSpaces(text);
|
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
|
||||||
try {
|
try {
|
||||||
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
|
// shitty fallback logic
|
||||||
sentences = StringUtils.split(textNormalizedSpaces, '.');
|
sentences = StringUtils.split(textNormalizedSpaces, '.');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
sentences = preCleanSentences(sentences);
|
||||||
|
|
||||||
|
final String[][] tokens = new String[sentences.length][];
|
||||||
|
final int[][] separators = new int[sentences.length][];
|
||||||
|
final String[][] posTags = new String[sentences.length][];
|
||||||
|
final String[][] tokensLc = new String[sentences.length][];
|
||||||
|
final String[][] stemmedWords = new String[sentences.length][];
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.length; i++) {
|
||||||
|
|
||||||
|
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
|
||||||
|
tokens[i] = wordsAndSeps.words;
|
||||||
|
separators[i] = wordsAndSeps.separators;
|
||||||
|
if (tokens[i].length > 250) {
|
||||||
|
tokens[i] = Arrays.copyOf(tokens[i], 250);
|
||||||
|
separators[i] = Arrays.copyOf(separators[i], 250);
|
||||||
|
}
|
||||||
|
for (int j = 0; j < tokens[i].length; j++) {
|
||||||
|
while (tokens[i][j].endsWith(".")) {
|
||||||
|
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var sPool = stringPool.get();
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.length; i++) {
|
||||||
|
tokens[i] = sPool.internalize(tokens[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.length; i++) {
|
||||||
|
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
|
||||||
|
// don't need to internalize this
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.length; i++) {
|
||||||
|
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
|
||||||
|
tokensLc[i] = sPool.internalize(tokensLc[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < tokens.length; i++) {
|
||||||
|
stemmedWords[i] = stemSentence(tokensLc[i]);
|
||||||
|
stemmedWords[i] = sPool.internalize(stemmedWords[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
||||||
|
for (int i = 0; i < ret.length; i++) {
|
||||||
|
String fullString;
|
||||||
|
|
||||||
|
if (i == 0) {
|
||||||
|
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fullString = "";
|
||||||
|
}
|
||||||
|
|
||||||
|
ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
||||||
|
|
||||||
|
private String[] preCleanSentences(String[] sentences) {
|
||||||
|
|
||||||
if (sentences.length > 250) {
|
if (sentences.length > 250) {
|
||||||
sentences = Arrays.copyOf(sentences, 250);
|
sentences = Arrays.copyOf(sentences, 250);
|
||||||
}
|
}
|
||||||
@ -212,53 +212,13 @@ public class SentenceExtractor {
|
|||||||
sentenceList.add(s);
|
sentenceList.add(s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
sentences = sentenceList.toArray(String[]::new);
|
return sentenceList.toArray(String[]::new);
|
||||||
|
|
||||||
final String[][] tokens = new String[sentences.length][];
|
|
||||||
final int[][] separators = new int[sentences.length][];
|
|
||||||
final String[][] posTags = new String[sentences.length][];
|
|
||||||
final String[][] tokensLc = new String[sentences.length][];
|
|
||||||
final String[][] stemmedWords = new String[sentences.length][];
|
|
||||||
|
|
||||||
for (int i = 0; i < tokens.length; i++) {
|
|
||||||
|
|
||||||
var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]);
|
|
||||||
tokens[i] = wordsAndSeps.words;
|
|
||||||
separators[i] = wordsAndSeps.separators;
|
|
||||||
if (tokens[i].length > 250) {
|
|
||||||
tokens[i] = Arrays.copyOf(tokens[i], 250);
|
|
||||||
separators[i] = Arrays.copyOf(separators[i], 250);
|
|
||||||
}
|
|
||||||
for (int j = 0; j < tokens[i].length; j++) {
|
|
||||||
while (tokens[i][j].endsWith(".")) {
|
|
||||||
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < tokens.length; i++) {
|
|
||||||
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < tokens.length; i++) {
|
|
||||||
tokensLc[i] = toLc(tokens[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < tokens.length; i++) {
|
|
||||||
stemmedWords[i] = stemSentence(tokensLc[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
|
||||||
for (int i = 0; i < ret.length; i++) {
|
|
||||||
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
|
||||||
}
|
|
||||||
return ret;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String[] stemSentence(String[] strings) {
|
private String[] stemSentence(String[] strings) {
|
||||||
String[] stemmed = new String[strings.length];
|
String[] stemmed = new String[strings.length];
|
||||||
for (int i = 0; i < stemmed.length; i++) {
|
for (int i = 0; i < stemmed.length; i++) {
|
||||||
var sent = cleanPossessive(strings[i]);
|
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
|
||||||
try {
|
try {
|
||||||
stemmed[i] = porterStemmer.stem(sent);
|
stemmed[i] = porterStemmer.stem(sent);
|
||||||
}
|
}
|
||||||
@ -269,27 +229,6 @@ public class SentenceExtractor {
|
|||||||
return stemmed;
|
return stemmed;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String cleanPossessive(String s) {
|
|
||||||
int end = s.length();
|
|
||||||
|
|
||||||
if (s.endsWith("\'")) {
|
|
||||||
return s.substring(0, end-1);
|
|
||||||
} else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
|
|
||||||
return s.substring(0, end-2).toLowerCase();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private String[] toLc(String[] words) {
|
|
||||||
String[] lower = new String[words.length];
|
|
||||||
for (int i = 0; i < lower.length; i++) {
|
|
||||||
lower[i] = cleanPossessive(words[i]).toLowerCase();
|
|
||||||
}
|
|
||||||
return lower;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String asText(Document dc) {
|
public String asText(Document dc) {
|
||||||
|
|
||||||
tagCleaner.clean(dc);
|
tagCleaner.clean(dc);
|
||||||
@ -299,67 +238,6 @@ public class SentenceExtractor {
|
|||||||
return text.substring(0, (int) (text.length()*0.95));
|
return text.substring(0, (int) (text.length()*0.95));
|
||||||
}
|
}
|
||||||
|
|
||||||
@AllArgsConstructor @Getter
|
|
||||||
private static class WordsAndSeparators {
|
|
||||||
String[] words;
|
|
||||||
int[] separators;
|
|
||||||
}
|
|
||||||
|
|
||||||
private WordsAndSeparators splitSegment(String segment) {
|
|
||||||
var matcher = wordBreakPattern.matcher(segment);
|
|
||||||
|
|
||||||
List<String> words = new ArrayList<>(segment.length()/6);
|
|
||||||
TIntArrayList separators = new TIntArrayList(segment.length()/6);
|
|
||||||
|
|
||||||
int start = 0;
|
|
||||||
int wordStart = 0;
|
|
||||||
while (wordStart <= segment.length()) {
|
|
||||||
if (!matcher.find(wordStart)) {
|
|
||||||
words.add(segment.substring(wordStart));
|
|
||||||
separators.add(WordSeparator.SPACE);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (wordStart != matcher.start()) {
|
|
||||||
words.add(segment.substring(wordStart, matcher.start()));
|
|
||||||
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
|
|
||||||
}
|
|
||||||
wordStart = matcher.end();
|
|
||||||
}
|
|
||||||
|
|
||||||
String[] parts = words.toArray(String[]::new);
|
|
||||||
int length = 0;
|
|
||||||
for (int i = 0; i < parts.length; i++) {
|
|
||||||
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
|
|
||||||
parts[i] = null;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
length++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
String[] ret = new String[length];
|
|
||||||
int[] seps = new int[length];
|
|
||||||
for (int i = 0, j=0; i < parts.length; i++) {
|
|
||||||
if (parts[i] != null) {
|
|
||||||
seps[j] = separators.getQuick(i);
|
|
||||||
ret[j++] = parts[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (int i = 0; i < ret.length; i++) {
|
|
||||||
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
|
|
||||||
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
|
|
||||||
}
|
|
||||||
return new WordsAndSeparators(ret, seps);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public boolean isLegacyMode() {
|
|
||||||
return legacyMode;
|
|
||||||
}
|
|
||||||
public void setLegacyMode(boolean legacyMode) {
|
|
||||||
this.legacyMode = legacyMode;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
@@ -0,0 +1,93 @@
+package nu.marginalia.util.language.processing.sentence;
+
+import java.util.Arrays;
+import java.util.Objects;
+
+public class SentenceExtractorStringUtils {
+
+    public static String sanitizeString(String s) {
+        char[] newChars = new char[s.length()];
+        int pi = 0;
+        boolean changed = false;
+        for (int i = 0; i < newChars.length; i++) {
+            char c = s.charAt(i);
+            if (!isBadChar(c)) {
+                newChars[pi++] = c;
+            }
+            else {
+                changed = true;
+                newChars[pi++] = ' ';
+            }
+        }
+
+        if (changed) {
+            s = new String(newChars, 0, pi);
+        }
+
+        if (s.startsWith(".")) {
+            s = s.substring(1);
+        }
+
+        if (s.isBlank()) {
+            return "";
+        }
+
+        return s;
+
+    }
+
+    private static boolean isBadChar(char c) {
+        if (c >= 'a' && c <= 'z') return false;
+        if (c >= 'A' && c <= 'Z') return false;
+        if (c >= '0' && c <= '9') return false;
+        if ("_#@.".indexOf(c) >= 0) return false;
+        if (c >= '\u00C0' && c <= '\u00D6') return false;
+        if (c >= '\u00D8' && c <= '\u00F6') return false;
+        if (c >= '\u00F8' && c <= '\u00FF') return false;
+
+        return true;
+    }
+
+    public static String normalizeSpaces(String s) {
+        if (s.indexOf('\t') >= 0) {
+            s = s.replace('\t', ' ');
+        }
+        if (s.indexOf('\n') >= 0) {
+            s = s.replace('\n', ' ');
+        }
+        return s;
+    }
+
+
+    public static String toLowerCaseStripPossessive(String word) {
+        String val = stripPossessive(word).toLowerCase();
+
+        if (Objects.equals(val, word)) {
+            return word;
+        }
+
+        return val;
+    }
+
+    public static String[] toLowerCaseStripPossessive(String[] words) {
+        String[] lc = new String[words.length];
+        Arrays.setAll(lc, i -> SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
+        return lc;
+    }
+
+    public static String stripPossessive(String s) {
+        int end = s.length();
+
+        if (s.endsWith("'")) {
+            return s.substring(0, end-1);
+        }
+
+        if (s.endsWith("'s") || s.endsWith("'S")) {
+            return s.substring(0, end-2);
+        }
+
+        return s;
+    }
+
+
+}
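A quick note on the helper class added above: sanitizeString blanks out characters outside the accepted ranges and drops a leading period, and stripPossessive trims a trailing apostrophe or 's/'S before stemming. A minimal sketch of the behaviour these definitions imply (the input strings are made up for illustration):

    String a = SentenceExtractorStringUtils.stripPossessive("Marginalia's");      // "Marginalia"
    String b = SentenceExtractorStringUtils.toLowerCaseStripPossessive("Dogs'");  // "dogs"
    String c = SentenceExtractorStringUtils.sanitizeString(".foo/bar!");          // "foo bar " ('/' and '!' become spaces, the leading '.' is dropped)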
@@ -0,0 +1,72 @@
+package nu.marginalia.util.language.processing.sentence;
+
+import gnu.trove.list.array.TIntArrayList;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import nu.marginalia.util.language.processing.model.tag.WordSeparator;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.util.language.WordPatterns.*;
+
+public class SentenceSegmentSplitter {
+
+
+    @AllArgsConstructor
+    @Getter
+    public static class SeparatedSentence {
+        String[] words;
+        int[] separators;
+    }
+
+    public static SeparatedSentence splitSegment(String segment) {
+        var matcher = wordBreakPattern.matcher(segment);
+
+        List<String> words = new ArrayList<>(segment.length()/6);
+        TIntArrayList separators = new TIntArrayList(segment.length()/6);
+
+        int wordStart = 0;
+        while (wordStart <= segment.length()) {
+            if (!matcher.find(wordStart)) {
+                words.add(segment.substring(wordStart));
+                separators.add(WordSeparator.SPACE);
+                break;
+            }
+
+            if (wordStart != matcher.start()) {
+                words.add(segment.substring(wordStart, matcher.start()));
+                separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
+            }
+            wordStart = matcher.end();
+        }
+
+        String[] parts = words.toArray(String[]::new);
+        int length = 0;
+        for (int i = 0; i < parts.length; i++) {
+            if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
+                parts[i] = null;
+            }
+            else {
+                length++;
+            }
+        }
+
+        String[] ret = new String[length];
+        int[] seps = new int[length];
+        for (int i = 0, j=0; i < parts.length; i++) {
+            if (parts[i] != null) {
+                seps[j] = separators.getQuick(i);
+                ret[j++] = parts[i];
+            }
+        }
+
+        for (int i = 0; i < ret.length; i++) {
+            if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
+            if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
+        }
+        return new SeparatedSentence(ret, seps);
+    }
+
+
+}
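The splitter added above carries the word/separator split that previously lived inside SentenceExtractor. A hedged usage sketch (the sentence is invented; getWords/getSeparators are the Lombok-generated getters of SeparatedSentence):

    var parts = SentenceSegmentSplitter.splitSegment("Hello, world - this is a test");
    String[] words = parts.getWords();   // blank, overlong and noise tokens are filtered out
    int[] seps = parts.getSeparators();  // WordSeparator.SPACE or WordSeparator.COMMA, one per kept word
    assert words.length == seps.length;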
@@ -1,39 +0,0 @@
-package nu.marginalia.util.ranking;
-
-
-public class BuggyReversePageRank extends RankingAlgorithm {
-
-
-    public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
-        super(domains, origins);
-    }
-
-    @Override
-    RankVector createNewRankVector(RankVector rank) {
-
-        double rankNorm = rank.norm();
-        RankVector newRank = new RankVector(0);
-
-        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
-
-            var links = linkDataSrc2Dest[domainId];
-
-            if (links != null && links.size() > 0) {
-                double newRankValue = 0;
-
-                for (int j = 0; j < links.size(); j++) {
-                    newRankValue += rank.get(links.getQuick(j)) / links.size();
-                }
-
-                newRank.set(domainId, 0.85*newRankValue/rankNorm);
-            }
-        }
-        return newRank;
-    }
-
-    @Override
-    void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm));
-    }
-
-}
@@ -1,45 +0,0 @@
-package nu.marginalia.util.ranking;
-
-
-public class BuggyStandardPageRank extends RankingAlgorithm {
-
-    public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
-        super(domains, origins);
-    }
-
-    @Override
-    RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) {
-        RankVector newRank = new RankVector(0);
-
-        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
-
-            var links = linkDataSrc2Dest[domainId];
-            double newRankValue = 0;
-
-            if (links != null && links.size() > 0) {
-                for (int j = 0; j < links.size(); j++) {
-                    int linkedDomain = links.getQuick(j);
-
-                    int linkSize = 1;
-                    var bl = linkDataSrc2Dest[linkedDomain];
-                    if (bl != null) {
-                        linkSize = bl.size();
-                    }
-
-                    newRankValue += rank.get(linkedDomain) / linkSize;
-
-                }
-            }
-
-            newRank.set(domainId, 0.85 * newRankValue);
-        }
-        return newRank;
-    }
-
-    @Override
-    void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size()));
-        vector.incrementAll(0.14*dNorm/vector.size());
-    }
-
-}
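For reference, both classes removed above were in-house takes on the PageRank power iteration; the standard update they approximate is r'(d) = (1 - λ)/N + λ * Σ_{s→d} r(s)/outdeg(s) with damping λ ≈ 0.85, which is what the 0.85 and 0.14 constants in the deleted code correspond to.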
@ -1,89 +0,0 @@
|
|||||||
package nu.marginalia.util.ranking.tool;
|
|
||||||
|
|
||||||
import lombok.AllArgsConstructor;
|
|
||||||
import lombok.Getter;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import lombok.ToString;
|
|
||||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
|
||||||
import org.mariadb.jdbc.Driver;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
public class DedupTool {
|
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(DedupTool.class);
|
|
||||||
|
|
||||||
public Set<String> originDomains = new HashSet<>();
|
|
||||||
public Set<Integer> originDomainIds = new HashSet<>();
|
|
||||||
public final long domainIdMax = -1;
|
|
||||||
public int domainCount;
|
|
||||||
private volatile static int rankMax;
|
|
||||||
|
|
||||||
public int maxId() {
|
|
||||||
return (int) domainIdMax;
|
|
||||||
}
|
|
||||||
public int domainCount() {
|
|
||||||
return domainCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
static LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
|
||||||
volatile static boolean running = true;
|
|
||||||
|
|
||||||
@AllArgsConstructor @ToString @Getter
|
|
||||||
static class Data {
|
|
||||||
String url;
|
|
||||||
int id;
|
|
||||||
String domain;
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public static void main(String... args) {
|
|
||||||
Driver driver = new Driver();
|
|
||||||
var ds = new DatabaseModule().provideConnection();
|
|
||||||
|
|
||||||
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
|
|
||||||
|
|
||||||
try (var conn = ds.getConnection();
|
|
||||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
|
||||||
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
|
|
||||||
|
|
||||||
) {
|
|
||||||
fetchStmt.setFetchSize(10_000);
|
|
||||||
var rsp = fetchStmt.executeQuery();
|
|
||||||
while (rsp.next()) {
|
|
||||||
domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>())
|
|
||||||
.computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5)));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
List<Integer> updateIds = new ArrayList<>();
|
|
||||||
|
|
||||||
domainToHashToUrl.forEach((domain, hashes) -> {
|
|
||||||
hashes.forEach((hash, urls) -> {
|
|
||||||
if (urls.size() > 1) {
|
|
||||||
Comparator<Data> c = Comparator.comparing(d -> d.domain.length());
|
|
||||||
var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length()))
|
|
||||||
.collect(Collectors.partitioningBy(d -> d.url.endsWith("/")));
|
|
||||||
|
|
||||||
Stream
|
|
||||||
.concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1)
|
|
||||||
.map(Data::getId)
|
|
||||||
.forEach(updateIds::add);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
for (int id : updateIds) {
|
|
||||||
updateStmt.setInt(1, id);
|
|
||||||
updateStmt.executeUpdate();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
 
 import com.google.gson.*;
 import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
+import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
@@ -24,6 +25,8 @@ public class GsonFactory {
             .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
             .registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
             .registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
+            .registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
+            .registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
             .serializeSpecialFloatingPointValues()
             .create();
 }
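The two adapters registered above let Gson treat BigString fields as ordinary JSON strings while keeping them compressed in memory. A hedged round-trip sketch (the Page record is hypothetical; encode/decode are the BigString calls used by the adapters):

    record Page(String url, BigString body) {}

    Gson gson = GsonFactory.get();
    String json = gson.toJson(new Page("https://example.com/", BigString.encode("<html>...</html>")));
    Page back = gson.fromJson(json, Page.class);
    String html = back.body().decode();   // the original markup comes back out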
@@ -13,7 +13,6 @@ import nu.marginalia.wmsa.memex.MemexMain;
 import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
 import nu.marginalia.wmsa.renderer.RendererMain;
 import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
-import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
 import org.apache.logging.log4j.core.lookup.MainMapLookup;
 
 import java.util.Map;
@@ -26,7 +25,6 @@ public enum ServiceDescriptor {
     AUTH("auth", 5003, AuthMain.class),
     API("api", 5004, ApiMain.class),
 
-    SMHI_SCRAPER("smhi-scraper",5012, SmhiScraperMain.class),
     PODCST_SCRAPER("podcast-scraper", 5013, PodcastScraperMain.class),
 
     EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),
@@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict;
 
 import ca.rmen.porterstemmer.PorterStemmer;
 import gnu.trove.map.hash.TLongIntHashMap;
+import gnu.trove.set.hash.TLongHashSet;
 import nu.marginalia.util.language.LanguageFilter;
 import nu.marginalia.util.language.conf.LanguageModels;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.configuration.WmsaHome;
 import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
@@ -18,11 +19,10 @@ import javax.annotation.Nullable;
 import javax.inject.Inject;
 import javax.inject.Singleton;
 import java.io.*;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
 import java.util.concurrent.ForkJoinPool;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -101,12 +101,15 @@ public class TermFrequencyDict {
 
             fjp.execute(() -> {
 
+                TLongHashSet words = new TLongHashSet(10_000);
+
                 for (var doc : domain.doc) {
 
                     if (doc.documentBody == null)
                         continue;
                     docCount.incrementAndGet();
 
-                    Document parsed = Jsoup.parse(doc.documentBody);
+                    Document parsed = Jsoup.parse(doc.documentBody.decode());
                     parsed.body().filter(new DomPruningFilter(0.5));
 
                     DocumentLanguageData dld = se.get().extractSentences(parsed);
@@ -115,28 +118,30 @@ public class TermFrequencyDict {
                         return;
                     }
 
-                    Set<String> words = new HashSet<>(10_000);
-
                     for (var sent : dld.sentences) {
                         for (var word : sent) {
-                            words.add(word.stemmed());
+                            words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
                         }
                     }
 
-                    fjp.execute(() -> {
-                        synchronized (counts) {
-                            for (var word : words) {
-                                counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
-                            }
+                    synchronized (counts) {
+                        words.forEach(w -> {
+                            counts.adjustOrPutValue(w, 1, 1);
+                            return true;
+                        });
                     }
-                    });
 
+                    words.clear();
                 }
 
+                System.out.println(domain.domain + "\t" + counts.size());
             });
 
 
         }
 
         fjp.shutdown();
-        fjp.awaitTermination(10, TimeUnit.SECONDS);
+        fjp.awaitTermination(10, TimeUnit.DAYS);
 
         try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
             synchronized (counts) {
@@ -155,14 +160,6 @@ public class TermFrequencyDict {
         }
 
         System.out.println(docCount.get());
-//
-//        counts.forEachEntry((w,c) -> {
-//            if (c > 3L) {
-//                System.out.println(w + ":" + c);
-//            }
-//            return true;
-//        });
 
     }
 
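The change above replaces the per-document HashSet<String> with a reusable TLongHashSet of hashed stems, so deduplication and counting both work on longs and the shared TLongIntHashMap is only touched once per document. A minimal sketch of that pattern, using the same Trove calls as the code above (the stems are made up):

    TLongHashSet seen = new TLongHashSet(10_000);
    for (String stem : new String[] { "marginalia", "search", "marginalia" }) {
        seen.add(longHash(stem.getBytes(StandardCharsets.UTF_8)));  // duplicates collapse in the set
    }
    synchronized (counts) {
        seen.forEach(hash -> {
            counts.adjustOrPutValue(hash, 1, 1);  // +1 document frequency per distinct stem
            return true;                          // keep iterating
        });
    }
    seen.clear();  // reuse the set for the next document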
public static long getStringHash(String s) {
|
public static long getStringHash(String s) {
|
||||||
|
@ -46,17 +46,12 @@ public class ConverterMain {
|
|||||||
InstructionsCompiler compiler,
|
InstructionsCompiler compiler,
|
||||||
Gson gson
|
Gson gson
|
||||||
) throws Exception {
|
) throws Exception {
|
||||||
|
|
||||||
;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
logger.info("Starting pipe");
|
logger.info("Starting pipe");
|
||||||
|
|
||||||
try (WorkLog processLog = plan.createProcessWorkLog();
|
try (WorkLog processLog = plan.createProcessWorkLog();
|
||||||
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
||||||
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
||||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
|
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
|
||||||
|
@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain {
|
|||||||
|
|
||||||
for (var doc : crawledDomain.doc) {
|
for (var doc : crawledDomain.doc) {
|
||||||
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
|
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
|
||||||
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
|
anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex;
|
|||||||
import nu.marginalia.util.gregex.GuardedRegexFactory;
|
import nu.marginalia.util.gregex.GuardedRegexFactory;
|
||||||
import nu.marginalia.util.language.LanguageFilter;
|
import nu.marginalia.util.language.LanguageFilter;
|
||||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||||
@ -178,11 +178,13 @@ public class DocumentProcessor {
|
|||||||
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
|
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
|
||||||
throws DisqualifiedException, URISyntaxException {
|
throws DisqualifiedException, URISyntaxException {
|
||||||
|
|
||||||
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
|
String documentBody = crawledDocument.documentBody.decode();
|
||||||
|
|
||||||
|
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||||
}
|
}
|
||||||
|
|
||||||
Document doc = Jsoup.parse(crawledDocument.documentBody);
|
Document doc = Jsoup.parse(documentBody);
|
||||||
|
|
||||||
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||||
// I've never encountered a website where this hasn't been a severe indicator
|
// I've never encountered a website where this hasn't been a severe indicator
|
||||||
|
@ -42,7 +42,7 @@ public class DomainProcessor {
|
|||||||
|
|
||||||
fixBadCanonicalTags(crawledDomain.doc);
|
fixBadCanonicalTags(crawledDomain.doc);
|
||||||
|
|
||||||
StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size());
|
StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
|
||||||
|
|
||||||
for (var doc : crawledDomain.doc) {
|
for (var doc : crawledDomain.doc) {
|
||||||
var processedDoc = documentProcessor.process(doc, crawledDomain);
|
var processedDoc = documentProcessor.process(doc, crawledDomain);
|
||||||
|
@ -33,8 +33,7 @@ public class SiteWords {
|
|||||||
Set<String> commonSiteWords = new HashSet<>(10);
|
Set<String> commonSiteWords = new HashSet<>(10);
|
||||||
|
|
||||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
|
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
|
||||||
EdgePageWordFlags.Subjects,
|
EdgePageWordFlags.Subjects));
|
||||||
EdgePageWordFlags.TfIdfHigh));
|
|
||||||
|
|
||||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
|
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
|
||||||
EdgePageWordFlags.Title));
|
EdgePageWordFlags.Title));
|
||||||
|
@ -11,7 +11,7 @@ public class CommonKeywordExtractor {
|
|||||||
|
|
||||||
private static final int MIN_REQUIRED_DOCUMENTS = 25;
|
private static final int MIN_REQUIRED_DOCUMENTS = 25;
|
||||||
|
|
||||||
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
|
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 15;
|
||||||
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
|
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
|
||||||
|
|
||||||
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
|
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
|
||||||
|
@ -126,6 +126,9 @@ public class LinkParser {
|
|||||||
if (doesUrlStringHaveProtocol(s)) {
|
if (doesUrlStringHaveProtocol(s)) {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
else if (s.startsWith("//")) { // scheme-relative URL
|
||||||
|
return baseUrl.proto + ":" + s;
|
||||||
|
}
|
||||||
|
|
||||||
String[] parts = paramSeparatorPattern.split(s, 2);
|
String[] parts = paramSeparatorPattern.split(s, 2);
|
||||||
String path = parts[0];
|
String path = parts[0];
|
||||||
|
@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.crawling;
|
|||||||
|
|
||||||
import com.github.luben.zstd.ZstdInputStream;
|
import com.github.luben.zstd.ZstdInputStream;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
import jdkoverride.LargeLineBufferedReader;
|
||||||
import nu.marginalia.wmsa.client.GsonFactory;
|
import nu.marginalia.wmsa.client.GsonFactory;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
@ -19,61 +19,41 @@ import java.util.concurrent.TimeUnit;
|
|||||||
public class CrawledDomainReader {
|
public class CrawledDomainReader {
|
||||||
private final Gson gson = GsonFactory.get();
|
private final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
private final ForkJoinPool pool = new ForkJoinPool(4);
|
private final ForkJoinPool pool = new ForkJoinPool(6);
|
||||||
|
|
||||||
public CrawledDomainReader() {
|
public CrawledDomainReader() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public CrawledDomain read(Path path) throws IOException {
|
public CrawledDomain read(Path path) throws IOException {
|
||||||
List<CrawledDocument> docs = new ArrayList<>();
|
DomainDataAssembler domainData = new DomainDataAssembler();
|
||||||
CrawledDomain domain = null;
|
|
||||||
|
|
||||||
|
try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||||
|
String line;
|
||||||
|
while ((line = br.readLine()) != null) {
|
||||||
|
if (line.startsWith("//")) {
|
||||||
|
String identifier = line;
|
||||||
|
String data = br.readLine();
|
||||||
|
|
||||||
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
pool.execute(() -> deserializeLine(identifier, data, domainData));
|
||||||
br.mark(2);
|
|
||||||
boolean legacy = '{' == br.read();
|
|
||||||
br.reset();
|
|
||||||
|
|
||||||
if (legacy) {
|
|
||||||
domain = gson.fromJson(br, CrawledDomain.class);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
String line;
|
|
||||||
while ((line = br.readLine()) != null) {
|
|
||||||
if (line.startsWith("//")) {
|
|
||||||
String nextLine = br.readLine();
|
|
||||||
if (nextLine == null) break;
|
|
||||||
|
|
||||||
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
|
|
||||||
domain = gson.fromJson(nextLine, CrawledDomain.class);
|
|
||||||
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
|
||||||
pool.execute(() -> {
|
|
||||||
var doc = gson.fromJson(nextLine, CrawledDocument.class);
|
|
||||||
synchronized (docs) {
|
|
||||||
docs.add(doc);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
} else if (line.charAt(0) == '{') {
|
|
||||||
domain = gson.fromJson(line, CrawledDomain.class);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pool.awaitQuiescence(10, TimeUnit.SECONDS);
|
while (!pool.awaitQuiescence(1, TimeUnit.SECONDS));
|
||||||
|
|
||||||
if (domain == null) {
|
return domainData.assemble();
|
||||||
return null;
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) {
|
||||||
|
if (null == data) {
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
|
||||||
if (!docs.isEmpty()) {
|
assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class));
|
||||||
if (domain.doc == null)
|
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
||||||
domain.doc = new ArrayList<>();
|
assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class));
|
||||||
|
|
||||||
domain.doc.addAll(docs);
|
|
||||||
}
|
}
|
||||||
return domain;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public CrawledDomain readRuntimeExcept(Path path) {
|
public CrawledDomain readRuntimeExcept(Path path) {
|
||||||
@ -84,4 +64,27 @@ public class CrawledDomainReader {
|
|||||||
throw new RuntimeException(ex);
|
throw new RuntimeException(ex);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class DomainDataAssembler {
|
||||||
|
private CrawledDomain domainPrototype;
|
||||||
|
private final List<CrawledDocument> docs = new ArrayList<>();
|
||||||
|
|
||||||
|
public synchronized void acceptDomain(CrawledDomain domain) {
|
||||||
|
this.domainPrototype = domain;
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized void acceptDoc(CrawledDocument doc) {
|
||||||
|
docs.add(doc);
|
||||||
|
}
|
||||||
|
|
||||||
|
public synchronized CrawledDomain assemble() {
|
||||||
|
if (!docs.isEmpty()) {
|
||||||
|
if (domainPrototype.doc == null)
|
||||||
|
domainPrototype.doc = new ArrayList<>();
|
||||||
|
|
||||||
|
domainPrototype.doc.addAll(docs);
|
||||||
|
}
|
||||||
|
return domainPrototype;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -24,7 +24,7 @@ public class UrlBlocklist {
|
|||||||
patterns.add(s -> s.contains("-download-free"));
|
patterns.add(s -> s.contains("-download-free"));
|
||||||
|
|
||||||
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
|
||||||
patterns.add(GuardedRegexFactory.minLength(48, ".*/[^/]*[a-f0-9]{32,}(/|$)"));
|
patterns.add(this::hashTest);
|
||||||
|
|
||||||
// link farms &c
|
// link farms &c
|
||||||
patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
|
patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
|
||||||
@ -38,6 +38,33 @@ public class UrlBlocklist {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean hashTest(String path) {
|
||||||
|
// look for strings might be a git hash (i.e. long hexadecimal strings)
|
||||||
|
// there is no good guard for a regular expression for this so hand-rolling this
|
||||||
|
// is necessary
|
||||||
|
|
||||||
|
int runLength = 0;
|
||||||
|
int minLength = 32;
|
||||||
|
|
||||||
|
if (path.length() <= minLength + 2)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
for (int i = 0; i < path.length(); i++) {
|
||||||
|
int c = path.charAt(i);
|
||||||
|
|
||||||
|
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) {
|
||||||
|
runLength++;
|
||||||
|
}
|
||||||
|
else if (runLength >= minLength) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
runLength = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return runLength >= minLength;
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isUrlBlocked(EdgeUrl url) {
|
public boolean isUrlBlocked(EdgeUrl url) {
|
||||||
try {
|
try {
|
||||||
if (badDomains.contains(url.domain.domain)) {
|
if (badDomains.contains(url.domain.domain)) {
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
package nu.marginalia.wmsa.edge.crawling.model;
|
package nu.marginalia.wmsa.edge.crawling.model;
|
||||||
|
|
||||||
import lombok.Builder;
|
import lombok.Builder;
|
||||||
|
import nu.marginalia.util.bigstring.BigString;
|
||||||
|
import nu.marginalia.util.bigstring.CompressedBigString;
|
||||||
|
|
||||||
@Builder
|
@Builder
|
||||||
public class CrawledDocument implements SerializableCrawlData {
|
public class CrawledDocument implements SerializableCrawlData {
|
||||||
@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
public String crawlerStatusDesc;
|
public String crawlerStatusDesc;
|
||||||
|
|
||||||
public String headers;
|
public String headers;
|
||||||
public String documentBody;
|
public BigString documentBody;
|
||||||
|
|
||||||
public String documentBodyHash;
|
public String documentBodyHash;
|
||||||
|
|
||||||
public String canonicalUrl;
|
public String canonicalUrl;
|
||||||
|
@ -206,7 +206,7 @@ public class CrawlerRetreiver {
|
|||||||
|
|
||||||
if (doc.documentBody != null) {
|
if (doc.documentBody != null) {
|
||||||
|
|
||||||
doc.documentBodyHash = createHash(doc.documentBody);
|
doc.documentBodyHash = createHash(doc.documentBody.decode());
|
||||||
|
|
||||||
Optional<Document> parsedDoc = parseDoc(doc);
|
Optional<Document> parsedDoc = parseDoc(doc);
|
||||||
EdgeUrl url = new EdgeUrl(doc.url);
|
EdgeUrl url = new EdgeUrl(doc.url);
|
||||||
@ -251,7 +251,7 @@ public class CrawlerRetreiver {
|
|||||||
private Optional<Document> parseDoc(CrawledDocument doc) {
|
private Optional<Document> parseDoc(CrawledDocument doc) {
|
||||||
if (doc.documentBody == null)
|
if (doc.documentBody == null)
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
return Optional.of(Jsoup.parse(doc.documentBody));
|
return Optional.of(Jsoup.parse(doc.documentBody.decode()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isSameDomain(EdgeUrl url) {
|
public boolean isSameDomain(EdgeUrl url) {
|
||||||
|
@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser;
|
|||||||
import lombok.AllArgsConstructor;
|
import lombok.AllArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import lombok.ToString;
|
import lombok.ToString;
|
||||||
|
import nu.marginalia.util.bigstring.BigString;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||||
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
||||||
@ -271,7 +272,7 @@ public class HttpFetcher {
|
|||||||
.canonicalUrl(canonical)
|
.canonicalUrl(canonical)
|
||||||
.httpStatus(rsp.code())
|
.httpStatus(rsp.code())
|
||||||
.url(responseUrl.toString())
|
.url(responseUrl.toString())
|
||||||
.documentBody(strData)
|
.documentBody(BigString.encode(strData))
|
||||||
.build();
|
.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -325,7 +326,7 @@ public class HttpFetcher {
|
|||||||
|
|
||||||
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
|
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
|
||||||
return robotsParser.parseContent(doc.url,
|
return robotsParser.parseContent(doc.url,
|
||||||
doc.documentBody.getBytes(StandardCharsets.UTF_8),
|
doc.documentBody.getBytes(),
|
||||||
doc.contentType,
|
doc.contentType,
|
||||||
userAgent);
|
userAgent);
|
||||||
}
|
}
|
||||||
|
@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index;
|
|||||||
|
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
@ -9,14 +10,16 @@ import java.io.IOException;
|
|||||||
public class EdgeIndexControl {
|
public class EdgeIndexControl {
|
||||||
|
|
||||||
private final IndexServicesFactory servicesFactory;
|
private final IndexServicesFactory servicesFactory;
|
||||||
|
private final EdgeIndexSearchSetsService searchSetsService;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public EdgeIndexControl(IndexServicesFactory servicesFactory) {
|
public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) {
|
||||||
this.servicesFactory = servicesFactory;
|
this.servicesFactory = servicesFactory;
|
||||||
|
this.searchSetsService = searchSetsService;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void regenerateIndex() throws IOException {
|
public void regenerateIndex() throws IOException {
|
||||||
servicesFactory.convertIndex();
|
servicesFactory.convertIndex(searchSetsService.getDomainRankings());
|
||||||
|
|
||||||
System.gc();
|
System.gc();
|
||||||
}
|
}
|
||||||
|
@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule {
|
|||||||
|
|
||||||
|
|
||||||
public void configure() {
|
public void configure() {
|
||||||
if (Boolean.getBoolean("small-ram")) {
|
|
||||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Provides
|
@Provides
|
||||||
|
@ -5,11 +5,11 @@ import com.google.inject.Singleton;
|
|||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.util.array.LongArray;
|
import nu.marginalia.util.array.LongArray;
|
||||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
import nu.marginalia.util.dict.DictionaryMap;
|
||||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
|
|
||||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
|
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
|
||||||
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
 import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
@@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
 import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -33,7 +34,6 @@ import java.util.concurrent.Callable;
 @Singleton
 public class IndexServicesFactory {
     private final Path tmpFileDir;
-    private final EdgeDomainBlacklist domainBlacklist;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
@@ -48,7 +48,6 @@ public class IndexServicesFactory {
     private final PartitionedDataFile revPrioIndexWords;
 
     private volatile static KeywordLexicon keywordLexicon;
-    private final Long dictionaryHashMapSize;
 
     private final Path searchSetsBase;
 
@@ -59,14 +58,10 @@ public class IndexServicesFactory {
     public IndexServicesFactory(
             @Named("tmp-file-dir") Path tmpFileDir,
             @Named("partition-root-slow") Path partitionRootSlow,
-            @Named("partition-root-fast") Path partitionRootFast,
-            @Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
-            EdgeDomainBlacklist domainBlacklist
+            @Named("partition-root-fast") Path partitionRootFast
             ) throws IOException {
 
         this.tmpFileDir = tmpFileDir;
-        this.dictionaryHashMapSize = dictionaryHashMapSize;
-        this.domainBlacklist = domainBlacklist;
 
         this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
         this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
@@ -98,7 +93,7 @@ public class IndexServicesFactory {
     public KeywordLexicon getKeywordLexicon() {
         if (keywordLexicon == null) {
             final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
-            keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize));
+            keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create());
        }
         return keywordLexicon;
     }
@@ -109,15 +104,15 @@ public class IndexServicesFactory {
 
     }
 
-    public void convertIndex() throws IOException {
-        convertForwardIndex();
-        convertFullReverseIndex();
-        convertPriorityReverseIndex();
+    public void convertIndex(DomainRankings domainRankings) throws IOException {
+        convertForwardIndex(domainRankings);
+        convertFullReverseIndex(domainRankings);
+        convertPriorityReverseIndex(domainRankings);
 
 
     }
 
-    private void convertFullReverseIndex() throws IOException {
+    private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException {
 
         logger.info("Converting full reverse index");
 
@@ -125,6 +120,7 @@ public class IndexServicesFactory {
         var journalReader = new SearchIndexJournalReaderSingleFile(longArray);
         var converter = new ReverseIndexConverter(tmpFileDir,
                 journalReader,
+                domainRankings,
                 revIndexWords.get(NEXT_PART).toPath(),
                 revIndexDoc.get(NEXT_PART).toPath());
 
@@ -133,7 +129,7 @@ public class IndexServicesFactory {
         tryGc();
     }
 
-    private void convertPriorityReverseIndex() throws IOException {
+    private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException {
 
         logger.info("Converting priority reverse index");
 
@@ -143,6 +139,7 @@ public class IndexServicesFactory {
 
         var converter = new ReverseIndexConverter(tmpFileDir,
                 journalReader,
+                domainRankings,
                 revPrioIndexWords.get(NEXT_PART).toPath(),
                 revPrioIndexDoc.get(NEXT_PART).toPath());
 
@@ -151,13 +148,14 @@ public class IndexServicesFactory {
         tryGc();
     }
 
-    private void convertForwardIndex() throws IOException {
+    private void convertForwardIndex(DomainRankings domainRankings) throws IOException {
         logger.info("Converting forward index data");
 
-        new ForwardIndexConverter(tmpFileDir,
+        new ForwardIndexConverter(
                 writerIndexFile.get(0),
                 fwdIndexDocId.get(NEXT_PART).toPath(),
-                fwdIndexDocData.get(NEXT_PART).toPath())
+                fwdIndexDocData.get(NEXT_PART).toPath(),
+                domainRankings)
                 .convert();
 
         tryGc();
@@ -215,8 +213,8 @@ public class IndexServicesFactory {
         }
     }
 
-    public SearchIndex createIndexBucket() {
-        return new SearchIndex(this, new EdgeIndexControl(this));
+    public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) {
+        return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService));
     }
 
     public SearchIndexReader getSearchIndexReader() throws IOException {
@@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.google.inject.name.Named;
-import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
+import nu.marginalia.util.dict.DictionaryMap;
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
@@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
 
     @Inject
     public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
-        long hashMapSize = 1L << 31;
-
-        if (Boolean.getBoolean("small-ram")) {
-            hashMapSize = 1L << 27;
-        }
 
         var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
-        lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
+        lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create());
         indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
     }
 
@@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
             String word = words[i];
 
             long id = lexicon.getOrInsert(word);
-            if (id != DictionaryHashMap.NO_VALUE) {
+            if (id != OffHeapDictionaryHashMap.NO_VALUE) {
                 ids[putIdx++] = id;
                 ids[putIdx++] = meta[i];
             }
@@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import io.prometheus.client.Gauge;
 import lombok.SneakyThrows;
-import nu.marginalia.util.dict.DictionaryHashMap;
 import nu.marginalia.util.dict.DictionaryMap;
 import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
 import org.slf4j.Logger;
@@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable {
     private int getOrInsert(byte[] bytes) {
         if (bytes.length >= Byte.MAX_VALUE) {
             logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
-            return DictionaryHashMap.NO_VALUE;
+            return DictionaryMap.NO_VALUE;
         }
 
         final long key = hashFunction.hashBytes(bytes).padToLong();
@@ -8,7 +8,8 @@ import java.util.Set;
 import static java.lang.Math.max;
 import static java.lang.Math.min;
 
-public record EdgePageDocumentsMetadata(int encSize,
+public record EdgePageDocumentsMetadata(int rank,
+                                        int encSize,
                                         int topology,
                                         int year,
                                         int sets,
@@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize,
                                         byte flags) {
 
 
+    public static final long RANK_MASK = 0xFFL;
+    public static final int RANK_SHIFT = 48;
+
     public static final long ENCSIZE_MASK = 0xFFL;
-    public static final int ENCSIZE_SHIFT = 48;
+    public static final int ENCSIZE_SHIFT = 40;
     public static final int ENCSIZE_MULTIPLIER = 50;
 
     public static final long TOPOLOGY_MASK = 0xFFL;
 
     public static final int TOPOLOGY_SHIFT = 32;
@@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize,
         this(defaultValue());
     }
     public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
-        this(0, topology, year, sets, quality, encodeFlags(flags));
+        this(0, 0, topology, year, sets, quality, encodeFlags(flags));
     }
 
     public EdgePageDocumentsMetadata withSize(int size) {
@@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize,
 
         final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
 
-        return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags);
+        return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
     }
 
     private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize,
     }
 
     public EdgePageDocumentsMetadata(long value) {
-        this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
+        this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
+              (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
              (int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
              (int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
              (int) ((value >>> SETS_SHIFT) & SETS_MASK),
@@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize,
         ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
         ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
         ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT;
+        ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
 
         return ret;
     }
 
     public boolean isEmpty() {
-        return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0;
+        return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
     }
 
     public static int decodeQuality(long encoded) {
@@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize,
         return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK);
     }
 
+    public static int decodeRank(long encoded) {
+        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
+    }
+
+    public static long encodeRank(long encoded, int rank) {
+        return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
+    }
 
 }
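Note on the layout change above: the document-metadata long now reserves its top eight bits (bits 48-55) for the new rank field, which is why ENCSIZE_SHIFT moves from 48 down to 40. A rough illustration of the round trip, using only the helpers visible in this diff (the values are hypothetical, not part of the commit):

    // hypothetical sketch: pack a rank into an existing metadata word and read it back
    long meta = 0L;
    meta = EdgePageDocumentsMetadata.encodeRank(meta, 37);   // rank clamped to 0..255, stored in bits 48..55
    int rank = EdgePageDocumentsMetadata.decodeRank(meta);   // -> 37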
@@ -0,0 +1,4 @@
+package nu.marginalia.wmsa.edge.index.model;
+
+public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
+}
@@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model;
 public enum QueryStrategy {
     SENTENCE,
     TOPIC,
+
+    REQUIRE_FIELD_SITE,
+    REQUIRE_FIELD_TITLE,
+    REQUIRE_FIELD_SUBJECT,
+
     AUTO
 }
@@ -0,0 +1,43 @@
+package nu.marginalia.wmsa.edge.index.postings;
+
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+public class DomainRankings {
+    private final Int2ShortOpenHashMap rankings;
+
+    private final int MAX_MEANINGFUL_RANK = 50_000;
+    private final int MAX_RANK_VALUE = 255;
+    private final int MIN_RANK_VALUE = 1;
+    private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK;
+
+    public DomainRankings() {
+        rankings = new Int2ShortOpenHashMap();
+    }
+    public DomainRankings(Int2IntOpenHashMap values) {
+        rankings = new Int2ShortOpenHashMap(values.size());
+        values.forEach(this::putRanking);
+    }
+
+    private void putRanking(int domainId, int value) {
+        rankings.put(domainId, scaleRank(value));
+    }
+
+    private short scaleRank(int value) {
+        double rankScaled = RANK_SCALING_FACTOR * value;
+        return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled));
+    }
+
+    public int getRanking(int domainId) {
+        return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
+    }
+
+    public int size() {
+        return rankings.size();
+    }
+}
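The new DomainRankings class squeezes a domain's position in the computed ranking (positions up to 50,000 being meaningful) into a single byte, with 1 as the best value, 255 as the worst, and 255 as the default for unknown domains. A hedged usage sketch, where the Int2IntOpenHashMap input is made up for illustration:

    // hypothetical input: domainId -> position in the computed ranking
    Int2IntOpenHashMap positions = new Int2IntOpenHashMap();
    positions.put(42, 100);        // a highly ranked domain
    positions.put(123, 49_000);    // near the tail of the meaningful range

    DomainRankings rankings = new DomainRankings(positions);
    rankings.getRanking(42);       // ~1 after scaling by 255/50_000
    rankings.getRanking(999);      // 255, the default for unknown domains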
@@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
 import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
 import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
+import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
+import nu.marginalia.wmsa.edge.index.query.IndexQueryParams;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@@ -17,6 +19,7 @@ import java.util.Objects;
 public class IndexResultValuator {
     private final IndexMetadataService metadataService;
     private final List<List<String>> searchTermVariants;
+    private final IndexQueryParams queryParams;
     private final int[] termIdsAll;
 
     private final TLongHashSet resultsWithPriorityTerms;
@@ -24,9 +27,10 @@ public class IndexResultValuator {
     private final TObjectIntHashMap<String> termToId = new TObjectIntHashMap<>(10, 0.75f, -1);
     private final TermMetadata termMetadata;
 
-    public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries) {
+    public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries, IndexQueryParams queryParams) {
         this.metadataService = new IndexMetadataService(indexes);
         this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+        this.queryParams = queryParams;
 
         var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader());
         IntArrayList termIdsList = new IntArrayList();
@@ -114,10 +118,15 @@ public class IndexResultValuator {
                     docMetadata,
                     resultsWithPriorityTerms.contains(searchResult.combinedId)
             );
 
             searchResult.scores.add(score);
 
             setScore += score.termValue();
 
+            if (!filterRequired(metadata, queryParams.queryStrategy())) {
+                setScore += 1000;
+            }
+
             if (termIdx == 0) {
                 setScore += score.documentValue();
             }
@@ -130,6 +139,19 @@ public class IndexResultValuator {
         return setScore/setSize;
     }
 
+    private boolean filterRequired(long metadata, QueryStrategy queryStrategy) {
+        if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
+            return EdgePageWordFlags.Site.isPresent(metadata);
+        }
+        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
+            return EdgePageWordFlags.Subjects.isPresent(metadata);
+        }
+        else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
+            return EdgePageWordFlags.Title.isPresent(metadata);
+        }
+        return true;
+    }
+
     private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap<String> termToId, List<String> termList) {
         long maskDirectGenerous = ~0;
         long maskDirectRaw = ~0;
@@ -139,6 +161,9 @@ public class IndexResultValuator {
                 | EdgePageWordFlags.Subjects.asBit()
                 | EdgePageWordFlags.Synthetic.asBit();
 
+        int termCount = 0;
+        double tfIdfSum = 1.;
+
         for (String term : termList) {
             var meta = termMetadata.getTermMetadata(termToId.get(term), urlId);
             long positions;
@@ -156,18 +181,22 @@ public class IndexResultValuator {
                 maskDirectGenerous &= positions;
             }
 
+            termCount++;
+            tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta);
         }
 
+        double avgTfIdf = termCount / tfIdfSum;
+
         if (maskAdjacent == 0) {
-            return 40;
+            return Math.max(-2, 40 - 0.5 * avgTfIdf);
         }
 
         if (maskDirectGenerous == 0) {
-            return 20;
+            return Math.max(-1, 20 - 0.3 * avgTfIdf);
         }
 
         if (maskDirectRaw == 0) {
-            return 2;
+            return Math.max(-1, 15 - 0.2 * avgTfIdf);
        }
 
         return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous);
@@ -92,7 +92,8 @@ public class SearchIndex {
         SearchIndexReader.IndexQueryBuilder query =
                 switch(params.queryStrategy()) {
                     case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
-                    case TOPIC -> indexReader.findWordAsTopic(orderedIncludes);
+                    case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
+                            -> indexReader.findWordAsTopic(orderedIncludes);
                     case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
                 };
 
@@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
 import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
 import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -26,13 +27,14 @@ public class SearchIndexControl {
 
     @Inject
     public SearchIndexControl(IndexServicesFactory servicesFactory,
-                              EdgeOpsLockService opsLockService) {
+                              EdgeOpsLockService opsLockService,
+                              EdgeIndexSearchSetsService searchSetsService) {
         this.servicesFactory = servicesFactory;
 
         this.primaryIndexWriter = servicesFactory.getIndexWriter(0);
         this.secondaryIndexWriter = servicesFactory.getIndexWriter(1);
 
-        index = servicesFactory.createIndexBucket();
+        index = servicesFactory.createIndexBucket(searchSetsService);
         this.opsLockService = opsLockService;
     }
 
@@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
 import com.upserve.uppend.blobs.NativeIO;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import nu.marginalia.util.array.LongArray;
+import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
 import org.roaringbitmap.IntConsumer;
@@ -18,26 +20,26 @@ import java.nio.file.Path;
 import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*;
 
 public class ForwardIndexConverter {
-    private static final int RWF_BIN_SIZE = 10_000_000;
 
-    private final Path tmpFileDir;
     private final File inputFile;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final Path outputFileDocsId;
     private final Path outputFileDocsData;
+    private final DomainRankings domainRankings;
 
 
-    public ForwardIndexConverter(Path tmpFileDir,
+    public ForwardIndexConverter(
                                  File inputFile,
                                  Path outputFileDocsId,
-                                 Path outputFileDocsData
+                                 Path outputFileDocsData,
+                                 DomainRankings domainRankings
                                  ) {
-        this.tmpFileDir = tmpFileDir;
         this.inputFile = inputFile;
         this.outputFileDocsId = outputFileDocsId;
         this.outputFileDocsData = outputFileDocsData;
+        this.domainRankings = domainRankings;
     }
 
     public void convert() throws IOException {
@@ -50,6 +52,8 @@ public class ForwardIndexConverter {
 
         logger.info("Converting {} {}",inputFile, journalReader.fileHeader);
 
+        logger.info("Domain Rankings size = {}", domainRankings.size());
+
         try {
             LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
 
@@ -68,7 +72,10 @@ public class ForwardIndexConverter {
             journalReader.forEach(entry -> {
                 long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId());
 
-                docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta());
+                int ranking = domainRankings.getRanking(entry.domainId());
+                long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
+
+                docFileData.set(entryOffset + METADATA_OFFSET, meta);
                 docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId());
             });
 
@@ -29,20 +29,30 @@ public class ForwardIndexReader {
 
         logger.info("Switching forward index");
 
+        ids = loadIds(idsFile);
+        data = loadData(dataFile);
+    }
+
+    private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
         var idsArray = LongArray.mmapRead(idsFile);
         idsArray.advice(NativeIO.Advice.Sequential);
 
-        ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
+        var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
 
         // This hash table should be of the same size as the number of documents, so typically less than 1 Gb
         idsArray.forEach(0, idsArray.size(), (pos, val) -> {
             ids.put(val, (int) pos);
         });
 
-        data = LongArray.mmapRead(dataFile);
+        return ids;
+    }
+
+    private static LongArray loadData(Path dataFile) throws IOException {
+        var data = LongArray.mmapRead(dataFile);
 
         data.advice(NativeIO.Advice.Random);
 
+        return data;
     }
 
     private int idxForDoc(long docId) {
@@ -55,6 +65,7 @@ public class ForwardIndexReader {
 
         return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
     }
 
     public int getDomainId(long docId) {
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
@@ -16,7 +16,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
 
     @Override
     public boolean test(long docId) {
-        var post = forwardIndexReader.docPost(docId);
+        var post = forwardIndexReader.docPost(docId & 0xFFFF_FFFFL);
 
         if (!validateDomain(post)) {
             return false;
@@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         if (!validateSize(post)) {
             return false;
         }
 
+        if (!validateRank(post)) {
+            return false;
+        }
+
         return true;
     }
 
@@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
 
         return limit.test(quality);
     }
 
     private boolean validateYear(ForwardIndexReader.DocPost post) {
         if (params.year().type() == SpecificationLimitType.NONE)
             return true;
@@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
         return params.size().test(postVal);
     }
 
+    private boolean validateRank(ForwardIndexReader.DocPost post) {
+        if (params.rank().type() == SpecificationLimitType.NONE)
+            return true;
+
+        int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
+
+        return params.rank().test(postVal);
+    }
+
     @Override
     public double cost() {
         return 32;
@@ -8,6 +8,7 @@ import nu.marginalia.util.array.functional.LongBinaryIOOperation;
 import nu.marginalia.util.array.functional.LongIOTransformer;
 import nu.marginalia.util.array.functional.LongTransformer;
 import nu.marginalia.util.btree.BTreeWriter;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
 import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
 import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
 import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
@@ -32,18 +33,22 @@ public class ReverseIndexConverter {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     private final SearchIndexJournalReaderSingleFile journalReader;
+    private final DomainRankings domainRankings;
     private final Path outputFileWords;
     private final Path outputFileDocs;
+    private final SortingContext sortingContext;
 
     public ReverseIndexConverter(Path tmpFileDir,
                                  SearchIndexJournalReaderSingleFile journalReader,
+                                 DomainRankings domainRankings,
                                  Path outputFileWords,
                                  Path outputFileDocs) {
         this.tmpFileDir = tmpFileDir;
         this.journalReader = journalReader;
+        this.domainRankings = domainRankings;
         this.outputFileWords = outputFileWords;
         this.outputFileDocs = outputFileDocs;
+        this.sortingContext = new SortingContext(tmpFileDir, 64_000);
     }
 
     public void convert() throws IOException {
@@ -56,7 +61,7 @@ public class ReverseIndexConverter {
         final SearchIndexJournalStatistics statistics = journalReader.getStatistics();
 
         final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
-        SortingContext sortingContext = new SortingContext(tmpFileDir, 64_000);
 
         try {
             final long wordsFileSize = statistics.highestWord() + 1;
@@ -187,7 +192,7 @@ public class ReverseIndexConverter {
         }
     }
 
-    private static class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record>, AutoCloseable {
+    private class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record>, AutoCloseable {
 
         private final LongArray wordRangeEnds;
         private final IntArray wordRangeOffset;
@@ -205,12 +210,26 @@ public class ReverseIndexConverter {
 
         @Override
         public void accept(long docId, SearchIndexJournalEntry.Record record) {
-            final long urlId = docId & 0xFFFF_FFFFL;
-            final int wordId = record.wordId();
 
+            /* Encode the ID as
+             *
+             *   32 bits      32 bits
+             * [ ranking |   url-id    ]
+             *
+             * in order to get low-ranking documents to be considered first
+             * when sorting the items.
+             */
+
+            int domainId = (int) (docId >>> 32);
+            long rankingId = (long) domainRankings.getRanking(domainId) << 32;
+
+            int urlId = (int) (docId & 0xFFFF_FFFFL);
+            long rankEncodedId = rankingId | urlId;
+
+            final int wordId = record.wordId();
             long offset = startOfRange(wordId);
 
-            documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), urlId);
+            documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
            documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata());
 
         }
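The comment block added to accept() above is the core of this change: by putting the domain's rank in the upper 32 bits of the stored document identifier, a plain ascending sort of the posting list brings documents from well-ranked domains (low rank values) to the front. A minimal sketch of the packing, with hypothetical values (not code from this commit):

    // hypothetical illustration of the rank-encoded identifier
    int ranking = 3;                 // low value = well-ranked domain
    int urlId   = 0x0000_BEEF;
    long rankEncodedId = ((long) ranking << 32) | urlId;
    // an ascending sort on rankEncodedId orders by domain rank first, url id second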
@@ -47,18 +47,6 @@ public class ReverseIndexPrioReader {
         return new ReverseIndexEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER);
     }
 
-    public int numDocuments(int wordId) {
-        if (wordId < 0)
-            return 0;
-
-        long offset = words.get(wordId);
-
-        if (offset < 0)
-            return 0;
-
-        return createReaderNew(offset).numEntries();
-    }
-
     private BTreeReader createReaderNew(long offset) {
         return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset);
     }
@@ -53,6 +53,11 @@ public class ReverseIndexReader {
     }
 
     public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
+        if (null == words) {
+            logger.warn("Reverse index is not ready, dropping query");
+            return new EmptyEntrySource();
+        }
+
         if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
 
         long offset = words.get(wordId);
@@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
 public record IndexQueryParams(SpecificationLimit qualityLimit,
                                SpecificationLimit year,
                                SpecificationLimit size,
+                               SpecificationLimit rank,
                                SearchSet searchSet,
                                QueryStrategy queryStrategy
                                )
@@ -1,21 +1,21 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
-import gnu.trove.list.TIntList;
 import gnu.trove.list.array.TIntArrayList;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TIntObjectHashMap;
 import it.unimi.dsi.fastutil.ints.IntArrays;
-import it.unimi.dsi.fastutil.ints.IntComparator;
-import org.roaringbitmap.RoaringBitmap;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.util.Arrays;
-import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.function.IntToDoubleFunction;
-import java.util.stream.IntStream;
+import java.util.function.Supplier;
+
+import static java.lang.Math.min;
 
 public abstract class RankingAlgorithm {
     protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
@@ -85,6 +85,10 @@ public abstract class RankingAlgorithm {
         logger.info("Origin Domains: {}", originDomainIds.size());
     }
 
+    public RankingDomainData getDomainData(int id) {
+        return domainsById.get(id);
+    }
+
     public void addPeripheralNodes() {
 
         int newNodesIdxCutoff = domainIdToIndex.size();
@@ -133,29 +137,7 @@ public abstract class RankingAlgorithm {
         return domainsById.size();
     }
 
-    public RankVector pageRankVector() {
-        RankVector rank = new RankVector(1.d / domainsById.size());
-
-        int iter_max = 100;
-        for (int i = 0; i < iter_max; i++) {
-            RankVector newRank = createNewRankVector(rank);
-
-            double oldNorm = rank.norm();
-            double newNorm = newRank.norm();
-            double dNorm = oldNorm - newNorm ;
-            if (i < iter_max-1) {
-                adjustRankVector(newRank, dNorm, oldNorm);
-            }
-
-            rank = newRank;
-        }
-
-        return rank;
-    }
-
-
-    public RoaringBitmap pageRank(int resultCount) {
+    public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
         RankVector rank = new RankVector(1.d / domainsById.size());
 
         int iter_max = 100;
@@ -174,10 +156,10 @@ public abstract class RankingAlgorithm {
         }
 
 
-        return rank.getRanking(resultCount);
+        return rank.getRanking(resultCount, accumulatorP).get();
     }
 
-    public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
+    public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
         RankVector rank = new RankVector(1.d / domainsById.size());
 
         int iter_max = 100;
@@ -201,32 +183,11 @@ public abstract class RankingAlgorithm {
 
         logger.info("PRWPN iteration done");
 
-        return rank.getRanking(resultCount);
+        return rank.getRanking(resultCount, accumulatorP).get();
     }
 
     abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
 
-    public TIntList pageRank(IntToDoubleFunction weight, int resultCount) {
-        RankVector rank = new RankVector(1.d / domainsById.size());
-
-        int iter_max = 100;
-        for (int i = 0; i < iter_max; i++) {
-            RankVector newRank = createNewRankVector(rank);
-
-            double oldNorm = rank.norm();
-            double newNorm = newRank.norm();
-            double dNorm = oldNorm - newNorm ;
-
-            if (i < iter_max-1) {
-                adjustRankVector(newRank, dNorm, oldNorm);
-            }
-
-            rank = newRank;
-        }
-
-        return rank.getRanking(weight, resultCount);
-    }
-
     abstract RankVector createNewRankVector(RankVector rank);
 
     public boolean includeInRanking(RankingDomainData data) {
@@ -245,9 +206,9 @@ public abstract class RankingAlgorithm {
     public void setMaxKnownUrls(int maxKnownUrls) {
         this.maxKnownUrls = maxKnownUrls;
     }
 
     public class RankVector {
         private final double[] rank;
 
         public RankVector(double defaultValue) {
             rank = new double[domainIndexToId.size()];
             if (defaultValue != 0.) {
@@ -271,9 +232,8 @@ public abstract class RankingAlgorithm {
 
         public double norm() {
             double v = 0.;
-            for (int i = 0; i < rank.length; i++) {
-                if (rank[i] > 0) { v+=rank[i]; }
-                else { v -= rank[i]; }
+            for (double value : rank) {
+                v += Math.abs(value);
             }
             return v;
         }
@@ -281,74 +241,39 @@ public abstract class RankingAlgorithm {
         public double norm(RankVector other) {
             double v = 0.;
             for (int i = 0; i < rank.length; i++) {
-                double dv = rank[i] - other.get(i);
-
-                if (dv > 0) { v+=dv; }
-                else { v -= dv; }
+                v += Math.abs(rank[i] - other.get(i));
             }
             return v;
         }
 
-        public TIntList getRanking(IntToDoubleFunction other, int numResults) {
-            TIntArrayList list = new TIntArrayList(numResults);
-
-            Comparator<Integer> comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i]));
-
-            IntStream.range(0, rank.length)
-                    .boxed()
-                    .sorted(comparator.reversed())
-                    .map(domainIndexToId::get)
-                    .limit(numResults)
-                    .forEach(list::add);
-
-            return list;
-        }
-
-        public RoaringBitmap getRanking(int numResults) {
+        public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
             if (numResults < 0) {
                 numResults = domainIdToIndex.size();
             }
-            if (numResults >= rank.length) {
-                numResults = rank.length;
-            }
+            numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
 
-            RoaringBitmap list = new RoaringBitmap();
+            int[] nodes = sortOrder(rank);
+            var accumulator = accumulatorP.get();
 
-            int[] nodes = new int[rank.length];
-            Arrays.setAll(nodes, i->i);
-            IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
-            IntArrays.quickSort(nodes, comp);
-
-            int i;
-
-            for (i = 0; i < numResults; i++) {
+            for (int i = 0; i < numResults; i++) {
                 int id = domainIndexToId.get(nodes[i]);
 
                 if (includeInRanking(domainsById.get(id)))
-                    list.add(id);
+                    accumulator.add(id, i);
             }
 
-            for (; i < nodes.length && domainsById.size() < numResults; i++) {
-                int id = domainIndexToId.get(nodes[i]);
-
-                if (includeInRanking(domainsById.get(id)))
-                    list.add(id);
-            }
-
-            return list;
+            return accumulator;
         }
 
+        private static int[] sortOrder(double[] values) {
+            int[] ret = new int[values.length];
+            Arrays.setAll(ret, i->i);
+            IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
+
+            return ret;
+        }
 
-        public void incrementAll(double v) {
-            for (int i = 0; i < rank.length; i++) {
-                rank[i]+=v;
-            }
-        }
-
-        int size() {
-            return domainsById.size();
-        }
     }
 
 }
@@ -1,10 +1,12 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 
-public class BetterReversePageRank extends RankingAlgorithm {
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
+
+public class ReversePageRank extends RankingAlgorithm {
 
 
-    public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
+    public ReversePageRank(RankingDomainFetcher domains, String... origins) {
         super(domains, origins);
     }
 
@@ -20,8 +22,6 @@ public class BetterReversePageRank extends RankingAlgorithm {
         double newRankValue = 0;
 
         if (links != null && links.size() > 0) {
-
-
             for (int j = 0; j < links.size(); j++) {
                 var revLinks = linkDataDest2Src[links.getQuick(j)];
                 newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
@@ -1,9 +1,11 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking;
 
 
-public class BetterStandardPageRank extends RankingAlgorithm {
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
 
-    public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
+public class StandardPageRank extends RankingAlgorithm {
+
+    public StandardPageRank(RankingDomainFetcher domains, String... origins) {
         super(domains, origins);
     }
 
@@ -38,8 +40,7 @@ public class BetterStandardPageRank extends RankingAlgorithm {
 
     @Override
     void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
-        originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ ));
-        // vector.incrementAll(0.14*dNorm/vector.size());
+        originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
     }
 
 }
@@ -0,0 +1,6 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+public interface RankingResultAccumulator<T> {
+    void add(int domainId, int rank);
+    T get();
+}
@@ -0,0 +1,17 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import org.roaringbitmap.RoaringBitmap;
+
+public class RankingResultBitSetAccumulator implements RankingResultAccumulator<RoaringBitmap> {
+    private final RoaringBitmap result = new RoaringBitmap();
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public RoaringBitmap get() {
+        return result;
+    }
+}
@@ -0,0 +1,21 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
+
+public class RankingResultHashMapAccumulator implements RankingResultAccumulator<Int2IntOpenHashMap> {
+    private final Int2IntOpenHashMap result;
+
+    public RankingResultHashMapAccumulator(int size) {
+        result = new Int2IntOpenHashMap(size);
+    }
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.put(domainId, rank);
+    }
+
+    @Override
+    public Int2IntOpenHashMap get() {
+        return result;
+    }
+}
@@ -0,0 +1,24 @@
+package nu.marginalia.wmsa.edge.index.ranking.accumulator;
+
+import gnu.trove.list.array.TIntArrayList;
+
+public class RankingResultListAccumulator implements RankingResultAccumulator<TIntArrayList> {
+    private final TIntArrayList result;
+
+    public RankingResultListAccumulator(int size) {
+        result = new TIntArrayList(size);
+    }
+    public RankingResultListAccumulator() {
+        result = new TIntArrayList(10_000);
+    }
+
+    @Override
+    public void add(int domainId, int rank) {
+        result.add(domainId);
+    }
+
+    @Override
+    public TIntArrayList get() {
+        return result;
+    }
+}
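Taken together, the accumulator types above let callers of RankingAlgorithm.pageRank() choose the shape of the result instead of always receiving a RoaringBitmap. A hedged usage sketch (the rankingAlgorithm variable and the result count are stand-ins for illustration, not code from this commit):

    // collect the top 50 000 domains as domainId -> position, ready to feed DomainRankings
    Int2IntOpenHashMap byPosition =
            rankingAlgorithm.pageRankWithPeripheralNodes(50_000, () -> new RankingResultHashMapAccumulator(50_000));
    DomainRankings rankings = new DomainRankings(byPosition);

    // or collect them as a bitmap, e.g. for a search set
    RoaringBitmap asBitmap =
            rankingAlgorithm.pageRank(50_000, RankingResultBitSetAccumulator::new);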
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking;
+package nu.marginalia.wmsa.edge.index.ranking.data;
 
 import lombok.AllArgsConstructor;
 import lombok.Data;
@@ -10,7 +10,7 @@ public class RankingDomainData {
     public final int id;
     public final String name;
     private int alias;
-    private EdgeDomainIndexingState state;
+    public EdgeDomainIndexingState state;
     public final int knownUrls;
 
     public int resolveAlias() {
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.util.ranking;
|
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
import com.zaxxer.hikari.HikariDataSource;
|
import com.zaxxer.hikari.HikariDataSource;
|
||||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
@ -11,12 +12,13 @@ import java.sql.SQLException;
|
|||||||
import java.util.function.Consumer;
|
import java.util.function.Consumer;
|
||||||
import java.util.function.IntConsumer;
|
import java.util.function.IntConsumer;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
public class RankingDomainFetcher {
|
public class RankingDomainFetcher {
|
||||||
private final HikariDataSource dataSource;
|
protected final HikariDataSource dataSource;
|
||||||
private final EdgeDomainBlacklistImpl blacklist;
|
protected final EdgeDomainBlacklistImpl blacklist;
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
protected final Logger logger = LoggerFactory.getLogger(getClass());
|
||||||
|
|
||||||
private final boolean getNames = false;
|
protected boolean getNames = false;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||||
@ -24,6 +26,10 @@ public class RankingDomainFetcher {
|
|||||||
this.blacklist = blacklist;
|
this.blacklist = blacklist;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void retainNames() {
|
||||||
|
this.getNames = true;
|
||||||
|
}
|
||||||
|
|
||||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||||
String query;
|
String query;
|
||||||
if (getNames) {
|
if (getNames) {
|
||||||
@ -49,14 +55,19 @@ public class RankingDomainFetcher {
|
|||||||
getDomains(query, consumer);
|
getDomains(query, consumer);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||||
stmt.setFetchSize(10000);
|
stmt.setFetchSize(10000);
|
||||||
var rsp = stmt.executeQuery();
|
var rsp = stmt.executeQuery();
|
||||||
while (rsp.next()) {
|
while (rsp.next()) {
|
||||||
int id = rsp.getInt(1);
|
int id = rsp.getInt(1);
|
||||||
if (!blacklist.isBlacklisted(id)) {
|
if (!blacklist.isBlacklisted(id)) {
|
||||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
|
consumer.accept(
|
||||||
|
new RankingDomainData(id,
|
||||||
|
rsp.getString(2),
|
||||||
|
rsp.getInt(3),
|
||||||
|
EdgeDomainIndexingState.valueOf(rsp.getString(4)),
|
||||||
|
rsp.getInt(5)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
@@ -0,0 +1,103 @@
+package nu.marginalia.wmsa.edge.index.ranking.data;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
+import org.slf4j.LoggerFactory;
+
+import java.sql.SQLException;
+import java.util.function.Consumer;
+
+@Singleton
+public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
+    final boolean hasData;
+
+    @Inject
+    public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
+        super(dataSource, blacklist);
+
+        hasData = isDomainNeighborTablePopulated(dataSource);
+    }
+
+    private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.createStatement();
+             var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
+
+            return rs.next();
+        }
+        catch (SQLException ex) {
+            LoggerFactory
+                    .getLogger(RankingDomainFetcherForSimilarityData.class)
+                    .error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
+            return false;
+        }
+    }
+    public boolean hasData() {
+        return hasData;
+    }
+
+    public void eachDomainLink(DomainLinkConsumer consumer) {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
+        {
+            stmt.setFetchSize(10000);
+
+            var rsp = stmt.executeQuery();
+
+            while (rsp.next()) {
+                int src = rsp.getInt(1);
+                int dst = rsp.getInt(2);
+
+                // these "links" are bidi
+                consumer.accept(src, dst);
+                consumer.accept(dst, src);
+            }
+        }
+        catch (SQLException ex) {
+            logger.error("Failed to fetch domain links", ex);
+        }
+    }
+
+    public void getDomains(Consumer<RankingDomainData> consumer) {
+//        String query =
+//                """
+//                SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
+//                FROM EC_DOMAIN
+//                LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+//                INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
+//                WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID
+//                GROUP BY EC_DOMAIN.ID
+//                HAVING COUNT(SOURCE_DOMAIN_ID)>5
+//                """;
+
+        String query;
+        if (getNames) {
+            query =
+                    """
+                    SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
+                    FROM EC_DOMAIN
+                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+                    GROUP BY EC_DOMAIN.ID
+                    """;
+        }
+        else {
+            query =
+                    """
+                    SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
+                    FROM EC_DOMAIN
+                    INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+                    GROUP BY EC_DOMAIN.ID
+                    """;
+        }
+
+        getDomains(query, consumer);
+    }
+
+
+    public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
+        // This is not relevant for this variant of pagerank since it is bidirectional
+    }
+
+}
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.old;
+package nu.marginalia.wmsa.edge.index.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.old;
+package nu.marginalia.wmsa.edge.index.ranking.old;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -125,7 +125,6 @@ public class StandardPageRank {
 
         final TIntArrayList empty = new TIntArrayList();
 
-        double rankNorm = rank.norm();
         RankVector newRank = new RankVector(0);
 
         for (DomainData domain : domains.valueCollection()) {
@@ -176,8 +175,6 @@ public class StandardPageRank {
                 }
             });
         }
 
-        TIntHashSet deadEnds = new TIntHashSet(domains.size());
-
     }
 
     private class RankVector {
@@ -1,43 +1,30 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BuggyStandardPageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.sql.SQLException;
-import java.util.HashSet;
-import java.util.Set;
 import java.util.concurrent.LinkedBlockingQueue;
 
-public class UpdateDomainRanksTool {
+public class CreateBrowseDomainRanksTool {
 
-    private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
+    private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class);
 
-    public Set<String> originDomains = new HashSet<>();
-    public Set<Integer> originDomainIds = new HashSet<>();
-    public final long domainIdMax = -1;
-    public int domainCount;
-    private volatile static int rankMax;
-
-    public int maxId() {
-        return (int) domainIdMax;
-    }
-    public int domainCount() {
-        return domainCount;
-    }
-
     static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
     volatile static boolean running = true;
 
     @SneakyThrows
     public static void main(String... args) {
-        org.mariadb.jdbc.Driver driver = new Driver();
+        Driver driver = new Driver();
        var conn = new DatabaseModule().provideConnection();
 
        long start = System.currentTimeMillis();
@@ -45,20 +32,21 @@ public class UpdateDomainRanksTool {
 
         logger.info("Ranking");
         var ds = new DatabaseModule().provideConnection();
-        var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
+        var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
-        var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
+        var rpr = new StandardPageRank(domains, args);
 
-        rankMax = spr.size()*2;
         uploader.start();
 
-        var rankData = spr.pageRankWithPeripheralNodes(rankMax);
+        var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);
-        for (int i : rankData) {
+        rankData.forEach(i -> {
             try {
                 uploadQueue.put(i);
             } catch (InterruptedException e) {
                 e.printStackTrace();
             }
-        }
+            return true;
+        });
 
         long end = System.currentTimeMillis();
         running = false;
@@ -68,24 +56,14 @@ public class UpdateDomainRanksTool {
     }
 
     public static void uploadThread(HikariDataSource dataSource) {
-        int i = 0;
-
         try (var conn = dataSource.getConnection()) {
-            logger.info("Resetting rank");
-            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
-                stmt.executeUpdate();
-            }
-
-            logger.info("Updating ranks");
-            try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
+            try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) {
                 while (running || (!running && !uploadQueue.isEmpty())) {
                     var job = uploadQueue.take();
-                    stmt.setDouble(1, i++ / (double) rankMax);
-                    stmt.setInt(2, job);
+                    stmt.setInt(1, job);
                     stmt.executeUpdate();
                 }
             }
 
         } catch (SQLException | InterruptedException throwables) {
             throwables.printStackTrace();
         }
@@ -1,4 +1,4 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 
 import com.zaxxer.hikari.HikariDataSource;
@@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays;
 import it.unimi.dsi.fastutil.ints.IntComparator;
 import lombok.AllArgsConstructor;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.RankingAlgorithm;
-import nu.marginalia.util.ranking.RankingDomainData;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
 import org.jetbrains.annotations.NotNull;
@@ -33,8 +33,6 @@ public class PerusePageRankV2 {
     TIntArrayList[] linkDataSrc2Dest;
     TIntArrayList[] linkDataDest2Src;
 
-    private static final boolean getNames = true;
-
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
     static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);
@@ -0,0 +1,67 @@
+package nu.marginalia.wmsa.edge.index.ranking.tool;
+
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.configuration.module.DatabaseModule;
+import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
+import org.mariadb.jdbc.Driver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class PrintDomainRanksTool {
+
+    private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class);
+
+    private volatile static int rankMax;
+
+    static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
+    volatile static boolean running = true;
+
+    @SneakyThrows
+    public static void main(String... args) {
+        Driver driver = new Driver();
+        var conn = new DatabaseModule().provideConnection();
+
+        long start = System.currentTimeMillis();
+
+        logger.info("Ranking");
+        var ds = new DatabaseModule().provideConnection();
+
+        RankingDomainFetcher domains;
+        if (Boolean.getBoolean("use-link-data")) {
+            domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
+            domains.retainNames();
+        }
+        else {
+            domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
+            domains.retainNames();
+        }
+
+        var rpr = new StandardPageRank(domains, args);
+
+        rankMax = rpr.size();
+
+        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
+
+        AtomicInteger cnt = new AtomicInteger();
+        rankData.forEach(i -> {
+
+            var data = rpr.getDomainData(i);
+
+            System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state);
+            return true;
+        });
+
+        long end = System.currentTimeMillis();
+        running = false;
+
+        logger.info("Done in {}", (end - start)/1000.0);
+    }
+
+}
@@ -1,11 +1,12 @@
-package nu.marginalia.util.ranking.tool;
+package nu.marginalia.wmsa.edge.index.ranking.tool;
 
 import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BetterReversePageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
 import org.mariadb.jdbc.Driver;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -13,12 +14,10 @@ import org.slf4j.LoggerFactory;
 import java.sql.SQLException;
 import java.util.concurrent.LinkedBlockingQueue;
 
-public class UpdateDomainRanksTool2 {
+public class UpdateDomainRanksTool {
 
-    private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
+    private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
 
-    public final long domainIdMax = -1;
-    public int domainCount;
     private volatile static int rankMax;
 
     static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
@@ -34,21 +33,22 @@ public class UpdateDomainRanksTool2 {
 
         logger.info("Ranking");
         var ds = new DatabaseModule().provideConnection();
-        var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
+        var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
-        var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
+        var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
 
-        var rankVector = rpr.pageRankVector();
         rankMax = rpr.size();
         uploader.start();
 
-        var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
+        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
-        for (int i : rankData) {
+        rankData.forEach(i -> {
             try {
                 uploadQueue.put(i);
             } catch (InterruptedException e) {
                 e.printStackTrace();
             }
-        }
+            return true;
+        });
 
         long end = System.currentTimeMillis();
         running = false;
@@ -5,7 +5,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import io.prometheus.client.Histogram;
 import nu.marginalia.util.array.buffer.LongQueryBuffer;
-import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
 import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator;
@@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService {
 
     private OptionalInt lookUpWord(String s) {
         int ret = indexes.getLexiconReader().get(s);
-        if (ret == DictionaryHashMap.NO_VALUE) {
+        if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
             return OptionalInt.empty();
         }
         return OptionalInt.of(ret);
@@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import com.google.protobuf.InvalidProtocolBufferException;
-import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
@@ -51,7 +51,7 @@ public class EdgeIndexLexiconService {
 
         final int wordId = lr.get(word);
 
-        if (DictionaryHashMap.NO_VALUE == wordId) {
+        if (OffHeapDictionaryHashMap.NO_VALUE == wordId) {
             response.status(404);
             return "";
         }
@@ -110,7 +110,7 @@ public class EdgeIndexLexiconService {
             String word = words[i];
 
             long id = keywordLexicon.getOrInsert(word);
-            if (id != DictionaryHashMap.NO_VALUE) {
+            if (id != OffHeapDictionaryHashMap.NO_VALUE) {
                 ids[putIdx++] = id;
                 ids[putIdx++] = meta[i];
             }
@@ -12,7 +12,7 @@ import io.prometheus.client.Histogram;
 import it.unimi.dsi.fastutil.ints.IntArrayList;
 import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.util.array.buffer.LongQueryBuffer;
-import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
 import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms;
 import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator;
@@ -115,11 +115,13 @@ public class EdgeIndexQueryService {
         TLongHashSet consideredUrlIds;
 
         public SearchQuery(EdgeSearchSpecification specsSet) {
-            this.fetchSize = specsSet.fetchSize;
-            this.budget = new IndexSearchBudget(specsSet.timeoutMs);
+            var limits = specsSet.queryLimits;
+
+            this.fetchSize = limits.fetchSize();
+            this.budget = new IndexSearchBudget(limits.timeoutMs());
             this.subqueries = specsSet.subqueries;
-            this.limitByDomain = specsSet.limitByDomain;
-            this.limitTotal = specsSet.limitTotal;
+            this.limitByDomain = limits.resultsByDomain();
+            this.limitTotal = limits.resultsTotal();
 
             this.consideredUrlIds = new TLongHashSet(fetchSize * 4);
 
@@ -127,6 +129,7 @@ public class EdgeIndexQueryService {
                     specsSet.quality,
                     specsSet.year,
                     specsSet.size,
+                    specsSet.rank,
                     getSearchSet(specsSet),
                     specsSet.queryStrategy);
         }
@@ -151,7 +154,7 @@ public class EdgeIndexQueryService {
             }
         }
 
-        final var evaluator = new IndexResultValuator(indexes, results, subqueries);
+        final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams);
 
         ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
         ArrayList<EdgeSearchResultItem> refusedItems = new ArrayList<>(results.size());
@@ -293,7 +296,7 @@ public class EdgeIndexQueryService {
 
     private OptionalInt lookUpWord(String s) {
         int ret = indexes.getLexiconReader().get(s);
-        if (ret == DictionaryHashMap.NO_VALUE) {
+        if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
             return OptionalInt.empty();
         }
         return OptionalInt.of(ret);
@@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.svc;
 
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import com.zaxxer.hikari.HikariDataSource;
-import gnu.trove.list.TIntList;
-import gnu.trove.list.array.TIntArrayList;
 import lombok.SneakyThrows;
-import nu.marginalia.util.ranking.BetterReversePageRank;
-import nu.marginalia.util.ranking.BetterStandardPageRank;
-import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
+import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
+import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
 import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
 import nu.marginalia.wmsa.edge.index.model.RankingSettings;
+import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
+import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
 import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
-import org.roaringbitmap.RoaringBitmap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -23,137 +23,47 @@ import java.io.IOException;
 
 @Singleton
 public class EdgeIndexSearchSetsService {
-    private final HikariDataSource dataSource;
-    private RankingDomainFetcher rankingDomains;
-    private final RankingSettings rankingSettings;
     private final Logger logger = LoggerFactory.getLogger(getClass());
+    private final RankingDomainFetcher rankingDomains;
+    private final RankingDomainFetcher similarityDomains;
+    private final RankingSettings rankingSettings;
 
-    private final SearchSet anySet = new SearchSetAny();
+    // Below are binary indices that are used to constrain a search
     private volatile RankingSearchSet retroSet;
     private volatile RankingSearchSet smallWebSet;
     private volatile RankingSearchSet academiaSet;
+    private final SearchSet anySet = new SearchSetAny();
 
+    // The ranking value of the domains used in sorting the domains
+    private volatile DomainRankings domainRankings = new DomainRankings();
 
     @Inject
-    public EdgeIndexSearchSetsService(HikariDataSource dataSource,
-                                      RankingDomainFetcher rankingDomains,
+    public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
+                                      RankingDomainFetcherForSimilarityData similarityDomains,
                                       RankingSettings rankingSettings,
                                       IndexServicesFactory servicesFactory) throws IOException {
-        this.dataSource = dataSource;
         this.rankingDomains = rankingDomains;
 
+        if (similarityDomains.hasData()) {
+            this.similarityDomains = similarityDomains;
+        }
+        else {
+            // on test environments the cosine similarity graph may not be present
+            logger.info("Domain similarity is not present, falling back on link graph");
+            this.similarityDomains = rankingDomains;
+        }
+
         this.rankingSettings = rankingSettings;
 
         smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
         academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
         retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
 
-        logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
     }
 
-    public void recalculateAll() {
-        updateAcademiaDomains();
-        updateRetroDomains();
-        updateSmallWebDomains();
-    }
-
-    @SneakyThrows
-    public RoaringBitmap goodUrls() {
-        RoaringBitmap domains = new RoaringBitmap();
-        RoaringBitmap urls = new RoaringBitmap();
-
-        try (var connection = dataSource.getConnection()) {
-            try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
-                stmt.setFetchSize(10_000);
-                var rsp = stmt.executeQuery();
-                while (rsp.next()) {
-                    domains.add(rsp.getInt(1));
-                }
-            }
-
-            // For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL
-            try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) {
-                stmt.setFetchSize(10_000);
-                var rsp = stmt.executeQuery();
-                while (rsp.next()) {
-                    if (domains.contains(rsp.getInt(2))) {
-                        urls.add(rsp.getInt(1));
-                    }
-                }
-            }
-
-        }
-
-        return urls;
-    }
-
-    @SneakyThrows
-    public void updateRetroDomains() {
-        var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(spr.size() / 2);
-
-        synchronized (this) {
-            retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
-            retroSet.write();
-        }
-    }
-
-    @SneakyThrows
-    public void updateSmallWebDomains() {
-        var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
-        rpr.setMaxKnownUrls(750);
-        var data = rpr.pageRankWithPeripheralNodes(rpr.size());
-
-        synchronized (this) {
-            smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
-            smallWebSet.write();
-        }
-    }
-
-    @SneakyThrows
-    public void updateAcademiaDomains() {
-        var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
-        var data = spr.pageRankWithPeripheralNodes(spr.size()/2);
-
-        synchronized (this) {
-            academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
-            academiaSet.write();
-        }
-    }
-
-    @SneakyThrows
-    public TIntList getStandardDomains() {
-        TIntArrayList results = new TIntArrayList();
-
-        try (var connection = dataSource.getConnection();
-             var stmt = connection.prepareStatement(
-                     """
-                     SELECT ID FROM EC_DOMAIN
-                     WHERE INDEXED>0
-                     AND STATE='ACTIVE'
-                     AND DOMAIN_ALIAS IS NULL
-                     ORDER BY ID ASC
-                     """);
-        ) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                results.add(rs.getInt(1));
-            }
-        }
-        return results;
-
-    }
-
-    @SneakyThrows
-    public TIntList getSpecialDomains() {
-        TIntArrayList results = new TIntArrayList();
-        try (var connection = dataSource.getConnection();
-             var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
-        ) {
-            var rs = stmt.executeQuery();
-            while (rs.next()) {
-                results.add(rs.getInt(1));
-            }
-        }
-        return results;
+    public DomainRankings getDomainRankings() {
+        return domainRankings;
     }
 
     public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {
@@ -167,4 +77,54 @@ public class EdgeIndexSearchSetsService {
             case SMALLWEB -> smallWebSet;
         };
     }
 
+    public void recalculateAll() {
+        updateAcademiaDomainsSet();
+        updateRetroDomainsSet();
+        updateSmallWebDomainsSet();
+        updateDomainRankings();
+    }
+
+    private void updateDomainRankings() {
+        var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
+
+        var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
+        synchronized (this) {
+            domainRankings = new DomainRankings(ranks);
+        }
+    }
+
+    @SneakyThrows
+    public void updateRetroDomainsSet() {
+        var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
+        var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
+
+        synchronized (this) {
+            retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
+            retroSet.write();
+        }
+    }
+
+    @SneakyThrows
+    public void updateSmallWebDomainsSet() {
+        var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new));
+        rpr.setMaxKnownUrls(750);
+        var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
+
+        synchronized (this) {
+            smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
+            smallWebSet.write();
+        }
+    }
+
+    @SneakyThrows
+    public void updateAcademiaDomainsSet() {
+        var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new));
+        var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
+
+        synchronized (this) {
+            academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
+            academiaSet.write();
+        }
+    }
 }
@@ -9,21 +9,37 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 
+/** A serializable bit map of domains
+ *
+ * @see SearchSetIdentifier
+ *
+ * */
 public class RankingSearchSet implements SearchSet {
 
     private final RoaringBitmap set;
     public final SearchSetIdentifier identifier;
     public final Path source;
 
+    public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
+        this.identifier = identifier;
+        this.source = source;
+        this.set = set;
+    }
+
     public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException {
         this.identifier = identifier;
         this.source = source;
-        set = new RoaringBitmap();
 
         if (!Files.exists(source)) {
-            return;
+            set = new RoaringBitmap();
         }
+        else {
+            set = load(source);
+        }
+    }
+
+    private static RoaringBitmap load(Path source) throws IOException {
+        var set = new RoaringBitmap();
         try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
             for (;;) {
                 try {
@@ -32,12 +48,7 @@ public class RankingSearchSet implements SearchSet {
                 catch (IOException ex) { break; }
             }
         }
-    }
+        return set;
 
-    public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
-        this.identifier = identifier;
-        this.source = source;
-        this.set = set;
     }
 
     @Override
@@ -46,7 +57,11 @@ public class RankingSearchSet implements SearchSet {
     }
 
     public void write() throws IOException {
-        try (var ds = new DataOutputStream(Files.newOutputStream(source, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
+        try (var ds = new DataOutputStream(Files.newOutputStream(source,
+                StandardOpenOption.WRITE,
+                StandardOpenOption.CREATE,
+                StandardOpenOption.TRUNCATE_EXISTING)))
+        {
             for (var iter = set.getIntIterator(); iter.hasNext();) {
                 ds.writeInt(iter.next());
             }
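As a usage illustration, not part of the patch: the new three-argument constructor is what the recalculation code in EdgeIndexSearchSetsService uses to persist a freshly computed set, while the two-argument constructor reloads it at startup and now tolerates a missing file by starting from an empty bitmap. A minimal round-trip sketch, with a made-up path, assuming it lives in the same package as RankingSearchSet:

    // Illustrative only: persist a set of domain ids and read it back.
    import org.roaringbitmap.RoaringBitmap;
    import java.io.IOException;
    import java.nio.file.Path;

    class RankingSearchSetRoundTrip {
        static void demo() throws IOException {
            Path file = Path.of("/tmp/retro.dat");   // hypothetical location

            RoaringBitmap domainIds = new RoaringBitmap();
            domainIds.add(1);
            domainIds.add(42);

            // persist the set ...
            new RankingSearchSet(SearchSetIdentifier.RETRO, file, domainIds).write();

            // ... and read it back; if the file were missing, the set would simply start out empty
            RankingSearchSet reloaded = new RankingSearchSet(SearchSetIdentifier.RETRO, file);
        }
    }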
@@ -1,5 +1,12 @@
 package nu.marginalia.wmsa.edge.index.svc.searchset;
 
+import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile;
+
+/** Identifies a RankingSearchSet, associated with an EdgeSearchProfile
+ *
+ * @see RankingSearchSet
+ * @see EdgeSearchProfile
+ * */
 public enum SearchSetIdentifier {
     NONE,
     RETRO,
@@ -13,8 +13,8 @@ public class SmallSearchSet {
     }
 
     @Override
-    public boolean contains(int urlId) {
-        return entries.contains(urlId);
+    public boolean contains(int domainId) {
+        return entries.contains(domainId);
     }
 
     public String toString() {
@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow;
 
 import com.google.inject.Inject;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
@@ -1,7 +1,7 @@
 package nu.marginalia.wmsa.edge.integration.wikipedia;
 
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.KeywordMetadata;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
@@ -11,8 +11,6 @@ import java.util.regex.Pattern;
 @Getter @Setter @Builder
 public class EdgeDomain {
 
-    private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
-    private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
 
     @Nonnull
     public final String subDomain;
@@ -27,7 +25,7 @@ public class EdgeDomain {
 
         var dot = host.lastIndexOf('.');
 
-        if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
+        if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
             subDomain = "";
             domain = host;
         }
@@ -38,7 +36,7 @@ public class EdgeDomain {
             domain = host;
         }
         else {
            if (govListTest.test(host))
-           if (govListTest.test(host))
+           if (looksLikeGovTld(host))
            { // Capture .ac.jp, .co.uk
                int dot3 = host.substring(0, dot2).lastIndexOf('.');
                if (dot3 >= 0) {
@@ -59,6 +57,35 @@ public class EdgeDomain {
             }
         }
 
+    private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
+    private boolean looksLikeGovTld(String host) {
+        if (host.length() < 8)
+            return false;
+        int cnt = 0;
+        for (int i = host.length() - 7; i < host.length(); i++) {
+            if (host.charAt(i) == '.')
+                cnt++;
+        }
+        return cnt >= 2 && govListTest.test(host);
+    }
+
+
+    private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
+
+    private boolean looksLikeAnIp(String host) {
+        if (host.length() < 7)
+            return false;
+
+        char firstChar = host.charAt(0);
+        int lastChar = host.charAt(host.length() - 1);
+
+        return Character.isDigit(firstChar)
+                && Character.isDigit(lastChar)
+                && ipPatternTest.test(host);
+    }
+
+
+
     public EdgeUrl toRootUrl() {
         // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
         return new EdgeUrl("http", this, null, "/", null);
@@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set,
             sum += 20;
         }
 
+        int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
+        if (rank < 0)
+            sum += rank / 2;
+        else
+            sum += rank / 4;
+
         return sum;
     }
@@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.edge.model.search;
 
 import lombok.*;
+import nu.marginalia.wmsa.edge.index.model.QueryLimits;
 import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
 import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
 import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
@@ -9,23 +10,18 @@ import java.util.List;
 
 @ToString @Getter @Builder @With @AllArgsConstructor
 public class EdgeSearchSpecification {
 
     public List<EdgeSearchSubquery> subqueries;
     public List<Integer> domains;
     public SearchSetIdentifier searchSetIdentifier;
 
-    public final int limitByDomain;
-    public final int limitTotal;
-
     public final String humanQuery;
 
-    public final int timeoutMs;
-    public final int fetchSize;
-
     public final SpecificationLimit quality;
     public final SpecificationLimit year;
     public final SpecificationLimit size;
+    public final SpecificationLimit rank;
+
+    public final QueryLimits queryLimits;
     public final QueryStrategy queryStrategy;
 
 }
@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns;
|
|||||||
import nu.marginalia.util.language.conf.LanguageModels;
|
import nu.marginalia.util.language.conf.LanguageModels;
|
||||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
|
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
|
||||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||||
|
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
|
||||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
|
||||||
@ -84,6 +85,8 @@ public class QueryFactory {
|
|||||||
List<String> problems = new ArrayList<>();
|
List<String> problems = new ArrayList<>();
|
||||||
String domain = null;
|
String domain = null;
|
||||||
|
|
||||||
|
QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
||||||
|
|
||||||
var basicQuery = queryParser.parse(query);
|
var basicQuery = queryParser.parse(query);
|
||||||
|
|
||||||
if (basicQuery.size() >= 8) {
|
if (basicQuery.size() >= 8) {
|
||||||
@ -94,6 +97,7 @@ public class QueryFactory {
|
|||||||
SpecificationLimit qualityLimit = profile.getQualityLimit();
|
SpecificationLimit qualityLimit = profile.getQualityLimit();
|
||||||
SpecificationLimit year = profile.getYearLimit();
|
SpecificationLimit year = profile.getYearLimit();
|
||||||
SpecificationLimit size = profile.getSizeLimit();
|
SpecificationLimit size = profile.getSizeLimit();
|
||||||
|
SpecificationLimit rank = SpecificationLimit.none();
|
||||||
|
|
||||||
for (Token t : basicQuery) {
|
for (Token t : basicQuery) {
|
||||||
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
|
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
|
||||||
@ -113,6 +117,12 @@ public class QueryFactory {
|
|||||||
if (t.type == TokenType.SIZE_TERM) {
|
if (t.type == TokenType.SIZE_TERM) {
|
||||||
size = parseSpecificationLimit(t.str);
|
size = parseSpecificationLimit(t.str);
|
||||||
}
|
}
|
||||||
|
if (t.type == TokenType.RANK_TERM) {
|
||||||
|
rank = parseSpecificationLimit(t.str);
|
||||||
|
}
|
||||||
|
if (t.type == TokenType.QS_TERM) {
|
||||||
|
queryStrategy = parseQueryStrategy(t.str);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var queryPermutations = queryParser.permuteQueriesNew(basicQuery);
|
var queryPermutations = queryParser.permuteQueriesNew(basicQuery);
|
||||||
@ -148,6 +158,8 @@ public class QueryFactory {
|
|||||||
case QUALITY_TERM:
|
case QUALITY_TERM:
|
||||||
case YEAR_TERM:
|
case YEAR_TERM:
|
||||||
case SIZE_TERM:
|
case SIZE_TERM:
|
||||||
|
case RANK_TERM:
|
||||||
|
case QS_TERM:
|
||||||
break; //
|
break; //
|
||||||
case NEAR_TERM:
|
case NEAR_TERM:
|
||||||
near = t.str;
|
near = t.str;
|
||||||
@ -179,25 +191,25 @@ public class QueryFactory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int domainLimit;
|
||||||
|
if (domain != null) {
|
||||||
|
domainLimit = 100;
|
||||||
|
} else {
|
||||||
|
domainLimit = 2;
|
||||||
|
}
|
||||||
|
|
||||||
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
|
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
|
||||||
.subqueries(subqueries)
|
.subqueries(subqueries)
|
||||||
.limitTotal(100)
|
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
|
||||||
.humanQuery(query)
|
.humanQuery(query)
|
||||||
.timeoutMs(250)
|
|
||||||
.fetchSize(4096)
|
|
||||||
.quality(qualityLimit)
|
.quality(qualityLimit)
|
||||||
.year(year)
|
.year(year)
|
||||||
.size(size)
|
.size(size)
|
||||||
|
.rank(rank)
|
||||||
.domains(domains)
|
.domains(domains)
|
||||||
.queryStrategy(QueryStrategy.AUTO)
|
.queryStrategy(queryStrategy)
|
||||||
.searchSetIdentifier(profile.searchSetIdentifier);
|
.searchSetIdentifier(profile.searchSetIdentifier);
|
||||||
|
|
||||||
if (domain != null) {
|
|
||||||
specsBuilder = specsBuilder.limitByDomain(100);
|
|
||||||
} else {
|
|
||||||
specsBuilder = specsBuilder.limitByDomain(2);
|
|
||||||
}
|
|
||||||
|
|
||||||
EdgeSearchSpecification specs = specsBuilder.build();
|
EdgeSearchSpecification specs = specsBuilder.build();
|
||||||
|
|
||||||
return new EdgeSearchQuery(specs, searchTermsHuman, domain);
|
return new EdgeSearchQuery(specs, searchTermsHuman, domain);
|
||||||
@ -210,10 +222,10 @@ public class QueryFactory {
|
|||||||
if (startChar == '=') {
|
if (startChar == '=') {
|
||||||
return SpecificationLimit.equals(val);
|
return SpecificationLimit.equals(val);
|
||||||
}
|
}
|
||||||
else if (startChar == '<'){
|
else if (startChar == '<') {
|
||||||
return SpecificationLimit.lessThan(val);
|
return SpecificationLimit.lessThan(val);
|
||||||
}
|
}
|
||||||
else if (startChar == '>'){
|
else if (startChar == '>') {
|
||||||
return SpecificationLimit.greaterThan(val);
|
return SpecificationLimit.greaterThan(val);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -221,6 +233,17 @@ public class QueryFactory {
             }
         }
     }
 
+    private QueryStrategy parseQueryStrategy(String str) {
+        return switch (str.toUpperCase()) {
+            case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
+            case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
+            case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
+            case "SENTENCE" -> QueryStrategy.SENTENCE;
+            case "TOPIC" -> QueryStrategy.TOPIC;
+            default -> QueryStrategy.AUTO;
+        };
+    }
+
     private String normalizeDomainName(String str) {
         return str.toLowerCase();
     }

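One detail worth noting about parseQueryStrategy above: the value is upper-cased before matching, so qs=rf_title and qs=RF_TITLE behave the same, and an unrecognised value quietly falls back to QueryStrategy.AUTO rather than failing the query. The snippet below only exercises that mapping with a local stand-in enum mirroring the cases shown in the hunk; it is not part of the commit.

    // Standalone check of the qs= mapping; the enum here is a local stand-in.
    public class QueryStrategyDemo {
        enum QueryStrategy { REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT, REQUIRE_FIELD_SITE,
                             SENTENCE, TOPIC, AUTO }

        static QueryStrategy parse(String str) {
            return switch (str.toUpperCase()) {
                case "RF_TITLE"   -> QueryStrategy.REQUIRE_FIELD_TITLE;
                case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
                case "RF_SITE"    -> QueryStrategy.REQUIRE_FIELD_SITE;
                case "SENTENCE"   -> QueryStrategy.SENTENCE;
                case "TOPIC"      -> QueryStrategy.TOPIC;
                default           -> QueryStrategy.AUTO;
            };
        }

        public static void main(String[] args) {
            System.out.println(parse("rf_title")); // REQUIRE_FIELD_TITLE
            System.out.println(parse("sentence")); // SENTENCE
            System.out.println(parse("bogus"));    // AUTO
        }
    }
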
@ -93,6 +93,10 @@ public class QueryParser {
             entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
         } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
             entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
+        } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
+            entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
+        } else if (t.str.startsWith("qs=")) {
+            entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
         } else if (t.str.contains(":")) {
             entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
         }

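The two new branches above add user-facing filter syntax: rank=N, rank>N and rank<N, plus qs=STRATEGY, join the existing year, size and quality terms. As a rough standalone sketch of what the tokenizer ends up doing with such terms (simplified: the real code rewrites Token objects in place rather than printing):

    // Simplified standalone sketch; the real parser rewrites Token objects in place.
    public class FilterTermDemo {
        public static void main(String[] args) {
            String[] terms = { "rank>50", "qs=RF_TITLE", "size<1000", "plain" };
            for (String t : terms) {
                if (t.startsWith("rank") && t.matches("rank[=><]\\d+")) {
                    System.out.println("RANK_TERM  " + t.substring(4)); // ">50"
                } else if (t.startsWith("qs=")) {
                    System.out.println("QS_TERM    " + t.substring(3)); // "RF_TITLE"
                } else if (t.startsWith("size") && t.matches("size[=><]\\d+")) {
                    System.out.println("SIZE_TERM  " + t.substring(4)); // "<1000"
                } else {
                    System.out.println("LITERAL    " + t);
                }
            }
        }
    }
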
@ -506,8 +510,11 @@ enum TokenType implements Predicate<Token> {
     QUALITY_TERM,
     YEAR_TERM,
     SIZE_TERM,
+    RANK_TERM,
     NEAR_TERM,
 
+    QS_TERM,
+
     QUOT,
     MINUS,
     QMARK,

@ -8,7 +8,7 @@ import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.KeywordExtractor;
-import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

@ -25,12 +25,12 @@ public class QueryVariants {
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
     private final KeywordExtractor keywordExtractor;
-    private final SentenceExtractor sentenceExtractor;
     private final TermFrequencyDict dict;
     private final PorterStemmer ps = new PorterStemmer();
 
     private final NGramBloomFilter nGramBloomFilter;
     private final EnglishDictionary englishDictionary;
+    private final ThreadLocal<SentenceExtractor> sentenceExtractor;
 
     @Inject
     public QueryVariants(LanguageModels lm,

@ -40,7 +40,7 @@ public class QueryVariants {
         this.nGramBloomFilter = nGramBloomFilter;
         this.englishDictionary = englishDictionary;
         this.keywordExtractor = new KeywordExtractor();
-        this.sentenceExtractor = new SentenceExtractor(lm);
+        this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
         this.dict = dict;
     }
 

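The SentenceExtractor field is rebuilt as a ThreadLocal here, presumably because a single shared extractor is not safe, or not cheap, to use from several query threads at once; each thread now lazily constructs and reuses its own instance. A minimal illustration of the ThreadLocal.withInitial pattern, with a hypothetical stand-in for SentenceExtractor:

    // Minimal illustration of the per-thread instance pattern used above.
    public class ThreadLocalDemo {
        // Stand-in for SentenceExtractor: built once per thread, never shared.
        static class Extractor {
            Extractor() { System.out.println("built on " + Thread.currentThread().getName()); }
            String extract(String s) { return s.trim(); }
        }

        static final ThreadLocal<Extractor> extractor =
                ThreadLocal.withInitial(Extractor::new);

        public static void main(String[] args) throws InterruptedException {
            Runnable task = () -> System.out.println(extractor.get().extract("  hello  "));
            Thread a = new Thread(task);
            Thread b = new Thread(task);
            a.start(); b.start();
            a.join(); b.join(); // each thread constructed its own Extractor
        }
    }
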
@ -78,10 +78,8 @@ public class QueryVariants {
 
         final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();
 
-        logger.debug("Q: {}", query);
-        logger.debug("QAS: {}", joinedQuery);
-
-        var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery);
+        var se = sentenceExtractor.get();
+        var sentence = se.extractSentence(joinedQuery.joinedQuery);
 
         for (int i = 0; i < sentence.posTags.length; i++) {
             if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {

Some files were not shown because too many files have changed in this diff.