A tiny release between crawls (#138)

Bringing online new ranking changes.

Co-authored-by: Viktor Lofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/138
parent 467bf566a9
commit fa9b4e4352
@@ -106,8 +106,9 @@ dependencies {

    implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'

    implementation 'com.syncthemall:boilerpipe:1.2.2'
    implementation 'com.github.luben:zstd-jni:1.5.2-2'
    implementation 'org.lz4:lz4-java:1.8.0'

    implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
    implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'

@@ -126,7 +127,6 @@ dependencies {
    implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'

    implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'

    implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'

    testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'
@@ -2,11 +2,13 @@ package nu.marginalia.util;

public class BrailleBlockPunchCards {

    private static final char brailleBlockBase = '\u2800';

    public static String printBits(int val, int bits) {
        StringBuilder builder = new StringBuilder();

        for (int b = 0; b < bits; b+=8, val>>>=8) {
            builder.append((char)('\u2800'+bin2brail(val)));
            builder.append((char)(brailleBlockBase + bin2brail(val)));
        }

        return builder.toString();
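
The change above replaces the literal '\u2800' with the named brailleBlockBase constant; printBits packs each group of eight bits into one braille-block glyph, giving a compact visual rendering of bit fields. A minimal usage sketch (my own illustration, not part of the commit; it only assumes the printBits signature shown above):

    // Render the low 8 and 32 bits of an int as braille "punch card" glyphs.
    String oneGlyph   = BrailleBlockPunchCards.printBits(0b1011_0010, 8);
    String fourGlyphs = BrailleBlockPunchCards.printBits(0xDEADBEEF, 32);
    System.out.println(oneGlyph + " " + fourGlyphs);
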
@@ -42,7 +42,7 @@ public abstract class ParallelPipe<INPUT,INTERMEDIATE> {
    @SneakyThrows
    private void runProcessThread() {
        while (expectingInput || !inputs.isEmpty()) {
            var in = inputs.poll(1, TimeUnit.SECONDS);
            var in = inputs.poll(10, TimeUnit.SECONDS);

            if (in != null) {
                try {
@@ -108,7 +108,6 @@ public class RandomWriteFunnel implements AutoCloseable {

    private void eval(ByteBuffer dest) throws IOException {
        flushBuffer();
        channel.force(false);

        channel.position(0);
        buffer.clear();
@@ -1,20 +1,33 @@
package nu.marginalia.util;

import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;

import java.util.Arrays;
import java.util.HashMap;

public class StringPool {
    private final HashMap<String, String> words;

    public StringPool() {
        this.words = new HashMap<>(1000);
    private final HashMap<String, String> words;
    private final Object2LongOpenHashMap<String> ages;
    private final int maxCap;

    long idx;

    private StringPool(int capacity, int maxCap) {
        this.ages = new Object2LongOpenHashMap<>(capacity);
        this.words = new HashMap<>(capacity);
        this.maxCap = maxCap;
    }

    public StringPool(int capacity) {
        words = new HashMap<>(capacity);
    public static StringPool create(int capacity) {
        return new StringPool(capacity, capacity * 10);
    }

    public String internalize(String str) {
        prune();

        final String ret = words.putIfAbsent(str, str);
        ages.put(ret, idx++);

        if (null == ret)
            return str;
@@ -22,6 +35,37 @@ public class StringPool {
        return ret;
    }

    public String[] internalize(String[] str) {

        for (int i = 0; i < str.length; i++) {
            str[i] = internalize(str[i]);
        }

        return str;
    }

    public void prune() {

        if (words.size() < maxCap)
            return;

        long[] ageValues = ages.values().toLongArray();
        Arrays.sort(ageValues);

        long cutoff = ageValues[ageValues.length - maxCap / 10];

        words.clear();
        ages.forEach((word, cnt) -> {
            if (cnt >= cutoff) {
                words.put(word, word);
            }
        });
        ages.clear();
        words.forEach((w,w2) -> {
            ages.put(w, idx);
        });
    }

    public void flush() {
        words.clear();
    }
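
With this change the pool records an insertion counter per string and, once it grows past maxCap (ten times the requested capacity), prunes itself back to the most recently internalized entries; construction moves behind the static create factory. A short usage sketch (my own illustration, not part of the commit; it only uses create and internalize as shown above):

    // Interning equal strings yields one canonical instance; the pool caps
    // itself at capacity * 10 entries and keeps the freshest ones on prune().
    StringPool pool = StringPool.create(10_000);

    String a = pool.internalize(new String("marginalia"));
    String b = pool.internalize(new String("marginalia"));
    assert a == b;   // both calls return the same pooled instance
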
@@ -18,22 +18,14 @@ public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
        }
    }

    default void increment(long pos) {
        set(pos, get(pos) + 1);
    }

    default void swap(long pos1, long pos2) {
        int tmp = get(pos1);
        set(pos1, get(pos2));
        set(pos2, tmp);
    }

    default void swapn(int n, long pos1, long pos2) {
        for (int i = 0; i < n; i++) {
            int tmp = get(pos1+i);
            set(pos1+i, get(pos2+i));
            set(pos2+i, tmp);
        }
    default void increment(long pos) {
        set(pos, get(pos) + 1);
    }

    default int getAndIncrement(long pos) {
@@ -47,6 +39,7 @@ public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
            set(start+i, buffer.get(i + bufferStart));
        }
    }

    default void get(long start, long end, IntBuffer buffer, int bufferStart) {
        for (int i = 0; i < (end-start); i++) {
            buffer.put(i + bufferStart, get(start + i));
@@ -28,6 +28,7 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
        set(pos2, tmp);
    }

    /** Behavior not defined for overlapping ranges */
    default void swapn(int n, long pos1, long pos2) {
        for (int i = 0; i < n; i++) {
            long tmp = get(pos1+i);
@ -2,6 +2,7 @@ package nu.marginalia.util.array.delegate;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.IntArray;
|
||||
import nu.marginalia.util.array.algo.SortingContext;
|
||||
import nu.marginalia.util.array.buffer.IntQueryBuffer;
|
||||
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.IntIOTransformer;
|
||||
@ -61,6 +62,16 @@ public class ShiftedIntArray implements IntArray {
|
||||
delegate.get(shift+start, shift+end, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getAndIncrement(long pos) {
|
||||
return delegate.getAndIncrement(shift + pos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fill(long start, long end, int val) {
|
||||
delegate.fill(start + shift, end + shift, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return size;
|
||||
@ -97,6 +108,12 @@ public class ShiftedIntArray implements IntArray {
|
||||
return delegate.isSorted(shift + start, shift + end);
|
||||
}
|
||||
|
||||
|
||||
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
|
||||
delegate.sortLargeSpan(ctx, start, end);
|
||||
}
|
||||
|
||||
|
||||
public long search(int key) {
|
||||
if (size < 128) {
|
||||
return linearSearch(key);
|
||||
|
@ -3,6 +3,7 @@ package nu.marginalia.util.array.delegate;
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.algo.LongArraySearch;
|
||||
import nu.marginalia.util.array.algo.SortingContext;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.LongIOTransformer;
|
||||
@ -62,6 +63,16 @@ public class ShiftedLongArray implements LongArray {
|
||||
delegate.get(shift+start, shift+end, buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getAndIncrement(long pos) {
|
||||
return delegate.getAndIncrement(shift + pos);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fill(long start, long end, long val) {
|
||||
delegate.fill(start + shift, end + shift, val);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long size() {
|
||||
return size;
|
||||
@ -106,6 +117,14 @@ public class ShiftedLongArray implements LongArray {
|
||||
return delegate.isSortedN(sz, shift + start, shift + end);
|
||||
}
|
||||
|
||||
public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
|
||||
delegate.sortLargeSpanN(ctx, sz, start, end);
|
||||
}
|
||||
|
||||
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
|
||||
delegate.sortLargeSpan(ctx, start, end);
|
||||
}
|
||||
|
||||
public long searchN(int sz, long key) {
|
||||
if (size < 128) {
|
||||
return linearSearchN(sz, key);
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.util.array.page;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.IntArray;
|
||||
import nu.marginalia.util.array.algo.SortingContext;
|
||||
import nu.marginalia.util.array.buffer.IntQueryBuffer;
|
||||
import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate;
|
||||
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
|
||||
@ -113,6 +114,11 @@ public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer>
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getAndIncrement(long pos) {
|
||||
return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void get(long start, long end, int[] buffer) {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
@ -272,6 +278,22 @@ public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer>
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.sortLargeSpan(ctx, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void write(Path fileName) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
|
||||
for (int i = 0; i < pages.length; i++) {
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.util.array.page;
|
||||
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.array.algo.SortingContext;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate;
|
||||
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
|
||||
@ -118,6 +119,11 @@ public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuff
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getAndIncrement(long pos) {
|
||||
return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void set(long pos, long value) {
|
||||
int page = partitioningScheme.getPage(pos);
|
||||
@ -439,6 +445,33 @@ public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuff
|
||||
defaults.mergeSortN(sz, start, end, tempDir);
|
||||
}
|
||||
}
|
||||
public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].sortLargeSpanN(ctx, sz, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.sortLargeSpanN(ctx, sz, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
|
||||
if (partitioningScheme.isSamePage(start, end)) {
|
||||
int sOff = partitioningScheme.getOffset(start);
|
||||
int eOff = partitioningScheme.getEndOffset(start, end);
|
||||
|
||||
if (eOff > sOff) {
|
||||
pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff);
|
||||
}
|
||||
}
|
||||
else {
|
||||
defaults.sortLargeSpan(ctx, start, end);
|
||||
}
|
||||
}
|
||||
|
||||
public void write(Path fileName) throws IOException {
|
||||
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
|
||||
|
@@ -0,0 +1,17 @@
package nu.marginalia.util.bigstring;

public interface BigString {
    static BigString encode(String stringValue) {
        if (stringValue.length() > 64) {
            return new CompressedBigString(stringValue);
        }
        else {
            return new PlainBigString(stringValue);
        }
    }

    String decode();

    byte[] getBytes();

    int length();
}
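
The new BigString abstraction keeps long strings compressed in memory (see CompressedBigString below) and leaves short ones as plain wrappers; the 64-character threshold in encode decides which. A brief usage sketch (my own illustration, not part of the commit; it only uses the interface methods shown above):

    BigString small = BigString.encode("page title");               // short: PlainBigString
    BigString large = BigString.encode("lorem ipsum ".repeat(500)); // long: CompressedBigString
    String roundTripped = large.decode();                           // decompressed on demand
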
@@ -0,0 +1,39 @@
package nu.marginalia.util.bigstring;

import net.jpountz.lz4.LZ4Compressor;
import net.jpountz.lz4.LZ4Factory;
import net.jpountz.lz4.LZ4FastDecompressor;

import java.nio.charset.StandardCharsets;

public class CompressedBigString implements BigString {
    private final int originalSize;
    private final int length;
    private final byte[] encoded;

    private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
    private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
    private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();

    public CompressedBigString(String stringValue) {
        byte[] byteValue = stringValue.getBytes(StandardCharsets.UTF_16);
        originalSize = byteValue.length;
        encoded = compressor.compress(byteValue);
        length = stringValue.length();
    }

    @Override
    public String decode() {
        return new String(getBytes(), StandardCharsets.UTF_16);
    }

    @Override
    public byte[] getBytes() {
        return decompressor.decompress(encoded, originalSize);
    }

    @Override
    public int length() {
        return length;
    }
}
@@ -0,0 +1,26 @@
package nu.marginalia.util.bigstring;

import java.nio.charset.StandardCharsets;

public class PlainBigString implements BigString {
    private final String value;

    public PlainBigString(String value) {
        this.value = value;
    }

    @Override
    public String decode() {
        return value;
    }

    @Override
    public byte[] getBytes() {
        return value.getBytes(StandardCharsets.UTF_8);
    }

    @Override
    public int length() {
        return value.length();
    }
}
@@ -1,8 +1,5 @@
package nu.marginalia.util.dict;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
@@ -10,7 +7,6 @@ import java.util.ArrayList;
public class DictionaryData {

    private final int DICTIONARY_BANK_SIZE;
    private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);

    private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);
@@ -1,6 +1,17 @@
package nu.marginalia.util.dict;

public interface DictionaryMap {
    int NO_VALUE = Integer.MIN_VALUE;

    static DictionaryMap create() {
        if (Boolean.getBoolean("small-ram")) {
            return new OnHeapDictionaryMap();
        }
        else {
            return new OffHeapDictionaryHashMap(1L << 31);
        }
    }

    int size();

    int put(long key);
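
The new factory keys the implementation choice off the small-ram system property: an on-heap fastutil map for low-memory installs, otherwise the off-heap hash map sized at 1L << 31 bytes. A usage sketch (my own illustration, not part of the commit; it assumes the interface also exposes the get(long) method seen overridden in OnHeapDictionaryMap below):

    // Run with -Dsmall-ram=true to get the on-heap variant.
    DictionaryMap map = DictionaryMap.create();

    int first  = map.put(12_345_678L);   // assigns a dense int id to the key
    int second = map.put(12_345_678L);   // same key, same id
    assert first == second;
    assert map.get(99L) == DictionaryMap.NO_VALUE; // unseen keys report NO_VALUE
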
@@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
 * Spiritually influenced by GNU Trove's hash maps
 * LGPL 2.1
 */
public class DictionaryHashMap implements DictionaryMap {
    private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
public class OffHeapDictionaryHashMap implements DictionaryMap {
    private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class);

    private static final Gauge probe_count_metrics
            = Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
            .register();

    private final int bufferCount;
    private final IntBuffer[] buffers;
    public static final int NO_VALUE = Integer.MIN_VALUE;

    private final DictionaryData dictionaryData;

@@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap {

    private final AtomicInteger sz = new AtomicInteger(0);

    public DictionaryHashMap(long sizeMemory) {
    public OffHeapDictionaryHashMap(long sizeMemory) {
        final int intSize = 4;

        bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));
@@ -0,0 +1,23 @@
package nu.marginalia.util.dict;

import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;

public class OnHeapDictionaryMap implements DictionaryMap {
    private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f);

    @Override
    public int size() {
        return entries.size();
    }

    @Override
    public int put(long key) {
        entries.putIfAbsent(key, entries.size());
        return get(key);
    }

    @Override
    public int get(long key) {
        return entries.getOrDefault(key, NO_VALUE);
    }
}
@@ -19,9 +19,6 @@ public class GuardedRegexFactory {
    public static GuardedRegex contains(String substring, @Language("RegExp") String regex) {
        return new GuardedRegexContains(substring, regex);
    }
    public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) {
        return new GuardedRegexMinLength(minLength, regex);
    }

    private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex {
        public GuardedRegexContains(String contains, String pattern) {
@@ -32,15 +29,6 @@
            return s.contains(contains) && pattern.matcher(s).find();
        }
    }
    private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex {
        public GuardedRegexMinLength(int minLength, String pattern) {
            this(minLength, Pattern.compile(pattern));
        }

        public boolean test(String s) {
            return s.length() >= minLength && pattern.matcher(s).find();
        }
    }
    private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex {
        public GuardedRegexStartsWith(String start, String pattern) {
            this(start, Pattern.compile(pattern));
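
This hunk drops the unused minLength variant; the remaining guarded regexes run a cheap String check first and only apply the compiled pattern when the guard passes, which skips regex work for most inputs. A usage sketch of the surviving contains variant (my own illustration, not part of the commit; the example pattern and paths are made up):

    GuardedRegex phpLink = GuardedRegexFactory.contains(".php", "\\.php\\?id=\\d+");

    boolean hit  = phpLink.test("/forum/viewtopic.php?id=123"); // guard passes, regex matches
    boolean miss = phpLink.test("/about/contact.html");         // guard fails, regex never runs
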
@@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordCounter;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.NameCounter;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
@@ -68,9 +68,6 @@ public class DocumentDebugger {

        Set<String> reps = new HashSet<>();

        // kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
        // kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));

        try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

            for (var sent : languageData.titleSentences) {
@@ -1,5 +1,7 @@
package nu.marginalia.util.language;

import org.apache.commons.lang3.StringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@@ -123,14 +125,24 @@ public class WordPatterns {
        if (!filter(s)) {
            return true;
        }
        if (isTopWord(s)) {

        String sLc;
        if (StringUtils.isAllLowerCase(s)) {
            sLc = s;
        }
        else {
            sLc = s.toLowerCase();
        }

        if (isTopWord(sLc)) {
            return true;
        }

        return false;
    }

    public static boolean isTopWord(String s) {
        return topWords.contains(s.toLowerCase());
    public static boolean isTopWord(String strLowerCase) {
        return topWords.contains(strLowerCase);
    }

}
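
The change avoids allocating a lowercase copy when the token is already all lowercase, and moves the lower-casing responsibility to the caller of isTopWord. A minimal sketch of the idiom (my own illustration with a hypothetical helper name, not code from the commit):

    // Only lower-case when needed: String.toLowerCase() always allocates,
    // and most tokens in crawled text are already lower case.
    static String lowerCaseIfNeeded(String s) {
        return StringUtils.isAllLowerCase(s) ? s : s.toLowerCase();
    }
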
@ -35,9 +35,7 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
|
||||
|
||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||
@ -59,11 +57,12 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
getWordPositions(keywordMetadata, documentLanguageData);
|
||||
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
|
||||
|
||||
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
|
||||
|
||||
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
|
||||
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
|
||||
|
||||
|
||||
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
|
||||
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
|
||||
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
|
||||
@ -94,7 +93,7 @@ public class DocumentKeywordExtractor {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getNames(sent)) {
|
||||
for (var span : keywordExtractor.getProperNames(sent)) {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
}
|
||||
@ -108,7 +107,7 @@ public class DocumentKeywordExtractor {
|
||||
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
for (var span : keywordExtractor.getNames(sent)) {
|
||||
for (var span : keywordExtractor.getProperNames(sent)) {
|
||||
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
|
||||
}
|
||||
|
||||
@ -155,16 +154,16 @@ public class DocumentKeywordExtractor {
|
||||
if (!word.isStopWord()) {
|
||||
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
|
||||
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
|
||||
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
|
||||
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (var names : keywordExtractor.getNames(sent)) {
|
||||
for (var names : keywordExtractor.getProperNames(sent)) {
|
||||
var rep = new WordRep(sent, names);
|
||||
String w = AsciiFlattener.flattenUnicode(rep.word);
|
||||
|
||||
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
|
||||
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
|
||||
}
|
||||
}
|
||||
|
||||
@ -218,7 +217,7 @@ public class DocumentKeywordExtractor {
|
||||
continue;
|
||||
}
|
||||
|
||||
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
|
||||
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -43,8 +43,8 @@ public class KeywordCounter {
|
||||
|
||||
counts.mergeInt(rep.stemmed, 1, Integer::sum);
|
||||
|
||||
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
|
||||
if (instanceSet.size() < 250) {
|
||||
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(16));
|
||||
if (instanceSet.size() < 4) {
|
||||
instanceSet.add(rep);
|
||||
}
|
||||
}
|
||||
|
@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
||||
|
||||
import java.lang.ref.SoftReference;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class KeywordExtractor {
|
||||
|
||||
public WordSpan[] getNames(DocumentSentence sentence) {
|
||||
List<WordSpan> spans = new ArrayList<>(sentence.length());
|
||||
public WordSpan[] getProperNames(DocumentSentence sentence) {
|
||||
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
|
||||
|
||||
for (int i = 0; i < sentence.length(); i++) {
|
||||
if (isProperNoun(i, sentence))
|
||||
@ -57,27 +55,73 @@ public class KeywordExtractor {
|
||||
return spans.toArray(WordSpan[]::new);
|
||||
}
|
||||
|
||||
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
|
||||
if (sentence.keywords != null) {
|
||||
return sentence.keywords.get();
|
||||
}
|
||||
List<WordSpan> spans = new ArrayList<>(sentence.length());
|
||||
|
||||
Set<String> topWords = Collections.emptySet();
|
||||
public WordSpan[] getNouns(DocumentSentence sentence) {
|
||||
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
|
||||
|
||||
for (int i = 0; i < sentence.length(); i++) {
|
||||
if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
|
||||
if (isNoun(i, sentence))
|
||||
spans.add(new WordSpan(i, i+1));
|
||||
}
|
||||
|
||||
for (int i = 1; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords)) {
|
||||
if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
|
||||
if (isNoun(i, sentence)
|
||||
&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 2; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
|
||||
|
||||
if ((isNoun(i, sentence))
|
||||
&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
|
||||
&& (isNoun(i-2, sentence)) || "JJ".equals(sentence.posTags[i-2]))
|
||||
spans.add(new WordSpan(i-2, i+1));
|
||||
}
|
||||
|
||||
for (int i = 3; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
|
||||
|
||||
if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
|
||||
if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence))
|
||||
&& (isJoiner(sentence, i-2)||isNoun(i-2, sentence)))
|
||||
spans.add(new WordSpan(i-3, i+1));
|
||||
}
|
||||
}
|
||||
|
||||
return spans.toArray(WordSpan[]::new);
|
||||
}
|
||||
|
||||
|
||||
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
|
||||
if (sentence.keywords != null) {
|
||||
return sentence.keywords.get();
|
||||
}
|
||||
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
|
||||
|
||||
for (int i = 0; i < sentence.length(); i++) {
|
||||
if (isName(i, sentence) || isTopAdj(i, sentence))
|
||||
spans.add(new WordSpan(i, i+1));
|
||||
}
|
||||
|
||||
for (int i = 1; i < sentence.length(); i++) {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence)) {
|
||||
if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
|
||||
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) {
|
||||
spans.add(new WordSpan(i - 1, i + 1));
|
||||
}
|
||||
}
|
||||
@ -86,16 +130,16 @@ public class KeywordExtractor {
|
||||
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords)) {
|
||||
if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
|
||||
&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
|
||||
if (isName(i, sentence)) {
|
||||
if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
|
||||
&& (isName(i-2, sentence) || isTopAdj(i-2, sentence))) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
}
|
||||
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
|
||||
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) {
|
||||
spans.add(new WordSpan(i - 2, i + 1));
|
||||
}
|
||||
}
|
||||
@ -105,10 +149,10 @@ public class KeywordExtractor {
|
||||
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
|
||||
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
|
||||
|
||||
if (isName(i, sentence, topWords) &&
|
||||
(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
|
||||
(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
|
||||
(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
|
||||
if (isName(i, sentence) &&
|
||||
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
|
||||
(isName(i-2, sentence) || isTopAdj(i-2, sentence)) &&
|
||||
(isName(i-3, sentence) || isTopAdj(i-3, sentence))) {
|
||||
spans.add(new WordSpan(i - 3, i + 1));
|
||||
}
|
||||
else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
|
||||
@ -134,7 +178,9 @@ public class KeywordExtractor {
|
||||
public boolean isProperNoun(int i, DocumentSentence sent) {
|
||||
return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]);
|
||||
}
|
||||
|
||||
public boolean isNoun(int i, DocumentSentence sent) {
|
||||
return sent.posTags[i].startsWith("NN");
|
||||
}
|
||||
public boolean isJoiner(DocumentSentence sent, int i) {
|
||||
if(sent.posTags[i].equals("IN")) {
|
||||
return true;
|
||||
@ -183,21 +229,13 @@ public class KeywordExtractor {
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
|
||||
if (!topWords.isEmpty()) {
|
||||
String posTag = sentence.posTags[i];
|
||||
String word = sentence.stemmedWords[i];
|
||||
|
||||
return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i));
|
||||
}
|
||||
|
||||
|
||||
private boolean isName(int i, DocumentSentence sentence) {
|
||||
String posTag = sentence.posTags[i];
|
||||
|
||||
return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
|
||||
return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i);
|
||||
}
|
||||
|
||||
private boolean isTopAdj(int i, DocumentSentence sentence, Set<String> topWords) {
|
||||
private boolean isTopAdj(int i, DocumentSentence sentence) {
|
||||
String posTag = sentence.posTags[i];
|
||||
|
||||
return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
|
||||
|
@ -20,7 +20,7 @@ public class NameCounter {
|
||||
|
||||
for (int i = 0; i < dld.sentences.length; i++) {
|
||||
DocumentSentence sent = dld.sentences[i];
|
||||
var keywords = keywordExtractor.getNames(sent);
|
||||
var keywords = keywordExtractor.getProperNames(sent);
|
||||
for (var span : keywords) {
|
||||
if (span.size() <= 1)
|
||||
continue;
|
||||
|
@ -1,9 +1,11 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.util.language.processing.model.WordRep;
|
||||
import nu.marginalia.util.language.processing.model.WordSpan;
|
||||
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
@ -23,13 +25,13 @@ public class SubjectCounter {
|
||||
// Greeks bearing gifts -> Greeks
|
||||
// Steve McQueen drove fast | cars -> Steve McQueen
|
||||
|
||||
public List<WordRep> count(DocumentLanguageData dld) {
|
||||
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
|
||||
|
||||
Map<String, Integer> counts = new HashMap<>();
|
||||
Map<String, Set<WordRep>> instances = new HashMap<>();
|
||||
|
||||
for (var sentence : dld.sentences) {
|
||||
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
|
||||
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
|
||||
if (kw.end + 2 >= sentence.length()) {
|
||||
continue;
|
||||
}
|
||||
@ -46,20 +48,46 @@ public class SubjectCounter {
|
||||
|
||||
String stemmed = rep.stemmed;
|
||||
|
||||
counts.merge(stemmed, -1, Integer::sum);
|
||||
instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
|
||||
Map<String, Integer> scores = new HashMap<>(instances.size());
|
||||
for (String stemmed : instances.keySet()) {
|
||||
scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
|
||||
}
|
||||
|
||||
return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
|
||||
.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
|
||||
return scores.entrySet().stream()
|
||||
.filter(e -> e.getValue() >= 150)
|
||||
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
|
||||
if (stemmed.contains("_")) {
|
||||
int sum = 0;
|
||||
String[] parts = StringUtils.split(stemmed, '_');
|
||||
|
||||
if (parts.length == 0) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (String part : parts) {
|
||||
sum += getTermTfIdf(keywordMetadata, part);
|
||||
}
|
||||
|
||||
return sum / parts.length;
|
||||
}
|
||||
|
||||
var meta = keywordMetadata.wordsTfIdf().get(stemmed);
|
||||
if (meta != null) {
|
||||
return meta.tfIdfNormalized();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private boolean isDetOrAdverbOrVerb(String posTag) {
|
||||
return "DT".equals(posTag) // determinant
|
||||
|| "RB".equals(posTag) // adverb
|
||||
|
@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model;
|
||||
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import lombok.AllArgsConstructor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/**
|
||||
* @see nu.marginalia.util.language.processing.SentenceExtractor
|
||||
* @see SentenceExtractor
|
||||
*/
|
||||
@AllArgsConstructor
|
||||
public class DocumentLanguageData {
|
||||
|
@ -17,9 +17,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
|
||||
)
|
||||
{
|
||||
|
||||
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
||||
private static final int TF_IDF_HIGH_LIMIT = 64;
|
||||
|
||||
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
|
||||
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
|
||||
new HashMap<>(15_000),
|
||||
@ -31,7 +28,8 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
|
||||
this(EnumSet.noneOf(EdgePageWordFlags.class));
|
||||
}
|
||||
|
||||
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
|
||||
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
|
||||
public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
|
||||
|
||||
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
|
||||
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();
|
||||
|
@ -24,7 +24,7 @@ public class WordRep implements Comparable<WordRep> {
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull WordRep o) {
|
||||
return stemmed.compareTo(o.stemmed);
|
||||
return word.compareTo(o.word);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1,16 +1,14 @@
|
||||
package nu.marginalia.util.language.processing;
|
||||
package nu.marginalia.util.language.processing.sentence;
|
||||
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import com.github.jknack.handlebars.internal.lang3.StringUtils;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.StringPool;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.HtmlTagCleaner;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
||||
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
|
||||
import opennlp.tools.sentdetect.SentenceDetectorME;
|
||||
import opennlp.tools.sentdetect.SentenceModel;
|
||||
import opennlp.tools.stemmer.PorterStemmer;
|
||||
@ -24,25 +22,22 @@ import javax.inject.Inject;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import static nu.marginalia.util.language.WordPatterns.*;
|
||||
|
||||
public class SentenceExtractor {
|
||||
|
||||
private SentenceDetectorME sentenceDetector;
|
||||
private final RDRPOSTagger rdrposTagger;
|
||||
|
||||
private final PorterStemmer porterStemmer = new PorterStemmer();
|
||||
private boolean legacyMode = false;
|
||||
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
|
||||
|
||||
private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner();
|
||||
|
||||
private final ThreadLocal<StringPool> stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000));
|
||||
|
||||
|
||||
@SneakyThrows @Inject
|
||||
public SentenceExtractor(LanguageModels models) {
|
||||
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
|
||||
@ -66,6 +61,22 @@ public class SentenceExtractor {
|
||||
final String text = asText(doc);
|
||||
final DocumentSentence[] textSentences = extractSentencesFromString(text);
|
||||
|
||||
String title = getTitle(doc, textSentences);
|
||||
|
||||
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
|
||||
var titleSentences = extractSentencesFromString(title.toLowerCase());
|
||||
return new DocumentLanguageData(textSentences, titleSentences, counts);
|
||||
}
|
||||
|
||||
public DocumentLanguageData extractSentences(String text, String title) {
|
||||
final DocumentSentence[] textSentences = extractSentencesFromString(text);
|
||||
|
||||
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
|
||||
|
||||
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
|
||||
}
|
||||
|
||||
private String getTitle(Document doc, DocumentSentence[] textSentences) {
|
||||
String title = doc.getElementsByTag("title").text() + " . " +
|
||||
Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");
|
||||
|
||||
@ -82,34 +93,7 @@ public class SentenceExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
|
||||
var titleSentences = extractSentencesFromString(title.toLowerCase());
|
||||
return new DocumentLanguageData(textSentences, titleSentences, counts);
|
||||
}
|
||||
|
||||
public DocumentLanguageData extractSentences(String text) {
|
||||
final DocumentSentence[] textSentences = extractSentencesFromString(text);
|
||||
|
||||
String title = "";
|
||||
for (DocumentSentence textSentence : textSentences) {
|
||||
if (textSentence.length() > 0) {
|
||||
title = textSentence.originalSentence.toLowerCase();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
|
||||
|
||||
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
|
||||
}
|
||||
|
||||
|
||||
public DocumentLanguageData extractSentences(String text, String title) {
|
||||
final DocumentSentence[] textSentences = extractSentencesFromString(text);
|
||||
|
||||
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
|
||||
|
||||
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
|
||||
return title;
|
||||
}
|
||||
|
||||
|
||||
@ -125,79 +109,95 @@ public class SentenceExtractor {
|
||||
return counts;
|
||||
}
|
||||
|
||||
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
||||
|
||||
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
|
||||
|
||||
private boolean isBadChar(char c) {
|
||||
if (c >= 'a' && c <= 'z') return false;
|
||||
if (c >= 'A' && c <= 'Z') return false;
|
||||
if (c >= '0' && c <= '9') return false;
|
||||
if ("_#@.".indexOf(c) >= 0) return false;
|
||||
if (c >= '\u00C0' && c <= '\u00D6') return false;
|
||||
if (c >= '\u00D8' && c <= '\u00F6') return false;
|
||||
if (c >= '\u00F8' && c <= '\u00FF') return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
private String sanitizeString(String s) {
|
||||
char[] newChars = new char[s.length()];
|
||||
int pi = 0;
|
||||
|
||||
for (int i = 0; i < newChars.length; i++) {
|
||||
char c = s.charAt(i);
|
||||
if (!isBadChar(c)) {
|
||||
newChars[pi++] = c;
|
||||
}
|
||||
else {
|
||||
newChars[pi++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
s = new String(newChars, 0, pi);
|
||||
|
||||
if (s.startsWith(".")) {
|
||||
s = s.substring(1);
|
||||
if (s.isBlank())
|
||||
return "";
|
||||
}
|
||||
return s;
|
||||
|
||||
}
|
||||
|
||||
public DocumentSentence extractSentence(String text) {
|
||||
var wordsAndSeps = splitSegment(text);
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);
|
||||
|
||||
var words = wordsAndSeps.words;
|
||||
var seps = wordsAndSeps.separators;
|
||||
var lc = toLc(wordsAndSeps.words);
|
||||
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
|
||||
|
||||
return new DocumentSentence(
|
||||
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||
SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
|
||||
);
|
||||
}
|
||||
|
||||
public String normalizeSpaces(String s) {
|
||||
if (s.indexOf('\t') >= 0) {
|
||||
s = s.replace('\t', ' ');
|
||||
}
|
||||
if (s.indexOf('\n') >= 0) {
|
||||
s = s.replace('\n', ' ');
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
public DocumentSentence[] extractSentencesFromString(String text) {
|
||||
String[] sentences;
|
||||
|
||||
String textNormalizedSpaces = normalizeSpaces(text);
|
||||
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
|
||||
try {
|
||||
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// shitty fallback logic
|
||||
sentences = StringUtils.split(textNormalizedSpaces, '.');
|
||||
}
|
||||
|
||||
sentences = preCleanSentences(sentences);
|
||||
|
||||
final String[][] tokens = new String[sentences.length][];
|
||||
final int[][] separators = new int[sentences.length][];
|
||||
final String[][] posTags = new String[sentences.length][];
|
||||
final String[][] tokensLc = new String[sentences.length][];
|
||||
final String[][] stemmedWords = new String[sentences.length][];
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
|
||||
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
|
||||
tokens[i] = wordsAndSeps.words;
|
||||
separators[i] = wordsAndSeps.separators;
|
||||
if (tokens[i].length > 250) {
|
||||
tokens[i] = Arrays.copyOf(tokens[i], 250);
|
||||
separators[i] = Arrays.copyOf(separators[i], 250);
|
||||
}
|
||||
for (int j = 0; j < tokens[i].length; j++) {
|
||||
while (tokens[i][j].endsWith(".")) {
|
||||
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var sPool = stringPool.get();
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
tokens[i] = sPool.internalize(tokens[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
|
||||
// don't need to internalize this
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
|
||||
tokensLc[i] = sPool.internalize(tokensLc[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
stemmedWords[i] = stemSentence(tokensLc[i]);
|
||||
stemmedWords[i] = sPool.internalize(stemmedWords[i]);
|
||||
}
|
||||
|
||||
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
String fullString;
|
||||
|
||||
if (i == 0) {
|
||||
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
|
||||
}
|
||||
else {
|
||||
fullString = "";
|
||||
}
|
||||
|
||||
ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
|
||||
|
||||
private String[] preCleanSentences(String[] sentences) {
|
||||
|
||||
if (sentences.length > 250) {
|
||||
sentences = Arrays.copyOf(sentences, 250);
|
||||
}
|
||||
@ -212,53 +212,13 @@ public class SentenceExtractor {
|
||||
sentenceList.add(s);
|
||||
}
|
||||
}
|
||||
sentences = sentenceList.toArray(String[]::new);
|
||||
|
||||
final String[][] tokens = new String[sentences.length][];
|
||||
final int[][] separators = new int[sentences.length][];
|
||||
final String[][] posTags = new String[sentences.length][];
|
||||
final String[][] tokensLc = new String[sentences.length][];
|
||||
final String[][] stemmedWords = new String[sentences.length][];
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
|
||||
var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]);
|
||||
tokens[i] = wordsAndSeps.words;
|
||||
separators[i] = wordsAndSeps.separators;
|
||||
if (tokens[i].length > 250) {
|
||||
tokens[i] = Arrays.copyOf(tokens[i], 250);
|
||||
separators[i] = Arrays.copyOf(separators[i], 250);
|
||||
}
|
||||
for (int j = 0; j < tokens[i].length; j++) {
|
||||
while (tokens[i][j].endsWith(".")) {
|
||||
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
tokensLc[i] = toLc(tokens[i]);
|
||||
}
|
||||
|
||||
for (int i = 0; i < tokens.length; i++) {
|
||||
stemmedWords[i] = stemSentence(tokensLc[i]);
|
||||
}
|
||||
|
||||
DocumentSentence[] ret = new DocumentSentence[sentences.length];
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
|
||||
}
|
||||
return ret;
|
||||
return sentenceList.toArray(String[]::new);
|
||||
}
|
||||
|
||||
private String[] stemSentence(String[] strings) {
|
||||
String[] stemmed = new String[strings.length];
|
||||
for (int i = 0; i < stemmed.length; i++) {
|
||||
var sent = cleanPossessive(strings[i]);
|
||||
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
|
||||
try {
|
||||
stemmed[i] = porterStemmer.stem(sent);
|
||||
}
|
||||
@ -269,27 +229,6 @@ public class SentenceExtractor {
|
||||
return stemmed;
|
||||
}
|
||||
|
||||
private String cleanPossessive(String s) {
|
||||
int end = s.length();
|
||||
|
||||
if (s.endsWith("\'")) {
|
||||
return s.substring(0, end-1);
|
||||
} else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
|
||||
return s.substring(0, end-2).toLowerCase();
|
||||
}
|
||||
else {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
private String[] toLc(String[] words) {
|
||||
String[] lower = new String[words.length];
|
||||
for (int i = 0; i < lower.length; i++) {
|
||||
lower[i] = cleanPossessive(words[i]).toLowerCase();
|
||||
}
|
||||
return lower;
|
||||
}
|
||||
|
||||
public String asText(Document dc) {
|
||||
|
||||
tagCleaner.clean(dc);
|
||||
@ -299,67 +238,6 @@ public class SentenceExtractor {
|
||||
return text.substring(0, (int) (text.length()*0.95));
|
||||
}
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
private static class WordsAndSeparators {
|
||||
String[] words;
|
||||
int[] separators;
|
||||
}
|
||||
|
||||
private WordsAndSeparators splitSegment(String segment) {
|
||||
var matcher = wordBreakPattern.matcher(segment);
|
||||
|
||||
List<String> words = new ArrayList<>(segment.length()/6);
|
||||
TIntArrayList separators = new TIntArrayList(segment.length()/6);
|
||||
|
||||
int start = 0;
|
||||
int wordStart = 0;
|
||||
while (wordStart <= segment.length()) {
|
||||
if (!matcher.find(wordStart)) {
|
||||
words.add(segment.substring(wordStart));
|
||||
separators.add(WordSeparator.SPACE);
|
||||
break;
|
||||
}
|
||||
|
||||
if (wordStart != matcher.start()) {
|
||||
words.add(segment.substring(wordStart, matcher.start()));
|
||||
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
|
||||
}
|
||||
wordStart = matcher.end();
|
||||
}
|
||||
|
||||
String[] parts = words.toArray(String[]::new);
|
||||
int length = 0;
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
|
||||
parts[i] = null;
|
||||
}
|
||||
else {
|
||||
length++;
|
||||
}
|
||||
}
|
||||
|
||||
String[] ret = new String[length];
|
||||
int[] seps = new int[length];
|
||||
for (int i = 0, j=0; i < parts.length; i++) {
|
||||
if (parts[i] != null) {
|
||||
seps[j] = separators.getQuick(i);
|
||||
ret[j++] = parts[i];
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < ret.length; i++) {
|
||||
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
|
||||
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
|
||||
}
|
||||
return new WordsAndSeparators(ret, seps);
|
||||
}
|
||||
|
||||
|
||||
public boolean isLegacyMode() {
|
||||
return legacyMode;
|
||||
}
|
||||
public void setLegacyMode(boolean legacyMode) {
|
||||
this.legacyMode = legacyMode;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,93 @@
|
||||
package nu.marginalia.util.language.processing.sentence;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
|
||||
public class SentenceExtractorStringUtils {
|
||||
|
||||
public static String sanitizeString(String s) {
|
||||
char[] newChars = new char[s.length()];
|
||||
int pi = 0;
|
||||
boolean changed = false;
|
||||
for (int i = 0; i < newChars.length; i++) {
|
||||
char c = s.charAt(i);
|
||||
if (!isBadChar(c)) {
|
||||
newChars[pi++] = c;
|
||||
}
|
||||
else {
|
||||
changed = true;
|
||||
newChars[pi++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
if (changed) {
|
||||
s = new String(newChars, 0, pi);
|
||||
}
|
||||
|
||||
if (s.startsWith(".")) {
|
||||
s = s.substring(1);
|
||||
}
|
||||
|
||||
if (s.isBlank()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
return s;
|
||||
|
||||
}
|
||||
|
||||
private static boolean isBadChar(char c) {
|
||||
if (c >= 'a' && c <= 'z') return false;
|
||||
if (c >= 'A' && c <= 'Z') return false;
|
||||
if (c >= '0' && c <= '9') return false;
|
||||
if ("_#@.".indexOf(c) >= 0) return false;
|
||||
if (c >= '\u00C0' && c <= '\u00D6') return false;
|
||||
if (c >= '\u00D8' && c <= '\u00F6') return false;
|
||||
if (c >= '\u00F8' && c <= '\u00FF') return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public static String normalizeSpaces(String s) {
|
||||
if (s.indexOf('\t') >= 0) {
|
||||
s = s.replace('\t', ' ');
|
||||
}
|
||||
if (s.indexOf('\n') >= 0) {
|
||||
s = s.replace('\n', ' ');
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
public static String toLowerCaseStripPossessive(String word) {
|
||||
String val = stripPossessive(word).toLowerCase();
|
||||
|
||||
if (Objects.equals(val, word)) {
|
||||
return word;
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
public static String[] toLowerCaseStripPossessive(String[] words) {
|
||||
String[] lc = new String[words.length];
|
||||
Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
|
||||
return lc;
|
||||
}
|
||||
|
||||
public static String stripPossessive(String s) {
|
||||
int end = s.length();
|
||||
|
||||
if (s.endsWith("'")) {
|
||||
return s.substring(0, end-1);
|
||||
}
|
||||
|
||||
if (s.endsWith("'s") || s.endsWith("'S")) {
|
||||
return s.substring(0, end-2);
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,72 @@
package nu.marginalia.util.language.processing.sentence;

import gnu.trove.list.array.TIntArrayList;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;

import java.util.ArrayList;
import java.util.List;

import static nu.marginalia.util.language.WordPatterns.*;

public class SentenceSegmentSplitter {


    @AllArgsConstructor
    @Getter
    public static class SeparatedSentence {
        String[] words;
        int[] separators;
    }

    public static SeparatedSentence splitSegment(String segment) {
        var matcher = wordBreakPattern.matcher(segment);

        List<String> words = new ArrayList<>(segment.length()/6);
        TIntArrayList separators = new TIntArrayList(segment.length()/6);

        int wordStart = 0;
        while (wordStart <= segment.length()) {
            if (!matcher.find(wordStart)) {
                words.add(segment.substring(wordStart));
                separators.add(WordSeparator.SPACE);
                break;
            }

            if (wordStart != matcher.start()) {
                words.add(segment.substring(wordStart, matcher.start()));
                separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
            }
            wordStart = matcher.end();
        }

        String[] parts = words.toArray(String[]::new);
        int length = 0;
        for (int i = 0; i < parts.length; i++) {
            if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
                parts[i] = null;
            }
            else {
                length++;
            }
        }

        String[] ret = new String[length];
        int[] seps = new int[length];
        for (int i = 0, j=0; i < parts.length; i++) {
            if (parts[i] != null) {
                seps[j] = separators.getQuick(i);
                ret[j++] = parts[i];
            }
        }

        for (int i = 0; i < ret.length; i++) {
            if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
            if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
        }
        return new SeparatedSentence(ret, seps);
    }


}
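A rough usage sketch of the new splitter; the sample sentence is made up, and the exact separator values depend on how wordBreakPattern from WordPatterns classifies each break:

var sentence = SentenceSegmentSplitter.splitSegment("Hello, world again");

// Lombok generates the getters; words and separators are index-aligned,
// e.g. roughly ["Hello", "world", "again"] with [COMMA, SPACE, SPACE]
for (int i = 0; i < sentence.getWords().length; i++) {
    System.out.println(sentence.getWords()[i] + " " + sentence.getSeparators()[i]);
}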
@ -1,39 +0,0 @@
package nu.marginalia.util.ranking;


public class BuggyReversePageRank extends RankingAlgorithm {


    public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
        super(domains, origins);
    }

    @Override
    RankVector createNewRankVector(RankVector rank) {

        double rankNorm = rank.norm();
        RankVector newRank = new RankVector(0);

        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {

            var links = linkDataSrc2Dest[domainId];

            if (links != null && links.size() > 0) {
                double newRankValue = 0;

                for (int j = 0; j < links.size(); j++) {
                    newRankValue += rank.get(links.getQuick(j)) / links.size();
                }

                newRank.set(domainId, 0.85*newRankValue/rankNorm);
            }
        }
        return newRank;
    }

    @Override
    void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
        originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm));
    }

}
|
@ -1,45 +0,0 @@
package nu.marginalia.util.ranking;


public class BuggyStandardPageRank extends RankingAlgorithm {

    public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
        super(domains, origins);
    }

    @Override
    RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) {
        RankVector newRank = new RankVector(0);

        for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {

            var links = linkDataSrc2Dest[domainId];
            double newRankValue = 0;

            if (links != null && links.size() > 0) {
                for (int j = 0; j < links.size(); j++) {
                    int linkedDomain = links.getQuick(j);

                    int linkSize = 1;
                    var bl = linkDataSrc2Dest[linkedDomain];
                    if (bl != null) {
                        linkSize = bl.size();
                    }

                    newRankValue += rank.get(linkedDomain) / linkSize;

                }
            }

            newRank.set(domainId, 0.85 * newRankValue);
        }
        return newRank;
    }

    @Override
    void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) {
        originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size()));
        vector.incrementAll(0.14*dNorm/vector.size());
    }

}
|
@ -1,89 +0,0 @@
|
||||
package nu.marginalia.util.ranking.tool;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
public class DedupTool {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(DedupTool.class);
|
||||
|
||||
public Set<String> originDomains = new HashSet<>();
|
||||
public Set<Integer> originDomainIds = new HashSet<>();
|
||||
public final long domainIdMax = -1;
|
||||
public int domainCount;
|
||||
private volatile static int rankMax;
|
||||
|
||||
public int maxId() {
|
||||
return (int) domainIdMax;
|
||||
}
|
||||
public int domainCount() {
|
||||
return domainCount;
|
||||
}
|
||||
|
||||
static LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
volatile static boolean running = true;
|
||||
|
||||
@AllArgsConstructor @ToString @Getter
|
||||
static class Data {
|
||||
String url;
|
||||
int id;
|
||||
String domain;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
Driver driver = new Driver();
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
|
||||
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
|
||||
|
||||
try (var conn = ds.getConnection();
|
||||
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
|
||||
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
|
||||
|
||||
) {
|
||||
fetchStmt.setFetchSize(10_000);
|
||||
var rsp = fetchStmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>())
|
||||
.computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5)));
|
||||
}
|
||||
|
||||
|
||||
List<Integer> updateIds = new ArrayList<>();
|
||||
|
||||
domainToHashToUrl.forEach((domain, hashes) -> {
|
||||
hashes.forEach((hash, urls) -> {
|
||||
if (urls.size() > 1) {
|
||||
Comparator<Data> c = Comparator.comparing(d -> d.domain.length());
|
||||
var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length()))
|
||||
.collect(Collectors.partitioningBy(d -> d.url.endsWith("/")));
|
||||
|
||||
Stream
|
||||
.concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1)
|
||||
.map(Data::getId)
|
||||
.forEach(updateIds::add);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
for (int id : updateIds) {
|
||||
updateStmt.setInt(1, id);
|
||||
updateStmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
|
||||
|
||||
import com.google.gson.*;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
import nu.marginalia.util.bigstring.BigString;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
@ -24,6 +25,8 @@ public class GsonFactory {
|
||||
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
|
||||
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
|
||||
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
|
||||
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
|
||||
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
|
||||
.serializeSpecialFloatingPointValues()
|
||||
.create();
|
||||
}
|
||||
|
@ -13,7 +13,6 @@ import nu.marginalia.wmsa.memex.MemexMain;
|
||||
import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
|
||||
import nu.marginalia.wmsa.renderer.RendererMain;
|
||||
import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
|
||||
import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
|
||||
import org.apache.logging.log4j.core.lookup.MainMapLookup;
|
||||
|
||||
import java.util.Map;
|
||||
@ -26,7 +25,6 @@ public enum ServiceDescriptor {
|
||||
AUTH("auth", 5003, AuthMain.class),
|
||||
API("api", 5004, ApiMain.class),
|
||||
|
||||
SMHI_SCRAPER("smhi-scraper",5012, SmhiScraperMain.class),
|
||||
PODCST_SCRAPER("podcast-scraper", 5013, PodcastScraperMain.class),
|
||||
|
||||
EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),
|
||||
|
@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import gnu.trove.map.hash.TLongIntHashMap;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
|
||||
@ -18,11 +19,10 @@ import javax.annotation.Nullable;
|
||||
import javax.inject.Inject;
|
||||
import javax.inject.Singleton;
|
||||
import java.io.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
@ -101,12 +101,15 @@ public class TermFrequencyDict {
|
||||
|
||||
fjp.execute(() -> {
|
||||
|
||||
TLongHashSet words = new TLongHashSet(10_000);
|
||||
|
||||
for (var doc : domain.doc) {
|
||||
|
||||
if (doc.documentBody == null)
|
||||
continue;
|
||||
docCount.incrementAndGet();
|
||||
|
||||
Document parsed = Jsoup.parse(doc.documentBody);
|
||||
Document parsed = Jsoup.parse(doc.documentBody.decode());
|
||||
parsed.body().filter(new DomPruningFilter(0.5));
|
||||
|
||||
DocumentLanguageData dld = se.get().extractSentences(parsed);
|
||||
@ -115,28 +118,30 @@ public class TermFrequencyDict {
|
||||
return;
|
||||
}
|
||||
|
||||
Set<String> words = new HashSet<>(10_000);
|
||||
|
||||
for (var sent : dld.sentences) {
|
||||
for (var word : sent) {
|
||||
words.add(word.stemmed());
|
||||
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
|
||||
}
|
||||
}
|
||||
|
||||
fjp.execute(() -> {
|
||||
synchronized (counts) {
|
||||
for (var word : words) {
|
||||
counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
|
||||
}
|
||||
}
|
||||
});
|
||||
synchronized (counts) {
|
||||
words.forEach(w -> {
|
||||
counts.adjustOrPutValue(w, 1, 1);
|
||||
return true;
|
||||
});
|
||||
}
|
||||
|
||||
words.clear();
|
||||
}
|
||||
|
||||
System.out.println(domain.domain + "\t" + counts.size());
|
||||
});
|
||||
|
||||
|
||||
}
|
||||
|
||||
fjp.shutdown();
|
||||
fjp.awaitTermination(10, TimeUnit.SECONDS);
|
||||
fjp.awaitTermination(10, TimeUnit.DAYS);
|
||||
|
||||
try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
|
||||
synchronized (counts) {
|
||||
@ -155,14 +160,6 @@ public class TermFrequencyDict {
|
||||
}
|
||||
|
||||
System.out.println(docCount.get());
|
||||
//
|
||||
// counts.forEachEntry((w,c) -> {
|
||||
// if (c > 3L) {
|
||||
// System.out.println(w + ":" + c);
|
||||
// }
|
||||
// return true;
|
||||
// });
|
||||
|
||||
}
|
||||
|
||||
public static long getStringHash(String s) {
|
||||
|
@ -46,17 +46,12 @@ public class ConverterMain {
|
||||
InstructionsCompiler compiler,
|
||||
Gson gson
|
||||
) throws Exception {
|
||||
|
||||
;
|
||||
|
||||
|
||||
|
||||
logger.info("Starting pipe");
|
||||
|
||||
try (WorkLog processLog = plan.createProcessWorkLog();
|
||||
ConversionLog log = new ConversionLog(plan.process.getDir())) {
|
||||
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
|
||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
|
||||
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {
|
||||
|
||||
@Override
|
||||
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
|
||||
|
@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain {
|
||||
|
||||
for (var doc : crawledDomain.doc) {
|
||||
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
|
||||
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
|
||||
anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex;
|
||||
import nu.marginalia.util.gregex.GuardedRegexFactory;
|
||||
import nu.marginalia.util.language.LanguageFilter;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
|
||||
@ -178,11 +178,13 @@ public class DocumentProcessor {
|
||||
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
|
||||
String documentBody = crawledDocument.documentBody.decode();
|
||||
|
||||
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
|
||||
}
|
||||
|
||||
Document doc = Jsoup.parse(crawledDocument.documentBody);
|
||||
Document doc = Jsoup.parse(documentBody);
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||
// I've never encountered a website where this hasn't been a severe indicator
|
||||
|
@ -42,7 +42,7 @@ public class DomainProcessor {
|
||||
|
||||
fixBadCanonicalTags(crawledDomain.doc);
|
||||
|
||||
StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size());
|
||||
StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
|
||||
|
||||
for (var doc : crawledDomain.doc) {
|
||||
var processedDoc = documentProcessor.process(doc, crawledDomain);
|
||||
|
@ -33,8 +33,7 @@ public class SiteWords {
|
||||
Set<String> commonSiteWords = new HashSet<>(10);
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
|
||||
EdgePageWordFlags.Subjects,
|
||||
EdgePageWordFlags.TfIdfHigh));
|
||||
EdgePageWordFlags.Subjects));
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
|
||||
EdgePageWordFlags.Title));
|
||||
|
@ -11,7 +11,7 @@ public class CommonKeywordExtractor {
|
||||
|
||||
private static final int MIN_REQUIRED_DOCUMENTS = 25;
|
||||
|
||||
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
|
||||
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 15;
|
||||
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
|
||||
|
||||
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
|
||||
|
@ -126,6 +126,9 @@ public class LinkParser {
|
||||
if (doesUrlStringHaveProtocol(s)) {
|
||||
return s;
|
||||
}
|
||||
else if (s.startsWith("//")) { // scheme-relative URL
|
||||
return baseUrl.proto + ":" + s;
|
||||
}
|
||||
|
||||
String[] parts = paramSeparatorPattern.split(s, 2);
|
||||
String path = parts[0];
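For example, with a base URL whose proto is "https", the new scheme-relative branch above resolves a link like //example.com/style.css to https://example.com/style.css (illustrative URL, not from the source).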
|
||||
|
@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.crawling;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import jdkoverride.LargeLineBufferedReader;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
@ -19,61 +19,41 @@ import java.util.concurrent.TimeUnit;
|
||||
public class CrawledDomainReader {
|
||||
private final Gson gson = GsonFactory.get();
|
||||
|
||||
private final ForkJoinPool pool = new ForkJoinPool(4);
|
||||
private final ForkJoinPool pool = new ForkJoinPool(6);
|
||||
|
||||
public CrawledDomainReader() {
|
||||
}
|
||||
|
||||
public CrawledDomain read(Path path) throws IOException {
|
||||
List<CrawledDocument> docs = new ArrayList<>();
|
||||
CrawledDomain domain = null;
|
||||
DomainDataAssembler domainData = new DomainDataAssembler();
|
||||
|
||||
try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (line.startsWith("//")) {
|
||||
String identifier = line;
|
||||
String data = br.readLine();
|
||||
|
||||
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
|
||||
br.mark(2);
|
||||
boolean legacy = '{' == br.read();
|
||||
br.reset();
|
||||
|
||||
if (legacy) {
|
||||
domain = gson.fromJson(br, CrawledDomain.class);
|
||||
}
|
||||
else {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
if (line.startsWith("//")) {
|
||||
String nextLine = br.readLine();
|
||||
if (nextLine == null) break;
|
||||
|
||||
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
|
||||
domain = gson.fromJson(nextLine, CrawledDomain.class);
|
||||
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
||||
pool.execute(() -> {
|
||||
var doc = gson.fromJson(nextLine, CrawledDocument.class);
|
||||
synchronized (docs) {
|
||||
docs.add(doc);
|
||||
}
|
||||
});
|
||||
}
|
||||
} else if (line.charAt(0) == '{') {
|
||||
domain = gson.fromJson(line, CrawledDomain.class);
|
||||
}
|
||||
pool.execute(() -> deserializeLine(identifier, data, domainData));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pool.awaitQuiescence(10, TimeUnit.SECONDS);
|
||||
while (!pool.awaitQuiescence(1, TimeUnit.SECONDS));
|
||||
|
||||
if (domain == null) {
|
||||
return null;
|
||||
return domainData.assemble();
|
||||
}
|
||||
|
||||
|
||||
private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) {
|
||||
if (null == data) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!docs.isEmpty()) {
|
||||
if (domain.doc == null)
|
||||
domain.doc = new ArrayList<>();
|
||||
|
||||
domain.doc.addAll(docs);
|
||||
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
|
||||
assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class));
|
||||
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
|
||||
assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class));
|
||||
}
|
||||
return domain;
|
||||
}
|
||||
|
||||
public CrawledDomain readRuntimeExcept(Path path) {
|
||||
@ -84,4 +64,27 @@ public class CrawledDomainReader {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static class DomainDataAssembler {
|
||||
private CrawledDomain domainPrototype;
|
||||
private final List<CrawledDocument> docs = new ArrayList<>();
|
||||
|
||||
public synchronized void acceptDomain(CrawledDomain domain) {
|
||||
this.domainPrototype = domain;
|
||||
}
|
||||
|
||||
public synchronized void acceptDoc(CrawledDocument doc) {
|
||||
docs.add(doc);
|
||||
}
|
||||
|
||||
public synchronized CrawledDomain assemble() {
|
||||
if (!docs.isEmpty()) {
|
||||
if (domainPrototype.doc == null)
|
||||
domainPrototype.doc = new ArrayList<>();
|
||||
|
||||
domainPrototype.doc.addAll(docs);
|
||||
}
|
||||
return domainPrototype;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -24,7 +24,7 @@ public class UrlBlocklist {
        patterns.add(s -> s.contains("-download-free"));

        // long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
        patterns.add(GuardedRegexFactory.minLength(48, ".*/[^/]*[a-f0-9]{32,}(/|$)"));
        patterns.add(this::hashTest);

        // link farms &c
        patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
@ -38,6 +38,33 @@

    }

    public boolean hashTest(String path) {
        // look for strings that might be a git hash (i.e. long hexadecimal strings);
        // there is no good guard for a regular expression for this, so hand-rolling this
        // is necessary

        int runLength = 0;
        int minLength = 32;

        if (path.length() <= minLength + 2)
            return false;

        for (int i = 0; i < path.length(); i++) {
            int c = path.charAt(i);

            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) {
                runLength++;
            }
            else if (runLength >= minLength) {
                return true;
            }
            else {
                runLength = 0;
            }
        }
        return runLength >= minLength;
    }
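A rough sketch of what the hand-rolled guard catches; the example paths below are made up:

UrlBlocklist blocklist = new UrlBlocklist();

// a 40-character hexadecimal run (here a fabricated git object id) trips the check
blocklist.hashTest("/blob/3f786850e387550fdab836ed7e6dc881de23001b/README.md");   // true

// short hex-ish runs in ordinary paths do not
blocklist.hashTest("/articles/2022/index.html");                                  // false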
|
||||
|
||||
public boolean isUrlBlocked(EdgeUrl url) {
|
||||
try {
|
||||
if (badDomains.contains(url.domain.domain)) {
|
||||
|
@ -1,6 +1,8 @@
|
||||
package nu.marginalia.wmsa.edge.crawling.model;
|
||||
|
||||
import lombok.Builder;
|
||||
import nu.marginalia.util.bigstring.BigString;
|
||||
import nu.marginalia.util.bigstring.CompressedBigString;
|
||||
|
||||
@Builder
|
||||
public class CrawledDocument implements SerializableCrawlData {
|
||||
@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData {
|
||||
public String crawlerStatusDesc;
|
||||
|
||||
public String headers;
|
||||
public String documentBody;
|
||||
|
||||
public BigString documentBody;
|
||||
public String documentBodyHash;
|
||||
|
||||
public String canonicalUrl;
|
||||
|
@ -206,7 +206,7 @@ public class CrawlerRetreiver {
|
||||
|
||||
if (doc.documentBody != null) {
|
||||
|
||||
doc.documentBodyHash = createHash(doc.documentBody);
|
||||
doc.documentBodyHash = createHash(doc.documentBody.decode());
|
||||
|
||||
Optional<Document> parsedDoc = parseDoc(doc);
|
||||
EdgeUrl url = new EdgeUrl(doc.url);
|
||||
@ -251,7 +251,7 @@ public class CrawlerRetreiver {
|
||||
private Optional<Document> parseDoc(CrawledDocument doc) {
|
||||
if (doc.documentBody == null)
|
||||
return Optional.empty();
|
||||
return Optional.of(Jsoup.parse(doc.documentBody));
|
||||
return Optional.of(Jsoup.parse(doc.documentBody.decode()));
|
||||
}
|
||||
|
||||
public boolean isSameDomain(EdgeUrl url) {
|
||||
|
@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.util.bigstring.BigString;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
|
||||
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
|
||||
@ -271,7 +272,7 @@ public class HttpFetcher {
|
||||
.canonicalUrl(canonical)
|
||||
.httpStatus(rsp.code())
|
||||
.url(responseUrl.toString())
|
||||
.documentBody(strData)
|
||||
.documentBody(BigString.encode(strData))
|
||||
.build();
|
||||
}
|
||||
|
||||
@ -325,7 +326,7 @@ public class HttpFetcher {
|
||||
|
||||
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
|
||||
return robotsParser.parseContent(doc.url,
|
||||
doc.documentBody.getBytes(StandardCharsets.UTF_8),
|
||||
doc.documentBody.getBytes(),
|
||||
doc.contentType,
|
||||
userAgent);
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
@ -9,14 +10,16 @@ import java.io.IOException;
|
||||
public class EdgeIndexControl {
|
||||
|
||||
private final IndexServicesFactory servicesFactory;
|
||||
private final EdgeIndexSearchSetsService searchSetsService;
|
||||
|
||||
@Inject
|
||||
public EdgeIndexControl(IndexServicesFactory servicesFactory) {
|
||||
public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) {
|
||||
this.servicesFactory = servicesFactory;
|
||||
this.searchSetsService = searchSetsService;
|
||||
}
|
||||
|
||||
public void regenerateIndex() throws IOException {
|
||||
servicesFactory.convertIndex();
|
||||
servicesFactory.convertIndex(searchSetsService.getDomainRankings());
|
||||
|
||||
System.gc();
|
||||
}
|
||||
|
@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule {
|
||||
|
||||
|
||||
public void configure() {
|
||||
if (Boolean.getBoolean("small-ram")) {
|
||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
|
||||
}
|
||||
else {
|
||||
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Provides
|
||||
|
@ -5,11 +5,11 @@ import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
||||
import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
|
||||
import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
|
||||
import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
|
||||
@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
|
||||
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
|
||||
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
|
||||
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -33,7 +34,6 @@ import java.util.concurrent.Callable;
|
||||
@Singleton
|
||||
public class IndexServicesFactory {
|
||||
private final Path tmpFileDir;
|
||||
private final EdgeDomainBlacklist domainBlacklist;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@ -48,7 +48,6 @@ public class IndexServicesFactory {
|
||||
private final PartitionedDataFile revPrioIndexWords;
|
||||
|
||||
private volatile static KeywordLexicon keywordLexicon;
|
||||
private final Long dictionaryHashMapSize;
|
||||
|
||||
private final Path searchSetsBase;
|
||||
|
||||
@ -59,14 +58,10 @@ public class IndexServicesFactory {
|
||||
public IndexServicesFactory(
|
||||
@Named("tmp-file-dir") Path tmpFileDir,
|
||||
@Named("partition-root-slow") Path partitionRootSlow,
|
||||
@Named("partition-root-fast") Path partitionRootFast,
|
||||
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
|
||||
EdgeDomainBlacklist domainBlacklist
|
||||
@Named("partition-root-fast") Path partitionRootFast
|
||||
) throws IOException {
|
||||
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.dictionaryHashMapSize = dictionaryHashMapSize;
|
||||
this.domainBlacklist = domainBlacklist;
|
||||
|
||||
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
|
||||
this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
|
||||
@ -98,7 +93,7 @@ public class IndexServicesFactory {
|
||||
public KeywordLexicon getKeywordLexicon() {
|
||||
if (keywordLexicon == null) {
|
||||
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
|
||||
keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize));
|
||||
keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create());
|
||||
}
|
||||
return keywordLexicon;
|
||||
}
|
||||
@ -109,15 +104,15 @@ public class IndexServicesFactory {
|
||||
|
||||
}
|
||||
|
||||
public void convertIndex() throws IOException {
|
||||
convertForwardIndex();
|
||||
convertFullReverseIndex();
|
||||
convertPriorityReverseIndex();
|
||||
public void convertIndex(DomainRankings domainRankings) throws IOException {
|
||||
convertForwardIndex(domainRankings);
|
||||
convertFullReverseIndex(domainRankings);
|
||||
convertPriorityReverseIndex(domainRankings);
|
||||
|
||||
|
||||
}
|
||||
|
||||
private void convertFullReverseIndex() throws IOException {
|
||||
private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException {
|
||||
|
||||
logger.info("Converting full reverse index");
|
||||
|
||||
@ -125,6 +120,7 @@ public class IndexServicesFactory {
|
||||
var journalReader = new SearchIndexJournalReaderSingleFile(longArray);
|
||||
var converter = new ReverseIndexConverter(tmpFileDir,
|
||||
journalReader,
|
||||
domainRankings,
|
||||
revIndexWords.get(NEXT_PART).toPath(),
|
||||
revIndexDoc.get(NEXT_PART).toPath());
|
||||
|
||||
@ -133,7 +129,7 @@ public class IndexServicesFactory {
|
||||
tryGc();
|
||||
}
|
||||
|
||||
private void convertPriorityReverseIndex() throws IOException {
|
||||
private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException {
|
||||
|
||||
logger.info("Converting priority reverse index");
|
||||
|
||||
@ -143,6 +139,7 @@ public class IndexServicesFactory {
|
||||
|
||||
var converter = new ReverseIndexConverter(tmpFileDir,
|
||||
journalReader,
|
||||
domainRankings,
|
||||
revPrioIndexWords.get(NEXT_PART).toPath(),
|
||||
revPrioIndexDoc.get(NEXT_PART).toPath());
|
||||
|
||||
@ -151,13 +148,14 @@ public class IndexServicesFactory {
|
||||
tryGc();
|
||||
}
|
||||
|
||||
private void convertForwardIndex() throws IOException {
|
||||
private void convertForwardIndex(DomainRankings domainRankings) throws IOException {
|
||||
logger.info("Converting forward index data");
|
||||
|
||||
new ForwardIndexConverter(tmpFileDir,
|
||||
new ForwardIndexConverter(
|
||||
writerIndexFile.get(0),
|
||||
fwdIndexDocId.get(NEXT_PART).toPath(),
|
||||
fwdIndexDocData.get(NEXT_PART).toPath())
|
||||
fwdIndexDocData.get(NEXT_PART).toPath(),
|
||||
domainRankings)
|
||||
.convert();
|
||||
|
||||
tryGc();
|
||||
@ -215,8 +213,8 @@ public class IndexServicesFactory {
|
||||
}
|
||||
}
|
||||
|
||||
public SearchIndex createIndexBucket() {
|
||||
return new SearchIndex(this, new EdgeIndexControl(this));
|
||||
public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) {
|
||||
return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService));
|
||||
}
|
||||
|
||||
public SearchIndexReader getSearchIndexReader() throws IOException {
|
||||
|
@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
|
||||
@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
|
||||
|
||||
@Inject
|
||||
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
|
||||
long hashMapSize = 1L << 31;
|
||||
|
||||
if (Boolean.getBoolean("small-ram")) {
|
||||
hashMapSize = 1L << 27;
|
||||
}
|
||||
|
||||
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
|
||||
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
|
||||
lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create());
|
||||
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
|
||||
}
|
||||
|
||||
@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
|
||||
String word = words[i];
|
||||
|
||||
long id = lexicon.getOrInsert(word);
|
||||
if (id != DictionaryHashMap.NO_VALUE) {
|
||||
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
ids[putIdx++] = id;
|
||||
ids[putIdx++] = meta[i];
|
||||
}
|
||||
|
@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import io.prometheus.client.Gauge;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.DictionaryMap;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||
import org.slf4j.Logger;
|
||||
@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable {
|
||||
private int getOrInsert(byte[] bytes) {
|
||||
if (bytes.length >= Byte.MAX_VALUE) {
|
||||
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
|
||||
return DictionaryHashMap.NO_VALUE;
|
||||
return DictionaryMap.NO_VALUE;
|
||||
}
|
||||
|
||||
final long key = hashFunction.hashBytes(bytes).padToLong();
|
||||
|
@ -8,7 +8,8 @@ import java.util.Set;
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public record EdgePageDocumentsMetadata(int encSize,
|
||||
public record EdgePageDocumentsMetadata(int rank,
|
||||
int encSize,
|
||||
int topology,
|
||||
int year,
|
||||
int sets,
|
||||
@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize,
|
||||
byte flags) {
|
||||
|
||||
|
||||
public static final long RANK_MASK = 0xFFL;
|
||||
public static final int RANK_SHIFT = 48;
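    // the rank byte occupies bits 48-55 of the metadata long; ENCSIZE_SHIFT is moved down to 40 below so the two fields no longer overlap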
|
||||
|
||||
public static final long ENCSIZE_MASK = 0xFFL;
|
||||
public static final int ENCSIZE_SHIFT = 48;
|
||||
public static final int ENCSIZE_SHIFT = 40;
|
||||
public static final int ENCSIZE_MULTIPLIER = 50;
|
||||
|
||||
public static final long TOPOLOGY_MASK = 0xFFL;
|
||||
|
||||
public static final int TOPOLOGY_SHIFT = 32;
|
||||
@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize,
|
||||
this(defaultValue());
|
||||
}
|
||||
public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
|
||||
this(0, topology, year, sets, quality, encodeFlags(flags));
|
||||
this(0, 0, topology, year, sets, quality, encodeFlags(flags));
|
||||
}
|
||||
|
||||
public EdgePageDocumentsMetadata withSize(int size) {
|
||||
@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize,
|
||||
|
||||
final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
|
||||
|
||||
return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags);
|
||||
return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
|
||||
}
|
||||
|
||||
private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
|
||||
@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize,
|
||||
}
|
||||
|
||||
public EdgePageDocumentsMetadata(long value) {
|
||||
this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
|
||||
this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
|
||||
(int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
|
||||
(int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
|
||||
(int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
|
||||
(int) ((value >>> SETS_SHIFT) & SETS_MASK),
|
||||
@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize,
|
||||
ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
|
||||
ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
|
||||
ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT;
|
||||
ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0;
|
||||
return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
|
||||
}
|
||||
|
||||
public static int decodeQuality(long encoded) {
|
||||
@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize,
|
||||
return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK);
|
||||
}
|
||||
|
||||
public static int decodeRank(long encoded) {
|
||||
return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
|
||||
}
|
||||
|
||||
public static long encodeRank(long encoded, int rank) {
|
||||
return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,4 @@
|
||||
package nu.marginalia.wmsa.edge.index.model;
|
||||
|
||||
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
|
||||
}
|
@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model;
|
||||
public enum QueryStrategy {
|
||||
SENTENCE,
|
||||
TOPIC,
|
||||
|
||||
REQUIRE_FIELD_SITE,
|
||||
REQUIRE_FIELD_TITLE,
|
||||
REQUIRE_FIELD_SUBJECT,
|
||||
|
||||
AUTO
|
||||
}
|
||||
|
@ -0,0 +1,43 @@
|
||||
package nu.marginalia.wmsa.edge.index.postings;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import static java.lang.Math.max;
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class DomainRankings {
|
||||
private final Int2ShortOpenHashMap rankings;
|
||||
|
||||
private final int MAX_MEANINGFUL_RANK = 50_000;
|
||||
private final int MAX_RANK_VALUE = 255;
|
||||
private final int MIN_RANK_VALUE = 1;
|
||||
private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK;
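    // e.g. a raw rank of 25_000 scales to about 127 here; anything at or above MAX_MEANINGFUL_RANK saturates at MAX_RANK_VALUE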
|
||||
|
||||
public DomainRankings() {
|
||||
rankings = new Int2ShortOpenHashMap();
|
||||
}
|
||||
public DomainRankings(Int2IntOpenHashMap values) {
|
||||
rankings = new Int2ShortOpenHashMap(values.size());
|
||||
values.forEach(this::putRanking);
|
||||
}
|
||||
|
||||
private void putRanking(int domainId, int value) {
|
||||
rankings.put(domainId, scaleRank(value));
|
||||
}
|
||||
|
||||
private short scaleRank(int value) {
|
||||
double rankScaled = RANK_SCALING_FACTOR * value;
|
||||
return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled));
|
||||
}
|
||||
|
||||
public int getRanking(int domainId) {
|
||||
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return rankings.size();
|
||||
}
|
||||
}
|
@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.index.query.IndexQueryParams;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
|
||||
@ -17,6 +19,7 @@ import java.util.Objects;
|
||||
public class IndexResultValuator {
|
||||
private final IndexMetadataService metadataService;
|
||||
private final List<List<String>> searchTermVariants;
|
||||
private final IndexQueryParams queryParams;
|
||||
private final int[] termIdsAll;
|
||||
|
||||
private final TLongHashSet resultsWithPriorityTerms;
|
||||
@ -24,9 +27,10 @@ public class IndexResultValuator {
|
||||
private final TObjectIntHashMap<String> termToId = new TObjectIntHashMap<>(10, 0.75f, -1);
|
||||
private final TermMetadata termMetadata;
|
||||
|
||||
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries) {
|
||||
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries, IndexQueryParams queryParams) {
|
||||
this.metadataService = new IndexMetadataService(indexes);
|
||||
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
||||
this.queryParams = queryParams;
|
||||
|
||||
var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader());
|
||||
IntArrayList termIdsList = new IntArrayList();
|
||||
@ -114,10 +118,15 @@ public class IndexResultValuator {
|
||||
docMetadata,
|
||||
resultsWithPriorityTerms.contains(searchResult.combinedId)
|
||||
);
|
||||
|
||||
searchResult.scores.add(score);
|
||||
|
||||
setScore += score.termValue();
|
||||
|
||||
if (!filterRequired(metadata, queryParams.queryStrategy())) {
|
||||
setScore += 1000;
|
||||
}
|
||||
|
||||
if (termIdx == 0) {
|
||||
setScore += score.documentValue();
|
||||
}
|
||||
@ -130,6 +139,19 @@ public class IndexResultValuator {
|
||||
return setScore/setSize;
|
||||
}
|
||||
|
||||
private boolean filterRequired(long metadata, QueryStrategy queryStrategy) {
|
||||
if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
|
||||
return EdgePageWordFlags.Site.isPresent(metadata);
|
||||
}
|
||||
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
|
||||
return EdgePageWordFlags.Subjects.isPresent(metadata);
|
||||
}
|
||||
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
|
||||
return EdgePageWordFlags.Title.isPresent(metadata);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap<String> termToId, List<String> termList) {
|
||||
long maskDirectGenerous = ~0;
|
||||
long maskDirectRaw = ~0;
|
||||
@ -139,6 +161,9 @@ public class IndexResultValuator {
|
||||
| EdgePageWordFlags.Subjects.asBit()
|
||||
| EdgePageWordFlags.Synthetic.asBit();
|
||||
|
||||
int termCount = 0;
|
||||
double tfIdfSum = 1.;
|
||||
|
||||
for (String term : termList) {
|
||||
var meta = termMetadata.getTermMetadata(termToId.get(term), urlId);
|
||||
long positions;
|
||||
@ -156,18 +181,22 @@ public class IndexResultValuator {
|
||||
maskDirectGenerous &= positions;
|
||||
}
|
||||
|
||||
termCount++;
|
||||
tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta);
|
||||
}
|
||||
|
||||
double avgTfIdf = termCount / tfIdfSum;
|
||||
|
||||
if (maskAdjacent == 0) {
|
||||
return 40;
|
||||
return Math.max(-2, 40 - 0.5 * avgTfIdf);
|
||||
}
|
||||
|
||||
if (maskDirectGenerous == 0) {
|
||||
return 20;
|
||||
return Math.max(-1, 20 - 0.3 * avgTfIdf);
|
||||
}
|
||||
|
||||
if (maskDirectRaw == 0) {
|
||||
return 2;
|
||||
return Math.max(-1, 15 - 0.2 * avgTfIdf);
|
||||
}
|
||||
|
||||
return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous);
|
||||
|
@ -92,7 +92,8 @@ public class SearchIndex {
|
||||
SearchIndexReader.IndexQueryBuilder query =
|
||||
switch(params.queryStrategy()) {
|
||||
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
|
||||
case TOPIC -> indexReader.findWordAsTopic(orderedIncludes);
|
||||
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
|
||||
-> indexReader.findWordAsTopic(orderedIncludes);
|
||||
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
|
||||
};
|
||||
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -26,13 +27,14 @@ public class SearchIndexControl {
|
||||
|
||||
@Inject
|
||||
public SearchIndexControl(IndexServicesFactory servicesFactory,
|
||||
EdgeOpsLockService opsLockService) {
|
||||
EdgeOpsLockService opsLockService,
|
||||
EdgeIndexSearchSetsService searchSetsService) {
|
||||
this.servicesFactory = servicesFactory;
|
||||
|
||||
this.primaryIndexWriter = servicesFactory.getIndexWriter(0);
|
||||
this.secondaryIndexWriter = servicesFactory.getIndexWriter(1);
|
||||
|
||||
index = servicesFactory.createIndexBucket();
|
||||
index = servicesFactory.createIndexBucket(searchSetsService);
|
||||
this.opsLockService = opsLockService;
|
||||
}
|
||||
|
||||
|
@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
|
||||
import com.upserve.uppend.blobs.NativeIO;
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import nu.marginalia.util.array.LongArray;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
|
||||
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
|
||||
import org.roaringbitmap.IntConsumer;
|
||||
@ -18,26 +20,26 @@ import java.nio.file.Path;
|
||||
import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*;
|
||||
|
||||
public class ForwardIndexConverter {
|
||||
private static final int RWF_BIN_SIZE = 10_000_000;
|
||||
|
||||
private final Path tmpFileDir;
|
||||
private final File inputFile;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final Path outputFileDocsId;
|
||||
private final Path outputFileDocsData;
|
||||
private final DomainRankings domainRankings;
|
||||
|
||||
|
||||
public ForwardIndexConverter(Path tmpFileDir,
|
||||
public ForwardIndexConverter(
|
||||
File inputFile,
|
||||
Path outputFileDocsId,
|
||||
Path outputFileDocsData
|
||||
Path outputFileDocsData,
|
||||
DomainRankings domainRankings
|
||||
) {
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.inputFile = inputFile;
|
||||
this.outputFileDocsId = outputFileDocsId;
|
||||
this.outputFileDocsData = outputFileDocsData;
|
||||
this.domainRankings = domainRankings;
|
||||
}
|
||||
|
||||
public void convert() throws IOException {
|
||||
@ -50,6 +52,8 @@ public class ForwardIndexConverter {
|
||||
|
||||
logger.info("Converting {} {}",inputFile, journalReader.fileHeader);
|
||||
|
||||
logger.info("Domain Rankings size = {}", domainRankings.size());
|
||||
|
||||
try {
|
||||
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
|
||||
|
||||
@ -68,7 +72,10 @@ public class ForwardIndexConverter {
|
||||
journalReader.forEach(entry -> {
|
||||
long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId());
|
||||
|
||||
docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta());
|
||||
int ranking = domainRankings.getRanking(entry.domainId());
|
||||
long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
|
||||
|
||||
docFileData.set(entryOffset + METADATA_OFFSET, meta);
|
||||
docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId());
|
||||
});
|
||||
|
||||
|
@ -29,20 +29,30 @@ public class ForwardIndexReader {
|
||||
|
||||
logger.info("Switching forward index");
|
||||
|
||||
ids = loadIds(idsFile);
|
||||
data = loadData(dataFile);
|
||||
}
|
||||
|
||||
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
|
||||
var idsArray = LongArray.mmapRead(idsFile);
|
||||
idsArray.advice(NativeIO.Advice.Sequential);
|
||||
|
||||
ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
|
||||
var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
|
||||
|
||||
// This hash table should be of the same size as the number of documents, so typically less than 1 Gb
|
||||
idsArray.forEach(0, idsArray.size(), (pos, val) -> {
|
||||
ids.put(val, (int) pos);
|
||||
});
|
||||
|
||||
data = LongArray.mmapRead(dataFile);
|
||||
return ids;
|
||||
}
|
||||
|
||||
private static LongArray loadData(Path dataFile) throws IOException {
|
||||
var data = LongArray.mmapRead(dataFile);
|
||||
|
||||
data.advice(NativeIO.Advice.Random);
|
||||
|
||||
return data;
|
||||
}
|
||||
|
||||
private int idxForDoc(long docId) {
|
||||
@ -55,6 +65,7 @@ public class ForwardIndexReader {
|
||||
|
||||
return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
|
||||
}
|
||||
|
||||
public int getDomainId(long docId) {
|
||||
long offset = idxForDoc(docId);
|
||||
if (offset < 0) return 0;
|
||||
|
@ -16,7 +16,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
|
||||
@Override
|
||||
public boolean test(long docId) {
|
||||
var post = forwardIndexReader.docPost(docId);
|
||||
var post = forwardIndexReader.docPost(docId & 0xFFFF_FFFFL);
|
||||
|
||||
if (!validateDomain(post)) {
|
||||
return false;
|
||||
@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
if (!validateSize(post)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!validateRank(post)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
|
||||
return limit.test(quality);
|
||||
}
|
||||
|
||||
private boolean validateYear(ForwardIndexReader.DocPost post) {
|
||||
if (params.year().type() == SpecificationLimitType.NONE)
|
||||
return true;
|
||||
@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
|
||||
return params.size().test(postVal);
|
||||
}
|
||||
|
||||
private boolean validateRank(ForwardIndexReader.DocPost post) {
|
||||
if (params.rank().type() == SpecificationLimitType.NONE)
|
||||
return true;
|
||||
|
||||
int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
|
||||
|
||||
return params.rank().test(postVal);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double cost() {
|
||||
return 32;
|
||||
|
@ -8,6 +8,7 @@ import nu.marginalia.util.array.functional.LongBinaryIOOperation;
|
||||
import nu.marginalia.util.array.functional.LongIOTransformer;
|
||||
import nu.marginalia.util.array.functional.LongTransformer;
|
||||
import nu.marginalia.util.btree.BTreeWriter;
|
||||
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
|
||||
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
|
||||
@ -32,18 +33,22 @@ public class ReverseIndexConverter {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final SearchIndexJournalReaderSingleFile journalReader;
|
||||
private final DomainRankings domainRankings;
|
||||
private final Path outputFileWords;
|
||||
private final Path outputFileDocs;
|
||||
|
||||
private final SortingContext sortingContext;
|
||||
|
||||
public ReverseIndexConverter(Path tmpFileDir,
|
||||
SearchIndexJournalReaderSingleFile journalReader,
|
||||
DomainRankings domainRankings,
|
||||
Path outputFileWords,
|
||||
Path outputFileDocs) {
|
||||
this.tmpFileDir = tmpFileDir;
|
||||
this.journalReader = journalReader;
|
||||
this.domainRankings = domainRankings;
|
||||
this.outputFileWords = outputFileWords;
|
||||
this.outputFileDocs = outputFileDocs;
|
||||
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
|
||||
}
|
||||
|
||||
public void convert() throws IOException {
|
||||
@ -56,7 +61,7 @@ public class ReverseIndexConverter {
|
||||
final SearchIndexJournalStatistics statistics = journalReader.getStatistics();
|
||||
|
||||
final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
|
||||
SortingContext sortingContext = new SortingContext(tmpFileDir, 64_000);
|
||||
|
||||
|
||||
try {
|
||||
final long wordsFileSize = statistics.highestWord() + 1;
|
||||
@ -187,7 +192,7 @@ public class ReverseIndexConverter {
|
||||
}
|
||||
}
|
||||
|
||||
private static class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record>, AutoCloseable {
|
||||
private class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record>, AutoCloseable {
|
||||
|
||||
private final LongArray wordRangeEnds;
|
||||
private final IntArray wordRangeOffset;
|
||||
@ -205,12 +210,26 @@ public class ReverseIndexConverter {
|
||||
|
||||
@Override
|
||||
public void accept(long docId, SearchIndexJournalEntry.Record record) {
|
||||
final long urlId = docId & 0xFFFF_FFFFL;
|
||||
final int wordId = record.wordId();
|
||||
|
||||
/* Encode the ID as
|
||||
*
|
||||
* 32 bits 32 bits
|
||||
* [ ranking | url-id ]
|
||||
*
|
||||
* in order to get low-ranking documents to be considered first
|
||||
* when sorting the items.
|
||||
*/
|
||||
|
||||
int domainId = (int) (docId >>> 32);
|
||||
long rankingId = (long) domainRankings.getRanking(domainId) << 32;
|
||||
|
||||
int urlId = (int) (docId & 0xFFFF_FFFFL);
|
||||
long rankEncodedId = rankingId | urlId;
|
||||
|
||||
final int wordId = record.wordId();
|
||||
long offset = startOfRange(wordId);
|
||||
|
||||
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), urlId);
|
||||
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
|
||||
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata());
|
||||
|
||||
}
|
||||
|
@ -47,18 +47,6 @@ public class ReverseIndexPrioReader {
|
||||
return new ReverseIndexEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER);
|
||||
}
|
||||
|
||||
public int numDocuments(int wordId) {
|
||||
if (wordId < 0)
|
||||
return 0;
|
||||
|
||||
long offset = words.get(wordId);
|
||||
|
||||
if (offset < 0)
|
||||
return 0;
|
||||
|
||||
return createReaderNew(offset).numEntries();
|
||||
}
|
||||
|
||||
private BTreeReader createReaderNew(long offset) {
|
||||
return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset);
|
||||
}
|
||||
|
@ -53,6 +53,11 @@ public class ReverseIndexReader {
|
||||
}
|
||||
|
||||
public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
|
||||
if (null == words) {
|
||||
logger.warn("Reverse index is not ready, dropping query");
|
||||
return new EmptyEntrySource();
|
||||
}
|
||||
|
||||
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
|
||||
|
||||
long offset = words.get(wordId);
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
|
||||
public record IndexQueryParams(SpecificationLimit qualityLimit,
|
||||
SpecificationLimit year,
|
||||
SpecificationLimit size,
|
||||
SpecificationLimit rank,
|
||||
SearchSet searchSet,
|
||||
QueryStrategy queryStrategy
|
||||
)
|
||||
|
@ -1,21 +1,21 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TIntObjectHashMap;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.function.IntToDoubleFunction;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public abstract class RankingAlgorithm {
|
||||
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
|
||||
@ -85,6 +85,10 @@ public abstract class RankingAlgorithm {
|
||||
logger.info("Origin Domains: {}", originDomainIds.size());
|
||||
}
|
||||
|
||||
public RankingDomainData getDomainData(int id) {
|
||||
return domainsById.get(id);
|
||||
}
|
||||
|
||||
public void addPeripheralNodes() {
|
||||
|
||||
int newNodesIdxCutoff = domainIdToIndex.size();
|
||||
@ -133,29 +137,7 @@ public abstract class RankingAlgorithm {
|
||||
return domainsById.size();
|
||||
}
|
||||
|
||||
|
||||
public RankVector pageRankVector() {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
for (int i = 0; i < iter_max; i++) {
|
||||
RankVector newRank = createNewRankVector(rank);
|
||||
|
||||
double oldNorm = rank.norm();
|
||||
double newNorm = newRank.norm();
|
||||
double dNorm = oldNorm - newNorm ;
|
||||
if (i < iter_max-1) {
|
||||
adjustRankVector(newRank, dNorm, oldNorm);
|
||||
}
|
||||
|
||||
rank = newRank;
|
||||
}
|
||||
|
||||
return rank;
|
||||
}
|
||||
|
||||
|
||||
public RoaringBitmap pageRank(int resultCount) {
|
||||
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
@ -174,10 +156,10 @@ public abstract class RankingAlgorithm {
|
||||
}
|
||||
|
||||
|
||||
return rank.getRanking(resultCount);
|
||||
return rank.getRanking(resultCount, accumulatorP).get();
|
||||
}
|
||||
|
||||
public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
|
||||
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
@ -201,32 +183,11 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
logger.info("PRWPN iteration done");
|
||||
|
||||
return rank.getRanking(resultCount);
|
||||
return rank.getRanking(resultCount, accumulatorP).get();
|
||||
}
|
||||
|
||||
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
|
||||
|
||||
public TIntList pageRank(IntToDoubleFunction weight, int resultCount) {
|
||||
RankVector rank = new RankVector(1.d / domainsById.size());
|
||||
|
||||
int iter_max = 100;
|
||||
for (int i = 0; i < iter_max; i++) {
|
||||
RankVector newRank = createNewRankVector(rank);
|
||||
|
||||
double oldNorm = rank.norm();
|
||||
double newNorm = newRank.norm();
|
||||
double dNorm = oldNorm - newNorm ;
|
||||
|
||||
if (i < iter_max-1) {
|
||||
adjustRankVector(newRank, dNorm, oldNorm);
|
||||
}
|
||||
|
||||
rank = newRank;
|
||||
}
|
||||
|
||||
return rank.getRanking(weight, resultCount);
|
||||
}
|
||||
|
||||
abstract RankVector createNewRankVector(RankVector rank);
|
||||
|
||||
public boolean includeInRanking(RankingDomainData data) {
|
||||
@ -245,9 +206,9 @@ public abstract class RankingAlgorithm {
|
||||
public void setMaxKnownUrls(int maxKnownUrls) {
|
||||
this.maxKnownUrls = maxKnownUrls;
|
||||
}
|
||||
|
||||
public class RankVector {
|
||||
private final double[] rank;
|
||||
|
||||
public RankVector(double defaultValue) {
|
||||
rank = new double[domainIndexToId.size()];
|
||||
if (defaultValue != 0.) {
|
||||
@ -271,9 +232,8 @@ public abstract class RankingAlgorithm {
|
||||
|
||||
public double norm() {
|
||||
double v = 0.;
|
||||
for (int i = 0; i < rank.length; i++) {
|
||||
if (rank[i] > 0) { v+=rank[i]; }
|
||||
else { v -= rank[i]; }
|
||||
for (double value : rank) {
|
||||
v += Math.abs(value);
|
||||
}
|
||||
return v;
|
||||
}
|
||||
@ -281,74 +241,39 @@ public abstract class RankingAlgorithm {
|
||||
public double norm(RankVector other) {
|
||||
double v = 0.;
|
||||
for (int i = 0; i < rank.length; i++) {
|
||||
double dv = rank[i] - other.get(i);
|
||||
|
||||
if (dv > 0) { v+=dv; }
|
||||
else { v -= dv; }
|
||||
v += Math.abs(rank[i] - other.get(i));
|
||||
}
|
||||
return v;
|
||||
}
|
||||
|
||||
public TIntList getRanking(IntToDoubleFunction other, int numResults) {
|
||||
TIntArrayList list = new TIntArrayList(numResults);
|
||||
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
|
||||
|
||||
Comparator<Integer> comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i]));
|
||||
|
||||
IntStream.range(0, rank.length)
|
||||
.boxed()
|
||||
.sorted(comparator.reversed())
|
||||
.map(domainIndexToId::get)
|
||||
.limit(numResults)
|
||||
.forEach(list::add);
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
public RoaringBitmap getRanking(int numResults) {
|
||||
if (numResults < 0) {
|
||||
numResults = domainIdToIndex.size();
|
||||
}
|
||||
if (numResults >= rank.length) {
|
||||
numResults = rank.length;
|
||||
}
|
||||
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
|
||||
|
||||
RoaringBitmap list = new RoaringBitmap();
|
||||
int[] nodes = sortOrder(rank);
|
||||
var accumulator = accumulatorP.get();
|
||||
|
||||
int[] nodes = new int[rank.length];
|
||||
Arrays.setAll(nodes, i->i);
|
||||
IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
|
||||
IntArrays.quickSort(nodes, comp);
|
||||
|
||||
int i;
|
||||
|
||||
for (i = 0; i < numResults; i++) {
|
||||
for (int i = 0; i < numResults; i++) {
|
||||
int id = domainIndexToId.get(nodes[i]);
|
||||
|
||||
if (includeInRanking(domainsById.get(id)))
|
||||
list.add(id);
|
||||
accumulator.add(id, i);
|
||||
}
|
||||
|
||||
for (; i < nodes.length && domainsById.size() < numResults; i++) {
|
||||
int id = domainIndexToId.get(nodes[i]);
|
||||
return accumulator;
|
||||
}
|
||||
private static int[] sortOrder(double[] values) {
|
||||
|
||||
if (includeInRanking(domainsById.get(id)))
|
||||
list.add(id);
|
||||
}
|
||||
int[] ret = new int[values.length];
|
||||
Arrays.setAll(ret, i->i);
|
||||
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
|
||||
|
||||
|
||||
return list;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
public void incrementAll(double v) {
|
||||
for (int i = 0; i < rank.length; i++) {
|
||||
rank[i]+=v;
|
||||
}
|
||||
}
|
||||
|
||||
int size() {
|
||||
return domainsById.size();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -1,10 +1,12 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
|
||||
|
||||
public class BetterReversePageRank extends RankingAlgorithm {
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
|
||||
public class ReversePageRank extends RankingAlgorithm {
|
||||
|
||||
|
||||
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
public ReversePageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@ -20,8 +22,6 @@ public class BetterReversePageRank extends RankingAlgorithm {
|
||||
double newRankValue = 0;
|
||||
|
||||
if (links != null && links.size() > 0) {
|
||||
|
||||
|
||||
for (int j = 0; j < links.size(); j++) {
|
||||
var revLinks = linkDataDest2Src[links.getQuick(j)];
|
||||
newRankValue += rank.get(links.getQuick(j)) / revLinks.size();
|
@ -1,9 +1,11 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking;
|
||||
|
||||
|
||||
public class BetterStandardPageRank extends RankingAlgorithm {
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
|
||||
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
public class StandardPageRank extends RankingAlgorithm {
|
||||
|
||||
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
|
||||
super(domains, origins);
|
||||
}
|
||||
|
||||
@ -38,8 +40,7 @@ public class BetterStandardPageRank extends RankingAlgorithm {
|
||||
|
||||
@Override
|
||||
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
|
||||
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ ));
|
||||
// vector.incrementAll(0.14*dNorm/vector.size());
|
||||
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,6 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;

public interface RankingResultAccumulator<T> {
    void add(int domainId, int rank);
    T get();
}
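The ranking algorithms now take a Supplier of one of these accumulators, so each call site decides what shape the result takes without RankingAlgorithm knowing about bitmaps, lists or maps. A sketch of a caller-defined accumulator and how it would be passed in; the class below is illustrative only, while the pageRank signature is the one introduced in this change set:

    import java.util.Arrays;

    // Keeps only the first N domain ids it is offered.
    class TopNAccumulator implements RankingResultAccumulator<int[]> {
        private final int[] ids;
        private int n = 0;

        TopNAccumulator(int limit) {
            ids = new int[limit];
        }

        @Override
        public void add(int domainId, int rank) {
            if (n < ids.length) ids[n++] = domainId;
        }

        @Override
        public int[] get() {
            return Arrays.copyOf(ids, n);
        }
    }

    // Usage mirrors the call sites elsewhere in this diff:
    // int[] top = rpr.pageRankWithPeripheralNodes(1000, () -> new TopNAccumulator(1000));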
@ -0,0 +1,17 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;

import org.roaringbitmap.RoaringBitmap;

public class RankingResultBitSetAccumulator implements RankingResultAccumulator<RoaringBitmap> {
    private final RoaringBitmap result = new RoaringBitmap();

    @Override
    public void add(int domainId, int rank) {
        result.add(domainId);
    }

    @Override
    public RoaringBitmap get() {
        return result;
    }
}
@ -0,0 +1,21 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;

import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;

public class RankingResultHashMapAccumulator implements RankingResultAccumulator<Int2IntOpenHashMap> {
    private final Int2IntOpenHashMap result;

    public RankingResultHashMapAccumulator(int size) {
        result = new Int2IntOpenHashMap(size);
    }

    @Override
    public void add(int domainId, int rank) {
        result.put(domainId, rank);
    }

    @Override
    public Int2IntOpenHashMap get() {
        return result;
    }
}
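Of the three accumulators added here, the hash-map variant is the one that feeds the new DomainRankings in EdgeIndexSearchSetsService further down, roughly as:

    // domainId -> rank position, later wrapped in a DomainRankings instance
    var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2,
            () -> new RankingResultHashMapAccumulator(100_000));

The bitmap variant backs the RETRO/SMALLWEB/ACADEMIA search sets, and the list variant backs the command line ranking tools.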
@ -0,0 +1,24 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;

import gnu.trove.list.array.TIntArrayList;

public class RankingResultListAccumulator implements RankingResultAccumulator<TIntArrayList> {
    private final TIntArrayList result;

    public RankingResultListAccumulator(int size) {
        result = new TIntArrayList(size);
    }
    public RankingResultListAccumulator() {
        result = new TIntArrayList(10_000);
    }

    @Override
    public void add(int domainId, int rank) {
        result.add(domainId);
    }

    @Override
    public TIntArrayList get() {
        return result;
    }
}
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
@ -10,7 +10,7 @@ public class RankingDomainData {
|
||||
public final int id;
|
||||
public final String name;
|
||||
private int alias;
|
||||
private EdgeDomainIndexingState state;
|
||||
public EdgeDomainIndexingState state;
|
||||
public final int knownUrls;
|
||||
|
||||
public int resolveAlias() {
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.util.ranking;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
@ -11,12 +12,13 @@ import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.IntConsumer;
|
||||
|
||||
@Singleton
|
||||
public class RankingDomainFetcher {
|
||||
private final HikariDataSource dataSource;
|
||||
private final EdgeDomainBlacklistImpl blacklist;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
protected final HikariDataSource dataSource;
|
||||
protected final EdgeDomainBlacklistImpl blacklist;
|
||||
protected final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final boolean getNames = false;
|
||||
protected boolean getNames = false;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||
@ -24,6 +26,10 @@ public class RankingDomainFetcher {
|
||||
this.blacklist = blacklist;
|
||||
}
|
||||
|
||||
public void retainNames() {
|
||||
this.getNames = true;
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
String query;
|
||||
if (getNames) {
|
||||
@ -49,14 +55,19 @@ public class RankingDomainFetcher {
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
|
||||
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
|
||||
stmt.setFetchSize(10000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
|
||||
consumer.accept(
|
||||
new RankingDomainData(id,
|
||||
rsp.getString(2),
|
||||
rsp.getInt(3),
|
||||
EdgeDomainIndexingState.valueOf(rsp.getString(4)),
|
||||
rsp.getInt(5)));
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,103 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@Singleton
|
||||
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
|
||||
final boolean hasData;
|
||||
|
||||
@Inject
|
||||
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
|
||||
super(dataSource, blacklist);
|
||||
|
||||
hasData = isDomainNeighborTablePopulated(dataSource);
|
||||
}
|
||||
|
||||
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.createStatement();
|
||||
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
|
||||
|
||||
return rs.next();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
LoggerFactory
|
||||
.getLogger(RankingDomainFetcherForSimilarityData.class)
|
||||
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
public boolean hasData() {
|
||||
return hasData;
|
||||
}
|
||||
|
||||
public void eachDomainLink(DomainLinkConsumer consumer) {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
|
||||
{
|
||||
stmt.setFetchSize(10000);
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int src = rsp.getInt(1);
|
||||
int dst = rsp.getInt(2);
|
||||
|
||||
// these "links" are bidi
|
||||
consumer.accept(src, dst);
|
||||
consumer.accept(dst, src);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to fetch domain links", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void getDomains(Consumer<RankingDomainData> consumer) {
|
||||
// String query =
|
||||
// """
|
||||
// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
// FROM EC_DOMAIN
|
||||
// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
|
||||
// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID
|
||||
// GROUP BY EC_DOMAIN.ID
|
||||
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
|
||||
// """;
|
||||
|
||||
String query;
|
||||
if (getNames) {
|
||||
query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
else {
|
||||
query =
|
||||
"""
|
||||
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
|
||||
FROM EC_DOMAIN
|
||||
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||
GROUP BY EC_DOMAIN.ID
|
||||
""";
|
||||
}
|
||||
|
||||
getDomains(query, consumer);
|
||||
}
|
||||
|
||||
|
||||
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
|
||||
// This is not relevant for this variant of pagerank since it is bidirectional
|
||||
}
|
||||
|
||||
}
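The eachDomainLink method above takes a DomainLinkConsumer callback whose definition is not part of this diff; judging by how it is invoked, it is presumably a small functional interface along these lines (a sketch, not the actual declaration):

    // Hypothetical shape of the callback used by eachDomainLink().
    @FunctionalInterface
    interface DomainLinkConsumer {
        void accept(int srcDomainId, int dstDomainId);
    }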
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.util.ranking.old;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.old;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.util.ranking.old;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.old;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
@ -125,7 +125,6 @@ public class StandardPageRank {
|
||||
|
||||
final TIntArrayList empty = new TIntArrayList();
|
||||
|
||||
double rankNorm = rank.norm();
|
||||
RankVector newRank = new RankVector(0);
|
||||
|
||||
for (DomainData domain : domains.valueCollection()) {
|
||||
@ -176,8 +175,6 @@ public class StandardPageRank {
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
TIntHashSet deadEnds = new TIntHashSet(domains.size());
|
||||
}
|
||||
|
||||
private class RankVector {
|
@ -1,43 +1,30 @@
|
||||
package nu.marginalia.util.ranking.tool;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BuggyStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public class UpdateDomainRanksTool {
|
||||
public class CreateBrowseDomainRanksTool {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class);
|
||||
|
||||
public Set<String> originDomains = new HashSet<>();
|
||||
public Set<Integer> originDomainIds = new HashSet<>();
|
||||
public final long domainIdMax = -1;
|
||||
public int domainCount;
|
||||
private volatile static int rankMax;
|
||||
|
||||
public int maxId() {
|
||||
return (int) domainIdMax;
|
||||
}
|
||||
public int domainCount() {
|
||||
return domainCount;
|
||||
}
|
||||
|
||||
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
volatile static boolean running = true;
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
org.mariadb.jdbc.Driver driver = new Driver();
|
||||
Driver driver = new Driver();
|
||||
var conn = new DatabaseModule().provideConnection();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
@ -45,20 +32,21 @@ public class UpdateDomainRanksTool {
|
||||
|
||||
logger.info("Ranking");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
|
||||
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new StandardPageRank(domains, args);
|
||||
|
||||
rankMax = spr.size()*2;
|
||||
uploader.start();
|
||||
|
||||
var rankData = spr.pageRankWithPeripheralNodes(rankMax);
|
||||
for (int i : rankData) {
|
||||
var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);
|
||||
|
||||
rankData.forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
});
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
running = false;
|
||||
@ -68,24 +56,14 @@ public class UpdateDomainRanksTool {
|
||||
}
|
||||
|
||||
public static void uploadThread(HikariDataSource dataSource) {
|
||||
int i = 0;
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
logger.info("Resetting rank");
|
||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
|
||||
logger.info("Updating ranks");
|
||||
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
|
||||
try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) {
|
||||
while (running || (!running && !uploadQueue.isEmpty())) {
|
||||
var job = uploadQueue.take();
|
||||
stmt.setDouble(1, i++ / (double) rankMax);
|
||||
stmt.setInt(2, job);
|
||||
stmt.setInt(1, job);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
} catch (SQLException | InterruptedException throwables) {
|
||||
throwables.printStackTrace();
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.util.ranking.tool;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays;
|
||||
import it.unimi.dsi.fastutil.ints.IntComparator;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.RankingAlgorithm;
|
||||
import nu.marginalia.util.ranking.RankingDomainData;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@ -33,8 +33,6 @@ public class PerusePageRankV2 {
|
||||
TIntArrayList[] linkDataSrc2Dest;
|
||||
TIntArrayList[] linkDataDest2Src;
|
||||
|
||||
private static final boolean getNames = true;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);
|
@ -0,0 +1,67 @@
|
||||
package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class PrintDomainRanksTool {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class);
|
||||
|
||||
private volatile static int rankMax;
|
||||
|
||||
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
volatile static boolean running = true;
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) {
|
||||
Driver driver = new Driver();
|
||||
var conn = new DatabaseModule().provideConnection();
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
|
||||
logger.info("Ranking");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
|
||||
RankingDomainFetcher domains;
|
||||
if (Boolean.getBoolean("use-link-data")) {
|
||||
domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
domains.retainNames();
|
||||
}
|
||||
else {
|
||||
domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
domains.retainNames();
|
||||
}

        var rpr = new StandardPageRank(domains, args);

        rankMax = rpr.size();

        var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);

        AtomicInteger cnt = new AtomicInteger();
        rankData.forEach(i -> {
            var data = rpr.getDomainData(i);

            System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state);
            return true;
        });

        long end = System.currentTimeMillis();
        running = false;

        logger.info("Done in {}", (end - start)/1000.0);
    }

}
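Whether PrintDomainRanksTool ranks over the raw link graph or the similarity data is decided by a JVM system property rather than a command line argument. An illustrative invocation; the classpath setup is omitted and the origin domain is just an example:

    java -Duse-link-data=true \
         nu.marginalia.wmsa.edge.index.ranking.tool.PrintDomainRanksTool \
         memex.marginalia.nu

Without the property it uses the EC_DOMAIN_NEIGHBORS_2 similarity data via RankingDomainFetcherForSimilarityData.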
@ -1,11 +1,12 @@
|
||||
package nu.marginalia.util.ranking.tool;
|
||||
package nu.marginalia.wmsa.edge.index.ranking.tool;
|
||||
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -13,12 +14,10 @@ import org.slf4j.LoggerFactory;
|
||||
import java.sql.SQLException;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
public class UpdateDomainRanksTool2 {
|
||||
public class UpdateDomainRanksTool {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
|
||||
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
|
||||
|
||||
public final long domainIdMax = -1;
|
||||
public int domainCount;
|
||||
private volatile static int rankMax;
|
||||
|
||||
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
|
||||
@ -34,21 +33,22 @@ public class UpdateDomainRanksTool2 {
|
||||
|
||||
logger.info("Ranking");
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
|
||||
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
|
||||
|
||||
var rankVector = rpr.pageRankVector();
|
||||
rankMax = rpr.size();
|
||||
uploader.start();
|
||||
|
||||
var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
|
||||
for (int i : rankData) {
|
||||
var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
|
||||
|
||||
rankData.forEach(i -> {
|
||||
try {
|
||||
uploadQueue.put(i);
|
||||
} catch (InterruptedException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
return true;
|
||||
});
|
||||
|
||||
long end = System.currentTimeMillis();
|
||||
running = false;
|
@ -5,7 +5,7 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
|
||||
import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator;
|
||||
@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService {
|
||||
|
||||
private OptionalInt lookUpWord(String s) {
|
||||
int ret = indexes.getLexiconReader().get(s);
|
||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
return OptionalInt.of(ret);
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
@ -51,7 +51,7 @@ public class EdgeIndexLexiconService {
|
||||
|
||||
final int wordId = lr.get(word);
|
||||
|
||||
if (DictionaryHashMap.NO_VALUE == wordId) {
|
||||
if (OffHeapDictionaryHashMap.NO_VALUE == wordId) {
|
||||
response.status(404);
|
||||
return "";
|
||||
}
|
||||
@ -110,7 +110,7 @@ public class EdgeIndexLexiconService {
|
||||
String word = words[i];
|
||||
|
||||
long id = keywordLexicon.getOrInsert(word);
|
||||
if (id != DictionaryHashMap.NO_VALUE) {
|
||||
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
ids[putIdx++] = id;
|
||||
ids[putIdx++] = meta[i];
|
||||
}
|
||||
|
@ -12,7 +12,7 @@ import io.prometheus.client.Histogram;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import nu.marginalia.util.array.buffer.LongQueryBuffer;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator;
|
||||
@ -115,11 +115,13 @@ public class EdgeIndexQueryService {
        TLongHashSet consideredUrlIds;

        public SearchQuery(EdgeSearchSpecification specsSet) {
            this.fetchSize = specsSet.fetchSize;
            this.budget = new IndexSearchBudget(specsSet.timeoutMs);
            var limits = specsSet.queryLimits;

            this.fetchSize = limits.fetchSize();
            this.budget = new IndexSearchBudget(limits.timeoutMs());
            this.subqueries = specsSet.subqueries;
            this.limitByDomain = specsSet.limitByDomain;
            this.limitTotal = specsSet.limitTotal;
            this.limitByDomain = limits.resultsByDomain();
            this.limitTotal = limits.resultsTotal();

            this.consideredUrlIds = new TLongHashSet(fetchSize * 4);
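The QueryLimits type the constructor now reads from is not included in this diff; given the accessors used here and the call new QueryLimits(domainLimit, 100, 250, 4096) in QueryFactory below, it is presumably a small record roughly like the following (field order inferred from that call, not confirmed):

    // Assumed shape of QueryLimits; only the accessor names are visible in this change set.
    public record QueryLimits(int resultsByDomain,
                              int resultsTotal,
                              int timeoutMs,
                              int fetchSize) { }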
@ -127,6 +129,7 @@ public class EdgeIndexQueryService {
|
||||
specsSet.quality,
|
||||
specsSet.year,
|
||||
specsSet.size,
|
||||
specsSet.rank,
|
||||
getSearchSet(specsSet),
|
||||
specsSet.queryStrategy);
|
||||
}
|
||||
@ -151,7 +154,7 @@ public class EdgeIndexQueryService {
|
||||
}
|
||||
}
|
||||
|
||||
final var evaluator = new IndexResultValuator(indexes, results, subqueries);
|
||||
final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams);
|
||||
|
||||
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
|
||||
ArrayList<EdgeSearchResultItem> refusedItems = new ArrayList<>(results.size());
|
||||
@ -293,7 +296,7 @@ public class EdgeIndexQueryService {
|
||||
|
||||
private OptionalInt lookUpWord(String s) {
|
||||
int ret = indexes.getLexiconReader().get(s);
|
||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
return OptionalInt.of(ret);
|
||||
|
@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import gnu.trove.list.TIntList;
|
||||
import gnu.trove.list.array.TIntArrayList;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.BetterStandardPageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
|
||||
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
|
||||
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
|
||||
import org.roaringbitmap.RoaringBitmap;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -23,137 +23,47 @@ import java.io.IOException;
|
||||
|
||||
@Singleton
|
||||
public class EdgeIndexSearchSetsService {
|
||||
private final HikariDataSource dataSource;
|
||||
private RankingDomainFetcher rankingDomains;
|
||||
private final RankingSettings rankingSettings;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final RankingDomainFetcher rankingDomains;
|
||||
private final RankingDomainFetcher similarityDomains;
|
||||
private final RankingSettings rankingSettings;
|
||||
|
||||
private final SearchSet anySet = new SearchSetAny();
|
||||
|
||||
// Below are binary indices that are used to constrain a search
|
||||
private volatile RankingSearchSet retroSet;
|
||||
private volatile RankingSearchSet smallWebSet;
|
||||
private volatile RankingSearchSet academiaSet;
|
||||
private final SearchSet anySet = new SearchSetAny();
|
||||
|
||||
// The ranking value of the domains used in sorting the domains
|
||||
private volatile DomainRankings domainRankings = new DomainRankings();
|
||||
|
||||
@Inject
|
||||
public EdgeIndexSearchSetsService(HikariDataSource dataSource,
|
||||
RankingDomainFetcher rankingDomains,
|
||||
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
|
||||
RankingDomainFetcherForSimilarityData similarityDomains,
|
||||
RankingSettings rankingSettings,
|
||||
IndexServicesFactory servicesFactory) throws IOException {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
this.rankingDomains = rankingDomains;
|
||||
|
||||
if (similarityDomains.hasData()) {
|
||||
this.similarityDomains = similarityDomains;
|
||||
}
|
||||
else {
|
||||
// on test environments the cosine similarity graph may not be present
|
||||
logger.info("Domain similarity is not present, falling back on link graph");
|
||||
this.similarityDomains = rankingDomains;
|
||||
}
|
||||
|
||||
this.rankingSettings = rankingSettings;
|
||||
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
|
||||
|
||||
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
|
||||
}
|
||||
|
||||
public void recalculateAll() {
|
||||
updateAcademiaDomains();
|
||||
updateRetroDomains();
|
||||
updateSmallWebDomains();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public RoaringBitmap goodUrls() {
|
||||
RoaringBitmap domains = new RoaringBitmap();
|
||||
RoaringBitmap urls = new RoaringBitmap();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
|
||||
stmt.setFetchSize(10_000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
domains.add(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
|
||||
// For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL
|
||||
try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) {
|
||||
stmt.setFetchSize(10_000);
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
if (domains.contains(rsp.getInt(2))) {
|
||||
urls.add(rsp.getInt(1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return urls;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateRetroDomains() {
|
||||
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2);
|
||||
|
||||
synchronized (this) {
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||
retroSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateSmallWebDomains() {
|
||||
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
|
||||
rpr.setMaxKnownUrls(750);
|
||||
var data = rpr.pageRankWithPeripheralNodes(rpr.size());
|
||||
|
||||
synchronized (this) {
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||
smallWebSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateAcademiaDomains() {
|
||||
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size()/2);
|
||||
|
||||
synchronized (this) {
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||
academiaSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getStandardDomains() {
|
||||
TIntArrayList results = new TIntArrayList();
|
||||
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement(
|
||||
"""
|
||||
SELECT ID FROM EC_DOMAIN
|
||||
WHERE INDEXED>0
|
||||
AND STATE='ACTIVE'
|
||||
AND DOMAIN_ALIAS IS NULL
|
||||
ORDER BY ID ASC
|
||||
""");
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
results.add(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TIntList getSpecialDomains() {
|
||||
TIntArrayList results = new TIntArrayList();
|
||||
try (var connection = dataSource.getConnection();
|
||||
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
|
||||
) {
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
results.add(rs.getInt(1));
|
||||
}
|
||||
}
|
||||
return results;
|
||||
public DomainRankings getDomainRankings() {
|
||||
return domainRankings;
|
||||
}
|
||||
|
||||
public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {
|
||||
@ -167,4 +77,54 @@ public class EdgeIndexSearchSetsService {
|
||||
case SMALLWEB -> smallWebSet;
|
||||
};
|
||||
}
|
||||
|
||||
public void recalculateAll() {
|
||||
updateAcademiaDomainsSet();
|
||||
updateRetroDomainsSet();
|
||||
updateSmallWebDomainsSet();
|
||||
updateDomainRankings();
|
||||
}
|
||||
|
||||
private void updateDomainRankings() {
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
|
||||
|
||||
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
|
||||
synchronized (this) {
|
||||
domainRankings = new DomainRankings(ranks);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateRetroDomainsSet() {
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
|
||||
retroSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateSmallWebDomainsSet() {
|
||||
var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new));
|
||||
rpr.setMaxKnownUrls(750);
|
||||
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
|
||||
smallWebSet.write();
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void updateAcademiaDomainsSet() {
|
||||
var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new));
|
||||
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
|
||||
|
||||
synchronized (this) {
|
||||
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
|
||||
academiaSet.write();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,21 +9,37 @@ import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardOpenOption;
|
||||
|
||||
/** A serializable bit map of domains
|
||||
*
|
||||
* @see SearchSetIdentifier
|
||||
*
|
||||
* */
|
||||
public class RankingSearchSet implements SearchSet {
|
||||
|
||||
private final RoaringBitmap set;
|
||||
public final SearchSetIdentifier identifier;
|
||||
public final Path source;
|
||||
|
||||
public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
|
||||
this.identifier = identifier;
|
||||
this.source = source;
|
||||
this.set = set;
|
||||
}
|
||||
|
||||
public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException {
|
||||
this.identifier = identifier;
|
||||
this.source = source;
|
||||
set = new RoaringBitmap();
|
||||
|
||||
if (!Files.exists(source)) {
|
||||
return;
|
||||
set = new RoaringBitmap();
|
||||
}
|
||||
else {
|
||||
set = load(source);
|
||||
}
|
||||
}
|
||||
|
||||
private static RoaringBitmap load(Path source) throws IOException {
|
||||
var set = new RoaringBitmap();
|
||||
try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
|
||||
for (;;) {
|
||||
try {
|
||||
@ -32,12 +48,7 @@ public class RankingSearchSet implements SearchSet {
|
||||
catch (IOException ex) { break; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
|
||||
this.identifier = identifier;
|
||||
this.source = source;
|
||||
this.set = set;
|
||||
return set;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -46,7 +57,11 @@ public class RankingSearchSet implements SearchSet {
|
||||
}
|
||||
|
||||
public void write() throws IOException {
|
||||
try (var ds = new DataOutputStream(Files.newOutputStream(source, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
|
||||
try (var ds = new DataOutputStream(Files.newOutputStream(source,
|
||||
StandardOpenOption.WRITE,
|
||||
StandardOpenOption.CREATE,
|
||||
StandardOpenOption.TRUNCATE_EXISTING)))
|
||||
{
|
||||
for (var iter = set.getIntIterator(); iter.hasNext();) {
|
||||
ds.writeInt(iter.next());
|
||||
}
|
||||
|
@ -1,5 +1,12 @@
|
||||
package nu.marginalia.wmsa.edge.index.svc.searchset;
|
||||
|
||||
import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile;
|
||||
|
||||
/** Identifies a RankingSearchSet, associated with an EdgeSearchProfile
|
||||
*
|
||||
* @see RankingSearchSet
|
||||
* @see EdgeSearchProfile
|
||||
* */
|
||||
public enum SearchSetIdentifier {
|
||||
NONE,
|
||||
RETRO,
|
||||
|
@ -13,8 +13,8 @@ public class SmallSearchSet implements SearchSet {
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean contains(int urlId) {
|
||||
return entries.contains(urlId);
|
||||
public boolean contains(int domainId) {
|
||||
return entries.contains(domainId);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.integration.wikipedia;
|
||||
|
||||
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.KeywordMetadata;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
|
||||
|
@ -11,8 +11,6 @@ import java.util.regex.Pattern;
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeDomain {
|
||||
|
||||
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
|
||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||
|
||||
@Nonnull
|
||||
public final String subDomain;
|
||||
@ -27,7 +25,7 @@ public class EdgeDomain {
|
||||
|
||||
var dot = host.lastIndexOf('.');
|
||||
|
||||
if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
domain = host;
|
||||
}
|
||||
@ -38,7 +36,7 @@ public class EdgeDomain {
|
||||
domain = host;
|
||||
}
|
||||
else {
|
||||
if (govListTest.test(host))
|
||||
if (looksLikeGovTld(host))
|
||||
{ // Capture .ac.jp, .co.uk
|
||||
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
||||
if (dot3 >= 0) {
|
||||
@ -59,6 +57,35 @@ public class EdgeDomain {
|
||||
}
|
||||
}
|
||||
|
||||
    private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
    private boolean looksLikeGovTld(String host) {
        if (host.length() < 8)
            return false;
        int cnt = 0;
        for (int i = host.length() - 7; i < host.length(); i++) {
            if (host.charAt(i) == '.')
                cnt++;
        }
        return cnt >= 2 && govListTest.test(host);
    }


    private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();

    private boolean looksLikeAnIp(String host) {
        if (host.length() < 7)
            return false;

        char firstChar = host.charAt(0);
        int lastChar = host.charAt(host.length() - 1);

        return Character.isDigit(firstChar)
            && Character.isDigit(lastChar)
            && ipPatternTest.test(host);
    }
|
||||
|
||||
|
||||
|
||||
public EdgeUrl toRootUrl() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
|
@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set,
            sum += 20;
        }

        int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
        if (rank < 0)
            sum += rank / 2;
        else
            sum += rank / 4;

        return sum;
    }
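In other words, the decoded rank is centered on 13 and folded into the sum asymmetrically: a document with decoded rank 5 contributes (5 - 13) / 2 = -4, while one with decoded rank 45 contributes (45 - 13) / 4 = 8, so adjustments below the baseline are applied at twice the rate of adjustments above it.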
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.model.search;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
|
||||
@ -9,23 +10,18 @@ import java.util.List;
|
||||
|
||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||
public class EdgeSearchSpecification {
|
||||
|
||||
public List<EdgeSearchSubquery> subqueries;
|
||||
public List<Integer> domains;
|
||||
public SearchSetIdentifier searchSetIdentifier;
|
||||
|
||||
public final int limitByDomain;
|
||||
public final int limitTotal;
|
||||
|
||||
public final String humanQuery;
|
||||
|
||||
public final int timeoutMs;
|
||||
public final int fetchSize;
|
||||
|
||||
public final SpecificationLimit quality;
|
||||
public final SpecificationLimit year;
|
||||
public final SpecificationLimit size;
|
||||
public final SpecificationLimit rank;
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
public final QueryStrategy queryStrategy;
|
||||
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
|
||||
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
|
||||
@ -84,6 +85,8 @@ public class QueryFactory {
|
||||
List<String> problems = new ArrayList<>();
|
||||
String domain = null;
|
||||
|
||||
QueryStrategy queryStrategy = QueryStrategy.AUTO;
|
||||
|
||||
var basicQuery = queryParser.parse(query);
|
||||
|
||||
if (basicQuery.size() >= 8) {
|
||||
@ -94,6 +97,7 @@ public class QueryFactory {
|
||||
SpecificationLimit qualityLimit = profile.getQualityLimit();
|
||||
SpecificationLimit year = profile.getYearLimit();
|
||||
SpecificationLimit size = profile.getSizeLimit();
|
||||
SpecificationLimit rank = SpecificationLimit.none();
|
||||
|
||||
for (Token t : basicQuery) {
|
||||
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
|
||||
@ -113,6 +117,12 @@ public class QueryFactory {
|
||||
if (t.type == TokenType.SIZE_TERM) {
|
||||
size = parseSpecificationLimit(t.str);
|
||||
}
|
||||
if (t.type == TokenType.RANK_TERM) {
|
||||
rank = parseSpecificationLimit(t.str);
|
||||
}
|
||||
if (t.type == TokenType.QS_TERM) {
|
||||
queryStrategy = parseQueryStrategy(t.str);
|
||||
}
|
||||
}
|
||||
|
||||
var queryPermutations = queryParser.permuteQueriesNew(basicQuery);
|
||||
@ -148,6 +158,8 @@ public class QueryFactory {
|
||||
case QUALITY_TERM:
|
||||
case YEAR_TERM:
|
||||
case SIZE_TERM:
|
||||
case RANK_TERM:
|
||||
case QS_TERM:
|
||||
break; //
|
||||
case NEAR_TERM:
|
||||
near = t.str;
|
||||
@ -179,25 +191,25 @@ public class QueryFactory {
|
||||
}
|
||||
}
|
||||
|
||||
int domainLimit;
|
||||
if (domain != null) {
|
||||
domainLimit = 100;
|
||||
} else {
|
||||
domainLimit = 2;
|
||||
}
|
||||
|
||||
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
|
||||
.subqueries(subqueries)
|
||||
.limitTotal(100)
|
||||
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
|
||||
.humanQuery(query)
|
||||
.timeoutMs(250)
|
||||
.fetchSize(4096)
|
||||
.quality(qualityLimit)
|
||||
.year(year)
|
||||
.size(size)
|
||||
.rank(rank)
|
||||
.domains(domains)
|
||||
.queryStrategy(QueryStrategy.AUTO)
|
||||
.queryStrategy(queryStrategy)
|
||||
.searchSetIdentifier(profile.searchSetIdentifier);
|
||||
|
||||
if (domain != null) {
|
||||
specsBuilder = specsBuilder.limitByDomain(100);
|
||||
} else {
|
||||
specsBuilder = specsBuilder.limitByDomain(2);
|
||||
}
|
||||
|
||||
EdgeSearchSpecification specs = specsBuilder.build();
|
||||
|
||||
return new EdgeSearchQuery(specs, searchTermsHuman, domain);
|
||||
@ -210,10 +222,10 @@ public class QueryFactory {
|
||||
if (startChar == '=') {
|
||||
return SpecificationLimit.equals(val);
|
||||
}
|
||||
else if (startChar == '<'){
|
||||
else if (startChar == '<') {
|
||||
return SpecificationLimit.lessThan(val);
|
||||
}
|
||||
else if (startChar == '>'){
|
||||
else if (startChar == '>') {
|
||||
return SpecificationLimit.greaterThan(val);
|
||||
}
|
||||
else {
|
||||
@ -221,6 +233,17 @@ public class QueryFactory {
|
||||
}
|
||||
}
|
||||
|
||||
private QueryStrategy parseQueryStrategy(String str) {
|
||||
return switch (str.toUpperCase()) {
|
||||
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
|
||||
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
|
||||
case "SENTENCE" -> QueryStrategy.SENTENCE;
|
||||
case "TOPIC" -> QueryStrategy.TOPIC;
|
||||
default -> QueryStrategy.AUTO;
|
||||
};
|
||||
}
|
||||
|
||||
private String normalizeDomainName(String str) {
|
||||
return str.toLowerCase();
|
||||
}
|
||||
|
@ -93,6 +93,10 @@ public class QueryParser {
            entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
        } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
            entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
        } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
            entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
        } else if (t.str.startsWith("qs=")) {
            entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
        } else if (t.str.contains(":")) {
            entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
        }
@ -506,8 +510,11 @@ enum TokenType implements Predicate<Token> {
    QUALITY_TERM,
    YEAR_TERM,
    SIZE_TERM,
    RANK_TERM,
    NEAR_TERM,

    QS_TERM,

    QUOT,
    MINUS,
    QMARK,
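Taken together with the parseSpecificationLimit and parseQueryStrategy additions in QueryFactory, this gives two new search-box operators. Illustrative queries; the surrounding words are made up, while the operator syntax follows the patterns matched above:

    vintage computing rank<50         restrict to domains whose rank value is below 50
    qs=RF_TITLE linear algebra        require the field given by the query strategy
    site design size>100 rank>25      combine the existing size operator with the new rank operator

The qs= values accepted are RF_TITLE, RF_SUBJECT, RF_SITE, SENTENCE and TOPIC (matched case-insensitively); anything else falls back to AUTO.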
@ -8,7 +8,7 @@ import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.util.language.conf.LanguageModels;
|
||||
import nu.marginalia.util.language.processing.KeywordExtractor;
|
||||
import nu.marginalia.util.language.processing.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
|
||||
import nu.marginalia.util.language.processing.model.DocumentSentence;
|
||||
import nu.marginalia.util.language.processing.model.WordSpan;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
|
||||
@ -25,12 +25,12 @@ public class QueryVariants {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final KeywordExtractor keywordExtractor;
|
||||
private final SentenceExtractor sentenceExtractor;
|
||||
private final TermFrequencyDict dict;
|
||||
private final PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
private final NGramBloomFilter nGramBloomFilter;
|
||||
private final EnglishDictionary englishDictionary;
|
||||
private final ThreadLocal<SentenceExtractor> sentenceExtractor;
|
||||
|
||||
@Inject
|
||||
public QueryVariants(LanguageModels lm,
|
||||
@ -40,7 +40,7 @@ public class QueryVariants {
|
||||
this.nGramBloomFilter = nGramBloomFilter;
|
||||
this.englishDictionary = englishDictionary;
|
||||
this.keywordExtractor = new KeywordExtractor();
|
||||
this.sentenceExtractor = new SentenceExtractor(lm);
|
||||
this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
|
||||
this.dict = dict;
|
||||
}
|
||||
|
||||
@ -78,10 +78,8 @@ public class QueryVariants {
|
||||
|
||||
final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();
|
||||
|
||||
logger.debug("Q: {}", query);
|
||||
logger.debug("QAS: {}", joinedQuery);
|
||||
|
||||
var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery);
|
||||
var se = sentenceExtractor.get();
|
||||
var sentence = se.extractSentence(joinedQuery.joinedQuery);
|
||||
|
||||
for (int i = 0; i < sentence.posTags.length; i++) {
|
||||
if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {
|
||||
|
Some files were not shown because too many files have changed in this diff