A tiny release between crawls (#138)

Brings new ranking changes online

Co-authored-by: Viktor Lofgren <vlofgren@marginalia.nu>
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/138
Viktor Lofgren 2023-02-12 10:57:07 +01:00
parent 467bf566a9
commit fa9b4e4352
145 changed files with 3896 additions and 1952 deletions

View File

@ -106,8 +106,9 @@ dependencies {
implementation group: 'org.yaml', name: 'snakeyaml', version: '1.30'
implementation 'com.syncthemall:boilerpipe:1.2.2'
implementation 'com.github.luben:zstd-jni:1.5.2-2'
implementation 'org.lz4:lz4-java:1.8.0'
implementation 'com.github.vladimir-bukhtoyarov:bucket4j-core:7.5.0'
implementation 'de.rototor.jeuclid:jeuclid-core:3.1.14'
@ -126,7 +127,6 @@ dependencies {
implementation 'org.roaringbitmap:RoaringBitmap:0.9.32'
implementation group: 'mysql', name: 'mysql-connector-java', version: '8.0.29'
implementation 'com.github.Marcono1234:gson-record-type-adapter-factory:0.2.0'
testImplementation 'org.junit.jupiter:junit-jupiter-api:5.8.2'

View File

@ -2,11 +2,13 @@ package nu.marginalia.util;
public class BrailleBlockPunchCards {
private static final char brailleBlockBase = '\u2800';
public static String printBits(int val, int bits) {
StringBuilder builder = new StringBuilder();
for (int b = 0; b < bits; b+=8, val>>>=8) {
builder.append((char)('\u2800'+bin2brail(val)));
builder.append((char)(brailleBlockBase + bin2brail(val)));
}
return builder.toString();
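
Each 8-bit group of the input maps to one cell in the Unicode braille block (U+2800–U+28FF), so the output has bits / 8 characters; a small illustrative call, assuming bin2brail permutes a byte into braille dot order:

// Illustrative only: a 32-bit value rendered as four braille cells.
String cells = BrailleBlockPunchCards.printBits(0xDEADBEEF, 32);
assert cells.length() == 4;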

View File

@ -42,7 +42,7 @@ public abstract class ParallelPipe<INPUT,INTERMEDIATE> {
@SneakyThrows
private void runProcessThread() {
while (expectingInput || !inputs.isEmpty()) {
var in = inputs.poll(1, TimeUnit.SECONDS);
var in = inputs.poll(10, TimeUnit.SECONDS);
if (in != null) {
try {

View File

@ -108,7 +108,6 @@ public class RandomWriteFunnel implements AutoCloseable {
private void eval(ByteBuffer dest) throws IOException {
flushBuffer();
channel.force(false);
channel.position(0);
buffer.clear();

View File

@ -1,20 +1,33 @@
package nu.marginalia.util;
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
import java.util.Arrays;
import java.util.HashMap;
public class StringPool {
private final HashMap<String, String> words;
public StringPool() {
this.words = new HashMap<>(1000);
private final HashMap<String, String> words;
private final Object2LongOpenHashMap<String> ages;
private final int maxCap;
long idx;
private StringPool(int capacity, int maxCap) {
this.ages = new Object2LongOpenHashMap<>(capacity);
this.words = new HashMap<>(capacity);
this.maxCap = maxCap;
}
public StringPool(int capacity) {
words = new HashMap<>(capacity);
public static StringPool create(int capacity) {
return new StringPool(capacity, capacity * 10);
}
public String internalize(String str) {
prune();
final String ret = words.putIfAbsent(str, str);
ages.put(ret, idx++);
if (null == ret)
return str;
@ -22,6 +35,37 @@ public class StringPool {
return ret;
}
public String[] internalize(String[] str) {
for (int i = 0; i < str.length; i++) {
str[i] = internalize(str[i]);
}
return str;
}
public void prune() {
if (words.size() < maxCap)
return;
long[] ageValues = ages.values().toLongArray();
Arrays.sort(ageValues);
long cutoff = ageValues[ageValues.length - maxCap / 10];
words.clear();
ages.forEach((word, cnt) -> {
if (cnt >= cutoff) {
words.put(word, word);
}
});
ages.clear();
words.forEach((w,w2) -> {
ages.put(w, idx);
});
}
public void flush() {
words.clear();
}
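
A minimal usage sketch of the reworked pool (the capacity figure is illustrative): internalize() now also calls prune(), which, once roughly ten times the initial capacity has accumulated, keeps only the most recently used tenth of the entries.

// Repeated tokens resolve to the same String instance, so downstream maps
// and sets share references instead of holding duplicate copies.
StringPool pool = StringPool.create(10_000);
String a = pool.internalize("marginalia");
String b = pool.internalize(new String("marginalia"));
assert a == b;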

View File

@ -18,22 +18,14 @@ public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
}
}
default void increment(long pos) {
set(pos, get(pos) + 1);
}
default void swap(long pos1, long pos2) {
int tmp = get(pos1);
set(pos1, get(pos2));
set(pos2, tmp);
}
default void swapn(int n, long pos1, long pos2) {
for (int i = 0; i < n; i++) {
int tmp = get(pos1+i);
set(pos1+i, get(pos2+i));
set(pos2+i, tmp);
}
default void increment(long pos) {
set(pos, get(pos) + 1);
}
default int getAndIncrement(long pos) {
@ -47,6 +39,7 @@ public interface IntArrayBase extends BulkTransferArray<IntBuffer> {
set(start+i, buffer.get(i + bufferStart));
}
}
default void get(long start, long end, IntBuffer buffer, int bufferStart) {
for (int i = 0; i < (end-start); i++) {
buffer.put(i + bufferStart, get(start + i));

View File

@ -28,6 +28,7 @@ public interface LongArrayBase extends BulkTransferArray<LongBuffer> {
set(pos2, tmp);
}
/** Behavior not defined for overlapping ranges */
default void swapn(int n, long pos1, long pos2) {
for (int i = 0; i < n; i++) {
long tmp = get(pos1+i);

View File

@ -2,6 +2,7 @@ package nu.marginalia.util.array.delegate;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.IntArray;
import nu.marginalia.util.array.algo.SortingContext;
import nu.marginalia.util.array.buffer.IntQueryBuffer;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
import nu.marginalia.util.array.functional.IntIOTransformer;
@ -61,6 +62,16 @@ public class ShiftedIntArray implements IntArray {
delegate.get(shift+start, shift+end, buffer);
}
@Override
public int getAndIncrement(long pos) {
return delegate.getAndIncrement(shift + pos);
}
@Override
public void fill(long start, long end, int val) {
delegate.fill(start + shift, end + shift, val);
}
@Override
public long size() {
return size;
@ -97,6 +108,12 @@ public class ShiftedIntArray implements IntArray {
return delegate.isSorted(shift + start, shift + end);
}
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
delegate.sortLargeSpan(ctx, start, end);
}
public long search(int key) {
if (size < 128) {
return linearSearch(key);

View File

@ -3,6 +3,7 @@ package nu.marginalia.util.array.delegate;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.algo.LongArraySearch;
import nu.marginalia.util.array.algo.SortingContext;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.functional.LongIOTransformer;
@ -62,6 +63,16 @@ public class ShiftedLongArray implements LongArray {
delegate.get(shift+start, shift+end, buffer);
}
@Override
public long getAndIncrement(long pos) {
return delegate.getAndIncrement(shift + pos);
}
@Override
public void fill(long start, long end, long val) {
delegate.fill(start + shift, end + shift, val);
}
@Override
public long size() {
return size;
@ -106,6 +117,14 @@ public class ShiftedLongArray implements LongArray {
return delegate.isSortedN(sz, shift + start, shift + end);
}
public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
delegate.sortLargeSpanN(ctx, sz, start, end);
}
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
delegate.sortLargeSpan(ctx, start, end);
}
public long searchN(int sz, long key) {
if (size < 128) {
return linearSearchN(sz, key);

View File

@ -2,6 +2,7 @@ package nu.marginalia.util.array.page;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.IntArray;
import nu.marginalia.util.array.algo.SortingContext;
import nu.marginalia.util.array.buffer.IntQueryBuffer;
import nu.marginalia.util.array.delegate.ReferenceImplIntArrayDelegate;
import nu.marginalia.util.array.functional.IntBinaryIOOperation;
@ -113,6 +114,11 @@ public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer>
}
}
@Override
public int getAndIncrement(long pos) {
return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos));
}
@Override
public void get(long start, long end, int[] buffer) {
if (partitioningScheme.isSamePage(start, end)) {
@ -272,6 +278,22 @@ public class PagingIntArray extends AbstractPagingArray<IntArrayPage, IntBuffer>
}
}
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff);
}
}
else {
defaults.sortLargeSpan(ctx, start, end);
}
}
public void write(Path fileName) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
for (int i = 0; i < pages.length; i++) {

View File

@ -2,6 +2,7 @@ package nu.marginalia.util.array.page;
import com.upserve.uppend.blobs.NativeIO;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.array.algo.SortingContext;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.array.delegate.ReferenceImplLongArrayDelegate;
import nu.marginalia.util.array.functional.LongBinaryIOOperation;
@ -118,6 +119,11 @@ public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuff
}
}
@Override
public long getAndIncrement(long pos) {
return pages[partitioningScheme.getPage(pos)].getAndIncrement(partitioningScheme.getOffset(pos));
}
@Override
public void set(long pos, long value) {
int page = partitioningScheme.getPage(pos);
@ -439,6 +445,33 @@ public class PagingLongArray extends AbstractPagingArray<LongArrayPage, LongBuff
defaults.mergeSortN(sz, start, end, tempDir);
}
}
public void sortLargeSpanN(SortingContext ctx, int sz, long start, long end) throws IOException {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].sortLargeSpanN(ctx, sz, sOff, eOff);
}
}
else {
defaults.sortLargeSpanN(ctx, sz, start, end);
}
}
public void sortLargeSpan(SortingContext ctx, long start, long end) throws IOException {
if (partitioningScheme.isSamePage(start, end)) {
int sOff = partitioningScheme.getOffset(start);
int eOff = partitioningScheme.getEndOffset(start, end);
if (eOff > sOff) {
pages[partitioningScheme.getPage(start)].sortLargeSpan(ctx, sOff, eOff);
}
}
else {
defaults.sortLargeSpan(ctx, start, end);
}
}
public void write(Path fileName) throws IOException {
try (var channel = (FileChannel) Files.newByteChannel(fileName, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {

View File

@ -0,0 +1,17 @@
package nu.marginalia.util.bigstring;
public interface BigString {
static BigString encode(String stringValue) {
if (stringValue.length() > 64) {
return new CompressedBigString(stringValue);
}
else {
return new PlainBigString(stringValue);
}
}
String decode();
byte[] getBytes();
int length();
}
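
A hedged usage sketch: strings longer than 64 characters are stored LZ4-compressed (CompressedBigString below), shorter ones stay plain; documentBody here is an assumed local.

String documentBody = "<html>...</html>";         // assumed input
BigString stored = BigString.encode(documentBody);
String roundTripped = stored.decode();            // equal to documentBody
int originalLength = stored.length();             // length of the original string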

View File

@ -0,0 +1,39 @@
package nu.marginalia.util.bigstring;
import net.jpountz.lz4.LZ4Compressor;
import net.jpountz.lz4.LZ4Factory;
import net.jpountz.lz4.LZ4FastDecompressor;
import java.nio.charset.StandardCharsets;
public class CompressedBigString implements BigString {
private final int originalSize;
private final int length;
private final byte[] encoded;
private static final LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
private static final LZ4Compressor compressor = lz4Factory.fastCompressor();
private static final LZ4FastDecompressor decompressor = lz4Factory.fastDecompressor();
public CompressedBigString(String stringValue) {
byte[] byteValue = stringValue.getBytes(StandardCharsets.UTF_16);
originalSize = byteValue.length;
encoded = compressor.compress(byteValue);
length = stringValue.length();
}
@Override
public String decode() {
return new String(getBytes(), StandardCharsets.UTF_16);
}
@Override
public byte[] getBytes() {
return decompressor.decompress(encoded, originalSize);
}
@Override
public int length() {
return length;
}
}

View File

@ -0,0 +1,26 @@
package nu.marginalia.util.bigstring;
import java.nio.charset.StandardCharsets;
public class PlainBigString implements BigString {
private final String value;
public PlainBigString(String value) {
this.value = value;
}
@Override
public String decode() {
return value;
}
@Override
public byte[] getBytes() {
return value.getBytes(StandardCharsets.UTF_8);
}
@Override
public int length() {
return value.length();
}
}

View File

@ -1,8 +1,5 @@
package nu.marginalia.util.dict;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
@ -10,7 +7,6 @@ import java.util.ArrayList;
public class DictionaryData {
private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);

View File

@ -1,6 +1,17 @@
package nu.marginalia.util.dict;
public interface DictionaryMap {
int NO_VALUE = Integer.MIN_VALUE;
static DictionaryMap create() {
if (Boolean.getBoolean("small-ram")) {
return new OnHeapDictionaryMap();
}
else {
return new OffHeapDictionaryHashMap(1L << 31);
}
}
int size();
int put(long key);
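
Boolean.getBoolean reads a JVM system property, so the smaller on-heap map is selected by launching with -Dsmall-ram=true; otherwise the large off-heap table is allocated. A sketch of the intended use, assuming put returns the assigned id as the on-heap implementation below does (the key values are illustrative):

DictionaryMap map = DictionaryMap.create();          // off-heap unless -Dsmall-ram=true
int id = map.put(0x5eedL);                           // register a term hash
assert id == map.get(0x5eedL);                       // lookups return the same id
assert map.get(12345L) == DictionaryMap.NO_VALUE;    // unknown keys report NO_VALUE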

View File

@ -16,15 +16,14 @@ import static nu.marginalia.util.FileSizeUtil.readableSize;
* Spiritually influenced by GNU Trove's hash maps
* LGPL 2.1
*/
public class DictionaryHashMap implements DictionaryMap {
private static final Logger logger = LoggerFactory.getLogger(DictionaryHashMap.class);
public class OffHeapDictionaryHashMap implements DictionaryMap {
private static final Logger logger = LoggerFactory.getLogger(OffHeapDictionaryHashMap.class);
private static final Gauge probe_count_metrics
= Gauge.build("wmsa_dictionary_hash_map_probe_count", "Probing Count")
.register();
private final int bufferCount;
private final IntBuffer[] buffers;
public static final int NO_VALUE = Integer.MIN_VALUE;
private final DictionaryData dictionaryData;
@ -35,7 +34,7 @@ public class DictionaryHashMap implements DictionaryMap {
private final AtomicInteger sz = new AtomicInteger(0);
public DictionaryHashMap(long sizeMemory) {
public OffHeapDictionaryHashMap(long sizeMemory) {
final int intSize = 4;
bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));

View File

@ -0,0 +1,23 @@
package nu.marginalia.util.dict;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
public class OnHeapDictionaryMap implements DictionaryMap {
private final Long2IntOpenHashMap entries = new Long2IntOpenHashMap(100_000, 0.75f);
@Override
public int size() {
return entries.size();
}
@Override
public int put(long key) {
entries.putIfAbsent(key, entries.size());
return get(key);
}
@Override
public int get(long key) {
return entries.getOrDefault(key, NO_VALUE);
}
}
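
Ids are handed out in insertion order via entries.size(), and re-putting an existing key keeps its original id; a small illustrative check:

OnHeapDictionaryMap dict = new OnHeapDictionaryMap();
assert dict.put(101L) == 0;
assert dict.put(202L) == 1;
assert dict.put(101L) == 0;   // putIfAbsent keeps the first assignment
assert dict.size() == 2;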

View File

@ -19,9 +19,6 @@ public class GuardedRegexFactory {
public static GuardedRegex contains(String substring, @Language("RegExp") String regex) {
return new GuardedRegexContains(substring, regex);
}
public static GuardedRegex minLength(int minLength, @Language("RegExp") String regex) {
return new GuardedRegexMinLength(minLength, regex);
}
private record GuardedRegexContains(String contains, Pattern pattern) implements GuardedRegex {
public GuardedRegexContains(String contains, String pattern) {
@ -32,15 +29,6 @@ public class GuardedRegexFactory {
return s.contains(contains) && pattern.matcher(s).find();
}
}
private record GuardedRegexMinLength(int minLength, Pattern pattern) implements GuardedRegex {
public GuardedRegexMinLength(int minLength, String pattern) {
this(minLength, Pattern.compile(pattern));
}
public boolean test(String s) {
return s.length() >= minLength && pattern.matcher(s).find();
}
}
private record GuardedRegexStartsWith(String start, Pattern pattern) implements GuardedRegex {
public GuardedRegexStartsWith(String start, String pattern) {
this(start, Pattern.compile(pattern));

View File

@ -4,7 +4,7 @@ import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordCounter;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.NameCounter;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
@ -68,9 +68,6 @@ public class DocumentDebugger {
Set<String> reps = new HashSet<>();
// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
for (var sent : languageData.titleSentences) {

View File

@ -1,5 +1,7 @@
package nu.marginalia.util.language;
import org.apache.commons.lang3.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
@ -123,14 +125,24 @@ public class WordPatterns {
if (!filter(s)) {
return true;
}
if (isTopWord(s)) {
String sLc;
if (StringUtils.isAllLowerCase(s)) {
sLc = s;
}
else {
sLc = s.toLowerCase();
}
if (isTopWord(sLc)) {
return true;
}
return false;
}
public static boolean isTopWord(String s) {
return topWords.contains(s.toLowerCase());
public static boolean isTopWord(String strLowerCase) {
return topWords.contains(strLowerCase);
}
}
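
The renamed parameter makes the contract explicit: callers lowercase once, skipping the allocation when the word is already all lower case, and pass the lowercased form on. A minimal sketch mirroring the logic above, with word as an assumed local:

String word = "However";
String wordLc = StringUtils.isAllLowerCase(word) ? word : word.toLowerCase();
if (WordPatterns.isTopWord(wordLc)) {
    // treat as a very common word and skip it
}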

View File

@ -35,9 +35,7 @@ public class DocumentKeywordExtractor {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
@ -59,11 +57,12 @@ public class DocumentKeywordExtractor {
getWordPositions(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.countHisto(keywordMetadata, documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(keywordMetadata, documentLanguageData);
for (var rep : titleWords) keywordMetadata.titleKeywords().add(rep.stemmed);
for (var rep : wordsNamesAll) keywordMetadata.namesKeywords().add(rep.stemmed);
for (var rep : subjects) keywordMetadata.subjectKeywords().add(rep.stemmed);
@ -94,7 +93,7 @@ public class DocumentKeywordExtractor {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getNames(sent)) {
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
}
@ -108,7 +107,7 @@ public class DocumentKeywordExtractor {
ret.merge(word.stemmed(), posBit, this::bitwiseOr);
}
for (var span : keywordExtractor.getNames(sent)) {
for (var span : keywordExtractor.getProperNames(sent)) {
ret.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr);
}
@ -155,16 +154,16 @@ public class DocumentKeywordExtractor {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
wordsBuilder.add(w, metadata.forWord(flagsTemplate, word.stemmed()));
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, word.stemmed()));
}
}
}
for (var names : keywordExtractor.getNames(sent)) {
for (var names : keywordExtractor.getProperNames(sent)) {
var rep = new WordRep(sent, names);
String w = AsciiFlattener.flattenUnicode(rep.word);
wordsBuilder.add(w, metadata.forWord(flagsTemplate, rep.stemmed));
wordsBuilder.add(w, metadata.getMetadataForWord(flagsTemplate, rep.stemmed));
}
}
@ -218,7 +217,7 @@ public class DocumentKeywordExtractor {
continue;
}
wordsBuilder.add(flatWord, metadata.forWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
wordsBuilder.add(flatWord, metadata.getMetadataForWord(metadata.wordFlagsTemplate(), word.stemmed) | additionalMeta);
}
}

View File

@ -43,8 +43,8 @@ public class KeywordCounter {
counts.mergeInt(rep.stemmed, 1, Integer::sum);
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(500));
if (instanceSet.size() < 250) {
var instanceSet = instances.computeIfAbsent(rep.stemmed, k -> new HashSet<>(16));
if (instanceSet.size() < 4) {
instanceSet.add(rep);
}
}

View File

@ -7,14 +7,12 @@ import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
public class KeywordExtractor {
public WordSpan[] getNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
public WordSpan[] getProperNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isProperNoun(i, sentence))
@ -57,27 +55,73 @@ public class KeywordExtractor {
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
}
List<WordSpan> spans = new ArrayList<>(sentence.length());
Set<String> topWords = Collections.emptySet();
public WordSpan[] getNouns(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isName(i, sentence, topWords) || isTopAdj(i, sentence, topWords))
if (isNoun(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords)) {
if (isName(i - 1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
if (isNoun(i, sentence)
&& (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) {
spans.add(new WordSpan(i - 1, i + 1));
}
}
for (int i = 2; i < sentence.length(); i++) {
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if ((isNoun(i, sentence))
&& (isJoiner(sentence, i-1) || isNoun(i-1, sentence))
&& (isNoun(i-2, sentence)) || "JJ".equals(sentence.posTags[i-2]))
spans.add(new WordSpan(i-2, i+1));
}
for (int i = 3; i < sentence.length(); i++) {
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; }
if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) {
if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence))
spans.add(new WordSpan(i-3, i+1));
else if (isJoiner(sentence, i-2) && sentence.posTags[i-1].equals("DT"))
spans.add(new WordSpan(i-3, i+1));
else if ((isJoiner(sentence, i-1) ||isNoun(i-1, sentence))
&& (isJoiner(sentence, i-2)||isNoun(i-2, sentence)))
spans.add(new WordSpan(i-3, i+1));
}
}
return spans.toArray(WordSpan[]::new);
}
public WordSpan[] getKeywordsFromSentence(DocumentSentence sentence) {
if (sentence.keywords != null) {
return sentence.keywords.get();
}
List<WordSpan> spans = new ArrayList<>(2 * sentence.length());
for (int i = 0; i < sentence.length(); i++) {
if (isName(i, sentence) || isTopAdj(i, sentence))
spans.add(new WordSpan(i, i+1));
}
for (int i = 1; i < sentence.length(); i++) {
if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence)) {
if (isName(i - 1, sentence) || isTopAdj(i-1, sentence))
spans.add(new WordSpan(i - 1, i + 1));
}
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords)) {
if (sentence.posTags[i].equals("CD") && isName(i-1, sentence)) {
spans.add(new WordSpan(i - 1, i + 1));
}
}
@ -86,16 +130,16 @@ public class KeywordExtractor {
if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords)) {
if ((isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords))
&& (isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords))) {
if (isName(i, sentence)) {
if ((isName(i-1, sentence) || isTopAdj(i-1, sentence))
&& (isName(i-2, sentence) || isTopAdj(i-2, sentence))) {
spans.add(new WordSpan(i - 2, i + 1));
}
else if ((isProperNoun(i-1, sentence) || isJoiner(sentence, i-1)) && isProperNoun(i-2, sentence)) {
spans.add(new WordSpan(i - 2, i + 1));
}
}
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence, topWords) && isName(i-2, sentence, topWords)) {
else if (sentence.posTags[i].equals("CD") && isName(i-1, sentence) && isName(i-2, sentence)) {
spans.add(new WordSpan(i - 2, i + 1));
}
}
@ -105,10 +149,10 @@ public class KeywordExtractor {
if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; }
if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; }
if (isName(i, sentence, topWords) &&
(isName(i-1, sentence, topWords) || isTopAdj(i-1, sentence, topWords)) &&
(isName(i-2, sentence, topWords) || isTopAdj(i-2, sentence, topWords)) &&
(isName(i-3, sentence, topWords) || isTopAdj(i-3, sentence, topWords))) {
if (isName(i, sentence) &&
(isName(i-1, sentence) || isTopAdj(i-1, sentence)) &&
(isName(i-2, sentence) || isTopAdj(i-2, sentence)) &&
(isName(i-3, sentence) || isTopAdj(i-3, sentence))) {
spans.add(new WordSpan(i - 3, i + 1));
}
else if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) {
@ -134,7 +178,9 @@ public class KeywordExtractor {
public boolean isProperNoun(int i, DocumentSentence sent) {
return "NNP".equals(sent.posTags[i]) || "NNPS".equals(sent.posTags[i]);
}
public boolean isNoun(int i, DocumentSentence sent) {
return sent.posTags[i].startsWith("NN");
}
public boolean isJoiner(DocumentSentence sent, int i) {
if(sent.posTags[i].equals("IN")) {
return true;
@ -183,21 +229,13 @@ public class KeywordExtractor {
return true;
}
private boolean isName(int i, DocumentSentence sentence, Set<String> topWords) {
if (!topWords.isEmpty()) {
String posTag = sentence.posTags[i];
String word = sentence.stemmedWords[i];
return ((topWords.contains(word)) && (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i));
}
private boolean isName(int i, DocumentSentence sentence) {
String posTag = sentence.posTags[i];
return (posTag.startsWith("N") || "VBN".equals(posTag)) && !sentence.isStopWord(i);
return (posTag.startsWith("N") || "VBG".equals(posTag)|| "VBN".equals(posTag)) && !sentence.isStopWord(i);
}
private boolean isTopAdj(int i, DocumentSentence sentence, Set<String> topWords) {
private boolean isTopAdj(int i, DocumentSentence sentence) {
String posTag = sentence.posTags[i];
return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));

View File

@ -20,7 +20,7 @@ public class NameCounter {
for (int i = 0; i < dld.sentences.length; i++) {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNames(sent);
var keywords = keywordExtractor.getProperNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;

View File

@ -1,9 +1,11 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ -23,13 +25,13 @@ public class SubjectCounter {
// Greeks bearing gifts -> Greeks
// Steve McQueen drove fast | cars -> Steve McQueen
public List<WordRep> count(DocumentLanguageData dld) {
public List<WordRep> count(KeywordMetadata keywordMetadata, DocumentLanguageData dld) {
Map<String, Integer> counts = new HashMap<>();
Map<String, Set<WordRep>> instances = new HashMap<>();
for (var sentence : dld.sentences) {
for (WordSpan kw : keywordExtractor.getNames(sentence)) {
for (WordSpan kw : keywordExtractor.getNouns(sentence)) {
if (kw.end + 2 >= sentence.length()) {
continue;
}
@ -46,20 +48,46 @@ public class SubjectCounter {
String stemmed = rep.stemmed;
counts.merge(stemmed, -1, Integer::sum);
instances.computeIfAbsent(stemmed, s -> new HashSet<>()).add(rep);
}
}
}
int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
Map<String, Integer> scores = new HashMap<>(instances.size());
for (String stemmed : instances.keySet()) {
scores.put(stemmed, getTermTfIdf(keywordMetadata, stemmed));
}
return counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
.filter(e -> e.getValue()<-2 && e.getValue()<=best*0.75)
return scores.entrySet().stream()
.filter(e -> e.getValue() >= 150)
.flatMap(e -> instances.getOrDefault(e.getKey(), Collections.emptySet()).stream())
.collect(Collectors.toList());
}
private int getTermTfIdf(KeywordMetadata keywordMetadata, String stemmed) {
if (stemmed.contains("_")) {
int sum = 0;
String[] parts = StringUtils.split(stemmed, '_');
if (parts.length == 0) {
return 0;
}
for (String part : parts) {
sum += getTermTfIdf(keywordMetadata, part);
}
return sum / parts.length;
}
var meta = keywordMetadata.wordsTfIdf().get(stemmed);
if (meta != null) {
return meta.tfIdfNormalized();
}
return 0;
}
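
For illustration (the numbers are invented): if wordsTfIdf holds a normalized tf-idf of 120 for "steve" and 200 for "mcqueen", then getTermTfIdf for the compound "steve_mcqueen" averages the parts to 160, which clears the >= 150 cutoff applied in count() above.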
private boolean isDetOrAdverbOrVerb(String posTag) {
return "DT".equals(posTag) // determinant
|| "RB".equals(posTag) // adverb

View File

@ -2,12 +2,13 @@ package nu.marginalia.util.language.processing.model;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import java.util.Arrays;
import java.util.stream.Stream;
/**
* @see nu.marginalia.util.language.processing.SentenceExtractor
* @see SentenceExtractor
*/
@AllArgsConstructor
public class DocumentLanguageData {

View File

@ -17,9 +17,6 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
)
{
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
private static final int TF_IDF_HIGH_LIMIT = 64;
public KeywordMetadata(EnumSet<EdgePageWordFlags> flags) {
this(new HashSet<>(50), new HashSet<>(10), new HashSet<>(50),
new HashMap<>(15_000),
@ -31,7 +28,8 @@ public record KeywordMetadata(HashSet<String> titleKeywords,
this(EnumSet.noneOf(EdgePageWordFlags.class));
}
public long forWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
private static final KeywordCounter.WordFrequencyData empty = new KeywordCounter.WordFrequencyData(0, 0);
public long getMetadataForWord(EnumSet<EdgePageWordFlags> flagsTemplate, String stemmed) {
KeywordCounter.WordFrequencyData tfidf = wordsTfIdf.getOrDefault(stemmed, empty);
EnumSet<EdgePageWordFlags> flags = flagsTemplate.clone();

View File

@ -24,7 +24,7 @@ public class WordRep implements Comparable<WordRep> {
@Override
public int compareTo(@NotNull WordRep o) {
return stemmed.compareTo(o.stemmed);
return word.compareTo(o.word);
}
@Override

View File

@ -1,16 +1,14 @@
package nu.marginalia.util.language.processing;
package nu.marginalia.util.language.processing.sentence;
import com.github.datquocnguyen.RDRPOSTagger;
import com.github.jknack.handlebars.internal.lang3.StringUtils;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TObjectIntHashMap;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import nu.marginalia.util.StringPool;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.HtmlTagCleaner;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
@ -24,25 +22,22 @@ import javax.inject.Inject;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.*;
import java.util.regex.Pattern;
import static nu.marginalia.util.language.WordPatterns.*;
public class SentenceExtractor {
private SentenceDetectorME sentenceDetector;
private final RDRPOSTagger rdrposTagger;
private final PorterStemmer porterStemmer = new PorterStemmer();
private boolean legacyMode = false;
private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class);
private static final HtmlTagCleaner tagCleaner = new HtmlTagCleaner();
private final ThreadLocal<StringPool> stringPool = ThreadLocal.withInitial(() -> StringPool.create(10_000));
@SneakyThrows @Inject
public SentenceExtractor(LanguageModels models) {
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
@ -66,6 +61,22 @@ public class SentenceExtractor {
final String text = asText(doc);
final DocumentSentence[] textSentences = extractSentencesFromString(text);
String title = getTitle(doc, textSentences);
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts);
}
public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
}
private String getTitle(Document doc, DocumentSentence[] textSentences) {
String title = doc.getElementsByTag("title").text() + " . " +
Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse("");
@ -82,34 +93,7 @@ public class SentenceExtractor {
}
}
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
var titleSentences = extractSentencesFromString(title.toLowerCase());
return new DocumentLanguageData(textSentences, titleSentences, counts);
}
public DocumentLanguageData extractSentences(String text) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
String title = "";
for (DocumentSentence textSentence : textSentences) {
if (textSentence.length() > 0) {
title = textSentence.originalSentence.toLowerCase();
break;
}
}
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
}
public DocumentLanguageData extractSentences(String text, String title) {
final DocumentSentence[] textSentences = extractSentencesFromString(text);
TObjectIntHashMap<String> counts = calculateWordCounts(textSentences);
return new DocumentLanguageData(textSentences, extractSentencesFromString(title.toLowerCase()), counts);
return title;
}
@ -125,79 +109,95 @@ public class SentenceExtractor {
return counts;
}
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
// private static final Pattern badCharPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|(\\.(\\s+|$))");
private boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;
return true;
}
private String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
newChars[pi++] = ' ';
}
}
s = new String(newChars, 0, pi);
if (s.startsWith(".")) {
s = s.substring(1);
if (s.isBlank())
return "";
}
return s;
}
public DocumentSentence extractSentence(String text) {
var wordsAndSeps = splitSegment(text);
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text);
var words = wordsAndSeps.words;
var seps = wordsAndSeps.separators;
var lc = toLc(wordsAndSeps.words);
var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words);
return new DocumentSentence(
sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc)
);
}
public String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}
public DocumentSentence[] extractSentencesFromString(String text) {
String[] sentences;
String textNormalizedSpaces = normalizeSpaces(text);
String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text);
try {
sentences = sentenceDetector.sentDetect(textNormalizedSpaces);
}
catch (Exception ex) {
// shitty fallback logic
sentences = StringUtils.split(textNormalizedSpaces, '.');
}
sentences = preCleanSentences(sentences);
final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];
for (int i = 0; i < tokens.length; i++) {
var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > 250) {
tokens[i] = Arrays.copyOf(tokens[i], 250);
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}
var sPool = stringPool.get();
for (int i = 0; i < tokens.length; i++) {
tokens[i] = sPool.internalize(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
// don't need to internalize this
}
for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]);
tokensLc[i] = sPool.internalize(tokensLc[i]);
}
for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
stemmedWords[i] = sPool.internalize(stemmedWords[i]);
}
DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
String fullString;
if (i == 0) {
fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]);
}
else {
fullString = "";
}
ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
}
return ret;
}
private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)");
private String[] preCleanSentences(String[] sentences) {
if (sentences.length > 250) {
sentences = Arrays.copyOf(sentences, 250);
}
@ -212,53 +212,13 @@ public class SentenceExtractor {
sentenceList.add(s);
}
}
sentences = sentenceList.toArray(String[]::new);
final String[][] tokens = new String[sentences.length][];
final int[][] separators = new int[sentences.length][];
final String[][] posTags = new String[sentences.length][];
final String[][] tokensLc = new String[sentences.length][];
final String[][] stemmedWords = new String[sentences.length][];
for (int i = 0; i < tokens.length; i++) {
var wordsAndSeps = splitSegment(sentences[i]); //tokenizer.tokenize(sentences[i]);
tokens[i] = wordsAndSeps.words;
separators[i] = wordsAndSeps.separators;
if (tokens[i].length > 250) {
tokens[i] = Arrays.copyOf(tokens[i], 250);
separators[i] = Arrays.copyOf(separators[i], 250);
}
for (int j = 0; j < tokens[i].length; j++) {
while (tokens[i][j].endsWith(".")) {
tokens[i][j] = StringUtils.removeEnd(tokens[i][j], ".");
}
}
}
for (int i = 0; i < tokens.length; i++) {
posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
tokensLc[i] = toLc(tokens[i]);
}
for (int i = 0; i < tokens.length; i++) {
stemmedWords[i] = stemSentence(tokensLc[i]);
}
DocumentSentence[] ret = new DocumentSentence[sentences.length];
for (int i = 0; i < ret.length; i++) {
ret[i] = new DocumentSentence(sanitizeString(sentences[i]), tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]);
}
return ret;
return sentenceList.toArray(String[]::new);
}
private String[] stemSentence(String[] strings) {
String[] stemmed = new String[strings.length];
for (int i = 0; i < stemmed.length; i++) {
var sent = cleanPossessive(strings[i]);
var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]);
try {
stemmed[i] = porterStemmer.stem(sent);
}
@ -269,27 +229,6 @@ public class SentenceExtractor {
return stemmed;
}
private String cleanPossessive(String s) {
int end = s.length();
if (s.endsWith("\'")) {
return s.substring(0, end-1);
} else if (end > 2 && s.charAt(end-2) == '\'' && "sS".indexOf(s.charAt(end-1))>=0) {
return s.substring(0, end-2).toLowerCase();
}
else {
return s;
}
}
private String[] toLc(String[] words) {
String[] lower = new String[words.length];
for (int i = 0; i < lower.length; i++) {
lower[i] = cleanPossessive(words[i]).toLowerCase();
}
return lower;
}
public String asText(Document dc) {
tagCleaner.clean(dc);
@ -299,67 +238,6 @@ public class SentenceExtractor {
return text.substring(0, (int) (text.length()*0.95));
}
@AllArgsConstructor @Getter
private static class WordsAndSeparators {
String[] words;
int[] separators;
}
private WordsAndSeparators splitSegment(String segment) {
var matcher = wordBreakPattern.matcher(segment);
List<String> words = new ArrayList<>(segment.length()/6);
TIntArrayList separators = new TIntArrayList(segment.length()/6);
int start = 0;
int wordStart = 0;
while (wordStart <= segment.length()) {
if (!matcher.find(wordStart)) {
words.add(segment.substring(wordStart));
separators.add(WordSeparator.SPACE);
break;
}
if (wordStart != matcher.start()) {
words.add(segment.substring(wordStart, matcher.start()));
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
}
wordStart = matcher.end();
}
String[] parts = words.toArray(String[]::new);
int length = 0;
for (int i = 0; i < parts.length; i++) {
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
parts[i] = null;
}
else {
length++;
}
}
String[] ret = new String[length];
int[] seps = new int[length];
for (int i = 0, j=0; i < parts.length; i++) {
if (parts[i] != null) {
seps[j] = separators.getQuick(i);
ret[j++] = parts[i];
}
}
for (int i = 0; i < ret.length; i++) {
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
}
return new WordsAndSeparators(ret, seps);
}
public boolean isLegacyMode() {
return legacyMode;
}
public void setLegacyMode(boolean legacyMode) {
this.legacyMode = legacyMode;
}
}

View File

@ -0,0 +1,93 @@
package nu.marginalia.util.language.processing.sentence;
import java.util.Arrays;
import java.util.Objects;
public class SentenceExtractorStringUtils {
public static String sanitizeString(String s) {
char[] newChars = new char[s.length()];
int pi = 0;
boolean changed = false;
for (int i = 0; i < newChars.length; i++) {
char c = s.charAt(i);
if (!isBadChar(c)) {
newChars[pi++] = c;
}
else {
changed = true;
newChars[pi++] = ' ';
}
}
if (changed) {
s = new String(newChars, 0, pi);
}
if (s.startsWith(".")) {
s = s.substring(1);
}
if (s.isBlank()) {
return "";
}
return s;
}
private static boolean isBadChar(char c) {
if (c >= 'a' && c <= 'z') return false;
if (c >= 'A' && c <= 'Z') return false;
if (c >= '0' && c <= '9') return false;
if ("_#@.".indexOf(c) >= 0) return false;
if (c >= '\u00C0' && c <= '\u00D6') return false;
if (c >= '\u00D8' && c <= '\u00F6') return false;
if (c >= '\u00F8' && c <= '\u00FF') return false;
return true;
}
public static String normalizeSpaces(String s) {
if (s.indexOf('\t') >= 0) {
s = s.replace('\t', ' ');
}
if (s.indexOf('\n') >= 0) {
s = s.replace('\n', ' ');
}
return s;
}
public static String toLowerCaseStripPossessive(String word) {
String val = stripPossessive(word).toLowerCase();
if (Objects.equals(val, word)) {
return word;
}
return val;
}
public static String[] toLowerCaseStripPossessive(String[] words) {
String[] lc = new String[words.length];
Arrays.setAll(lc, i -> SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i]));
return lc;
}
public static String stripPossessive(String s) {
int end = s.length();
if (s.endsWith("'")) {
return s.substring(0, end-1);
}
if (s.endsWith("'s") || s.endsWith("'S")) {
return s.substring(0, end-2);
}
return s;
}
}
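
A couple of illustrative inputs for the possessive stripping:

assert SentenceExtractorStringUtils.stripPossessive("Marginalia's").equals("Marginalia");
assert SentenceExtractorStringUtils.stripPossessive("James'").equals("James");
assert SentenceExtractorStringUtils.toLowerCaseStripPossessive("Marginalia's").equals("marginalia");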

View File

@ -0,0 +1,72 @@
package nu.marginalia.util.language.processing.sentence;
import gnu.trove.list.array.TIntArrayList;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import java.util.ArrayList;
import java.util.List;
import static nu.marginalia.util.language.WordPatterns.*;
public class SentenceSegmentSplitter {
@AllArgsConstructor
@Getter
public static class SeparatedSentence {
String[] words;
int[] separators;
}
public static SeparatedSentence splitSegment(String segment) {
var matcher = wordBreakPattern.matcher(segment);
List<String> words = new ArrayList<>(segment.length()/6);
TIntArrayList separators = new TIntArrayList(segment.length()/6);
int wordStart = 0;
while (wordStart <= segment.length()) {
if (!matcher.find(wordStart)) {
words.add(segment.substring(wordStart));
separators.add(WordSeparator.SPACE);
break;
}
if (wordStart != matcher.start()) {
words.add(segment.substring(wordStart, matcher.start()));
separators.add(segment.substring(matcher.start(), matcher.end()).isBlank() ? WordSeparator.SPACE : WordSeparator.COMMA);
}
wordStart = matcher.end();
}
String[] parts = words.toArray(String[]::new);
int length = 0;
for (int i = 0; i < parts.length; i++) {
if (parts[i].isBlank() || parts[i].length() >= MAX_WORD_LENGTH || characterNoisePredicate.test(parts[i])) {
parts[i] = null;
}
else {
length++;
}
}
String[] ret = new String[length];
int[] seps = new int[length];
for (int i = 0, j=0; i < parts.length; i++) {
if (parts[i] != null) {
seps[j] = separators.getQuick(i);
ret[j++] = parts[i];
}
}
for (int i = 0; i < ret.length; i++) {
if (ret[i].startsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(1); }
if (ret[i].endsWith("'") && ret[i].length() > 1) { ret[i] = ret[i].substring(0, ret[i].length()-1); }
}
return new SeparatedSentence(ret, seps);
}
}
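
A minimal usage sketch of the extracted splitter; Lombok's @Getter supplies the accessors:

var parts = SentenceSegmentSplitter.splitSegment("Hello, brave new world");
String[] words = parts.getWords();    // noise and over-long tokens are dropped
int[] seps = parts.getSeparators();   // WordSeparator.SPACE or COMMA per token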

View File

@ -1,39 +0,0 @@
package nu.marginalia.util.ranking;
public class BuggyReversePageRank extends RankingAlgorithm {
public BuggyReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override
RankVector createNewRankVector(RankVector rank) {
double rankNorm = rank.norm();
RankVector newRank = new RankVector(0);
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
var links = linkDataSrc2Dest[domainId];
if (links != null && links.size() > 0) {
double newRankValue = 0;
for (int j = 0; j < links.size(); j++) {
newRankValue += rank.get(links.getQuick(j)) / links.size();
}
newRank.set(domainId, 0.85*newRankValue/rankNorm);
}
}
return newRank;
}
@Override
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
originDomainIds.forEach(id -> vector.increment(domainIdToIndex.get(id), dNorm/oldNorm));
}
}

View File

@ -1,45 +0,0 @@
package nu.marginalia.util.ranking;
public class BuggyStandardPageRank extends RankingAlgorithm {
public BuggyStandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@Override
RankingAlgorithm.RankVector createNewRankVector(RankingAlgorithm.RankVector rank) {
RankVector newRank = new RankVector(0);
for (int domainId = 0; domainId < domainIndexToId.size(); domainId++) {
var links = linkDataSrc2Dest[domainId];
double newRankValue = 0;
if (links != null && links.size() > 0) {
for (int j = 0; j < links.size(); j++) {
int linkedDomain = links.getQuick(j);
int linkSize = 1;
var bl = linkDataSrc2Dest[linkedDomain];
if (bl != null) {
linkSize = bl.size();
}
newRankValue += rank.get(linkedDomain) / linkSize;
}
}
newRank.set(domainId, 0.85 * newRankValue);
}
return newRank;
}
@Override
void adjustRankVector(RankingAlgorithm.RankVector vector, double dNorm, double oldNorm) {
originDomainIds.forEach(id -> vector.increment(id, dNorm/originDomainIds.size()));
vector.incrementAll(0.14*dNorm/vector.size());
}
}

View File

@ -1,89 +0,0 @@
package nu.marginalia.util.ranking.tool;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.SneakyThrows;
import lombok.ToString;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DedupTool {
private static final Logger logger = LoggerFactory.getLogger(DedupTool.class);
public Set<String> originDomains = new HashSet<>();
public Set<Integer> originDomainIds = new HashSet<>();
public final long domainIdMax = -1;
public int domainCount;
private volatile static int rankMax;
public int maxId() {
return (int) domainIdMax;
}
public int domainCount() {
return domainCount;
}
static LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
volatile static boolean running = true;
@AllArgsConstructor @ToString @Getter
static class Data {
String url;
int id;
String domain;
}
@SneakyThrows
public static void main(String... args) {
Driver driver = new Driver();
var ds = new DatabaseModule().provideConnection();
Map<Integer, Map<Integer, List<Data>>> domainToHashToUrl = new HashMap<>();
try (var conn = ds.getConnection();
var fetchStmt = conn.prepareStatement("SELECT URL_TOP_DOMAIN_ID,DATA_HASH,URL,EC_URL.ID,EC_DOMAIN.DOMAIN_NAME FROM EC_URL INNER JOIN EC_DOMAIN ON EC_DOMAIN.ID=DOMAIN_ID WHERE DATA_HASH IS NOT NULL");
var updateStmt = conn.prepareStatement("UPDATE EC_URL SET STATE='redirect' WHERE ID=?")
) {
fetchStmt.setFetchSize(10_000);
var rsp = fetchStmt.executeQuery();
while (rsp.next()) {
domainToHashToUrl.computeIfAbsent(rsp.getInt(1), i -> new HashMap<>())
.computeIfAbsent(rsp.getInt(2), i -> new ArrayList<>()).add(new Data(rsp.getString(3), rsp.getInt(4), rsp.getString(5)));
}
List<Integer> updateIds = new ArrayList<>();
domainToHashToUrl.forEach((domain, hashes) -> {
hashes.forEach((hash, urls) -> {
if (urls.size() > 1) {
Comparator<Data> c = Comparator.comparing(d -> d.domain.length());
var urls2 = urls.stream().sorted(c.thenComparing(d -> d.url.length()))
.collect(Collectors.partitioningBy(d -> d.url.endsWith("/")));
Stream
.concat(urls2.get(true).stream(),urls2.get(false).stream()).skip(1)
.map(Data::getId)
.forEach(updateIds::add);
}
});
});
for (int id : updateIds) {
updateStmt.setInt(1, id);
updateStmt.executeUpdate();
}
}
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
import com.google.gson.*;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
@ -24,6 +25,8 @@ public class GsonFactory {
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
.serializeSpecialFloatingPointValues()
.create();
}
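
With these adapters a BigString field round-trips as an ordinary JSON string. A hedged sketch, where Page is an illustrative record and GsonFactory.get() is assumed to return the Gson built above:

record Page(BigString documentBody) {}

Gson gson = GsonFactory.get();
String json = gson.toJson(new Page(BigString.encode("<html>...</html>")));
Page back = gson.fromJson(json, Page.class);   // the body is re-encoded on read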

View File

@ -13,7 +13,6 @@ import nu.marginalia.wmsa.memex.MemexMain;
import nu.marginalia.wmsa.podcasts.PodcastScraperMain;
import nu.marginalia.wmsa.renderer.RendererMain;
import nu.marginalia.wmsa.resource_store.ResourceStoreMain;
import nu.marginalia.wmsa.smhi.scraper.SmhiScraperMain;
import org.apache.logging.log4j.core.lookup.MainMapLookup;
import java.util.Map;
@ -26,7 +25,6 @@ public enum ServiceDescriptor {
AUTH("auth", 5003, AuthMain.class),
API("api", 5004, ApiMain.class),
SMHI_SCRAPER("smhi-scraper",5012, SmhiScraperMain.class),
PODCST_SCRAPER("podcast-scraper", 5013, PodcastScraperMain.class),
EDGE_INDEX("edge-index", 5021, EdgeIndexMain.class),

View File

@ -2,9 +2,10 @@ package nu.marginalia.wmsa.edge.assistant.dict;
import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TLongHashSet;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
@ -18,11 +19,10 @@ import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@ -101,12 +101,15 @@ public class TermFrequencyDict {
fjp.execute(() -> {
TLongHashSet words = new TLongHashSet(10_000);
for (var doc : domain.doc) {
if (doc.documentBody == null)
continue;
docCount.incrementAndGet();
Document parsed = Jsoup.parse(doc.documentBody);
Document parsed = Jsoup.parse(doc.documentBody.decode());
parsed.body().filter(new DomPruningFilter(0.5));
DocumentLanguageData dld = se.get().extractSentences(parsed);
@ -115,28 +118,30 @@ public class TermFrequencyDict {
return;
}
Set<String> words = new HashSet<>(10_000);
for (var sent : dld.sentences) {
for (var word : sent) {
words.add(word.stemmed());
words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8)));
}
}
fjp.execute(() -> {
synchronized (counts) {
for (var word : words) {
counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
}
}
});
synchronized (counts) {
words.forEach(w -> {
counts.adjustOrPutValue(w, 1, 1);
return true;
});
}
words.clear();
}
System.out.println(domain.domain + "\t" + counts.size());
});
}
fjp.shutdown();
fjp.awaitTermination(10, TimeUnit.SECONDS);
fjp.awaitTermination(10, TimeUnit.DAYS);
try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
synchronized (counts) {
@ -155,14 +160,6 @@ public class TermFrequencyDict {
}
System.out.println(docCount.get());
//
// counts.forEachEntry((w,c) -> {
// if (c > 3L) {
// System.out.println(w + ":" + c);
// }
// return true;
// });
}
public static long getStringHash(String s) {

View File

@ -46,17 +46,12 @@ public class ConverterMain {
InstructionsCompiler compiler,
Gson gson
) throws Exception {
;
logger.info("Starting pipe");
try (WorkLog processLog = plan.createProcessWorkLog();
ConversionLog log = new ConversionLog(plan.process.getDir())) {
instructionWriter = new LoadInstructionWriter(log, plan.process.getDir(), gson);
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 16, 4, 2) {
@Override
protected ProcessingInstructions onProcess(CrawledDomain domainData) {

View File

@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain {
for (var doc : crawledDomain.doc) {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
anchorTextExtractor.processDocument(doc.url, doc.documentBody);
anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode());
}
}
}

View File

@ -7,7 +7,7 @@ import nu.marginalia.util.gregex.GuardedRegex;
import nu.marginalia.util.gregex.GuardedRegexFactory;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.model.DisqualifiedException;
@ -178,11 +178,13 @@ public class DocumentProcessor {
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
throws DisqualifiedException, URISyntaxException {
if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody)) {
String documentBody = crawledDocument.documentBody.decode();
if (languageFilter.isBlockedUnicodeRange(documentBody)) {
throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
}
Document doc = Jsoup.parse(crawledDocument.documentBody);
Document doc = Jsoup.parse(documentBody);
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
// I've never encountered a website where this hasn't been a severe indicator

View File

@ -42,7 +42,7 @@ public class DomainProcessor {
fixBadCanonicalTags(crawledDomain.doc);
StringPool stringPool = new StringPool(1000 + 100 * crawledDomain.doc.size());
StringPool stringPool = StringPool.create(1000 + 100 * crawledDomain.doc.size());
for (var doc : crawledDomain.doc) {
var processedDoc = documentProcessor.process(doc, crawledDomain);

View File

@ -33,8 +33,7 @@ public class SiteWords {
Set<String> commonSiteWords = new HashSet<>(10);
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
EdgePageWordFlags.Subjects,
EdgePageWordFlags.TfIdfHigh));
EdgePageWordFlags.Subjects));
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(processedDomain,
EdgePageWordFlags.Title));

View File

@ -11,7 +11,7 @@ public class CommonKeywordExtractor {
private static final int MIN_REQUIRED_DOCUMENTS = 25;
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 15;
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;

View File

@ -126,6 +126,9 @@ public class LinkParser {
if (doesUrlStringHaveProtocol(s)) {
return s;
}
else if (s.startsWith("//")) { // scheme-relative URL
return baseUrl.proto + ":" + s;
}
String[] parts = paramSeparatorPattern.split(s, 2);
String path = parts[0];
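
The new branch above handles scheme-relative URLs (links of the form //host/path, which inherit the protocol of the page they appear on). A minimal sketch of the same resolution rule, with illustrative names rather than the project's actual LinkParser API:

class SchemeRelativeUrlSketch {
    // Resolve a possibly scheme-relative URL against the protocol of the base page.
    static String resolve(String baseProto, String s) {
        if (s.startsWith("//")) {           // e.g. //example.com/style.css
            return baseProto + ":" + s;     // -> https://example.com/style.css
        }
        return s;                           // not scheme-relative; the real parser handles other cases separately
    }

    public static void main(String[] args) {
        System.out.println(resolve("https", "//example.com/style.css"));
        // prints https://example.com/style.css
    }
}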

View File

@ -2,11 +2,11 @@ package nu.marginalia.wmsa.edge.crawling;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import jdkoverride.LargeLineBufferedReader;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
@ -19,61 +19,41 @@ import java.util.concurrent.TimeUnit;
public class CrawledDomainReader {
private final Gson gson = GsonFactory.get();
private final ForkJoinPool pool = new ForkJoinPool(4);
private final ForkJoinPool pool = new ForkJoinPool(6);
public CrawledDomainReader() {
}
public CrawledDomain read(Path path) throws IOException {
List<CrawledDocument> docs = new ArrayList<>();
CrawledDomain domain = null;
DomainDataAssembler domainData = new DomainDataAssembler();
try (var br = new LargeLineBufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("//")) {
String identifier = line;
String data = br.readLine();
try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))))) {
br.mark(2);
boolean legacy = '{' == br.read();
br.reset();
if (legacy) {
domain = gson.fromJson(br, CrawledDomain.class);
}
else {
String line;
while ((line = br.readLine()) != null) {
if (line.startsWith("//")) {
String nextLine = br.readLine();
if (nextLine == null) break;
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
domain = gson.fromJson(nextLine, CrawledDomain.class);
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
pool.execute(() -> {
var doc = gson.fromJson(nextLine, CrawledDocument.class);
synchronized (docs) {
docs.add(doc);
}
});
}
} else if (line.charAt(0) == '{') {
domain = gson.fromJson(line, CrawledDomain.class);
}
pool.execute(() -> deserializeLine(identifier, data, domainData));
}
}
}
pool.awaitQuiescence(10, TimeUnit.SECONDS);
while (!pool.awaitQuiescence(1, TimeUnit.SECONDS));
if (domain == null) {
return null;
return domainData.assemble();
}
private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) {
if (null == data) {
return;
}
if (!docs.isEmpty()) {
if (domain.doc == null)
domain.doc = new ArrayList<>();
domain.doc.addAll(docs);
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class));
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class));
}
return domain;
}
public CrawledDomain readRuntimeExcept(Path path) {
@ -84,4 +64,27 @@ public class CrawledDomainReader {
throw new RuntimeException(ex);
}
}
private static class DomainDataAssembler {
private CrawledDomain domainPrototype;
private final List<CrawledDocument> docs = new ArrayList<>();
public synchronized void acceptDomain(CrawledDomain domain) {
this.domainPrototype = domain;
}
public synchronized void acceptDoc(CrawledDocument doc) {
docs.add(doc);
}
public synchronized CrawledDomain assemble() {
if (!docs.isEmpty()) {
if (domainPrototype.doc == null)
domainPrototype.doc = new ArrayList<>();
domainPrototype.doc.addAll(docs);
}
return domainPrototype;
}
}
}
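
The rewritten reader above supports two on-disk layouts: a legacy layout where the whole file is a single JSON object (detected by peeking at a leading '{'), and a line-oriented layout where an identifier line starting with "//" is followed by one line of JSON, which the DomainDataAssembler folds into a CrawledDomain. The sketch below only illustrates the line-oriented shape; the identifier strings and JSON fields are placeholders, since the real values live in CrawledDomain.SERIAL_IDENTIFIER and CrawledDocument.SERIAL_IDENTIFIER, and the real files are additionally zstd-compressed.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;

class CrawlFormatSketch {
    public static void main(String[] args) throws IOException {
        Path out = Files.createTempFile("crawl-format-sketch", ".txt");
        Files.write(out, List.of(
                "// DOMAIN",                           // placeholder identifier line
                "{\"domain\":\"example.com\"}",        // one line of JSON follows it
                "// DOCUMENT",
                "{\"url\":\"https://example.com/\"}",
                "// DOCUMENT",
                "{\"url\":\"https://example.com/about\"}"
        ));
        Files.readAllLines(out).forEach(System.out::println);
    }
}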

View File

@ -24,7 +24,7 @@ public class UrlBlocklist {
patterns.add(s -> s.contains("-download-free"));
// long base64-strings in URLs are typically git hashes or the like, rarely worth crawling
patterns.add(GuardedRegexFactory.minLength(48, ".*/[^/]*[a-f0-9]{32,}(/|$)"));
patterns.add(this::hashTest);
// link farms &c
patterns.add(GuardedRegexFactory.contains("/download", "/download(-([A-Za-z]+|[0-9]+)){4,}\\.(htm|html|php)$"));
@ -38,6 +38,33 @@ public class UrlBlocklist {
}
public boolean hashTest(String path) {
// look for strings that might be a git hash (i.e. long hexadecimal strings)
// there is no good guard for a regular expression for this, so hand-rolling
// the check is necessary
int runLength = 0;
int minLength = 32;
if (path.length() <= minLength + 2)
return false;
for (int i = 0; i < path.length(); i++) {
int c = path.charAt(i);
if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) {
runLength++;
}
else if (runLength >= minLength) {
return true;
}
else {
runLength = 0;
}
}
return runLength >= minLength;
}
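
As a worked example of the check above (the paths are made up): a path like /commit/0123456789abcdef0123456789abcdef01234567 contains a 40-character hexadecimal run, which exceeds the 32-character threshold and is rejected, while /blog/2023/02/some-article never builds up a run anywhere near that length and passes.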
public boolean isUrlBlocked(EdgeUrl url) {
try {
if (badDomains.contains(url.domain.domain)) {

View File

@ -1,6 +1,8 @@
package nu.marginalia.wmsa.edge.crawling.model;
import lombok.Builder;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.util.bigstring.CompressedBigString;
@Builder
public class CrawledDocument implements SerializableCrawlData {
@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatusDesc;
public String headers;
public String documentBody;
public BigString documentBody;
public String documentBodyHash;
public String canonicalUrl;

View File

@ -206,7 +206,7 @@ public class CrawlerRetreiver {
if (doc.documentBody != null) {
doc.documentBodyHash = createHash(doc.documentBody);
doc.documentBodyHash = createHash(doc.documentBody.decode());
Optional<Document> parsedDoc = parseDoc(doc);
EdgeUrl url = new EdgeUrl(doc.url);
@ -251,7 +251,7 @@ public class CrawlerRetreiver {
private Optional<Document> parseDoc(CrawledDocument doc) {
if (doc.documentBody == null)
return Optional.empty();
return Optional.of(Jsoup.parse(doc.documentBody));
return Optional.of(Jsoup.parse(doc.documentBody.decode()));
}
public boolean isSameDomain(EdgeUrl url) {

View File

@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.ToString;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
@ -271,7 +272,7 @@ public class HttpFetcher {
.canonicalUrl(canonical)
.httpStatus(rsp.code())
.url(responseUrl.toString())
.documentBody(strData)
.documentBody(BigString.encode(strData))
.build();
}
@ -325,7 +326,7 @@ public class HttpFetcher {
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
return robotsParser.parseContent(doc.url,
doc.documentBody.getBytes(StandardCharsets.UTF_8),
doc.documentBody.getBytes(),
doc.contentType,
userAgent);
}
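
The documentBody field is now stored as a BigString (see the CrawledDocument change above) and decoded back to a String only where a parser needs it, while parseRobotsTxt hands the raw bytes straight to the robots parser. The sketch below shows one way such a wrapper could keep large bodies compressed in memory; it is an assumption for illustration, not the project's actual nu.marginalia.util.bigstring implementation.

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

// Hypothetical BigString-style wrapper: hold the document body compressed,
// decode on demand. Illustrative only.
class CompressedBodySketch {
    private final byte[] compressed;

    private CompressedBodySketch(byte[] compressed) { this.compressed = compressed; }

    static CompressedBodySketch encode(String s) {
        byte[] raw = s.getBytes(StandardCharsets.UTF_8);
        Deflater deflater = new Deflater(Deflater.BEST_SPEED);
        deflater.setInput(raw);
        deflater.finish();
        ByteArrayOutputStream out = new ByteArrayOutputStream(raw.length / 2 + 16);
        byte[] buf = new byte[4096];
        while (!deflater.finished()) {
            out.write(buf, 0, deflater.deflate(buf));
        }
        deflater.end();
        return new CompressedBodySketch(out.toByteArray());
    }

    String decode() throws DataFormatException {
        Inflater inflater = new Inflater();
        inflater.setInput(compressed);
        ByteArrayOutputStream out = new ByteArrayOutputStream(compressed.length * 4);
        byte[] buf = new byte[4096];
        while (!inflater.finished()) {
            out.write(buf, 0, inflater.inflate(buf));
        }
        inflater.end();
        return out.toString(StandardCharsets.UTF_8);
    }

    public static void main(String[] args) throws DataFormatException {
        CompressedBodySketch body = CompressedBodySketch.encode("<html>hello</html>");
        System.out.println(body.decode()); // round-trips back to the original text
    }
}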

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.edge.index;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import java.io.IOException;
@ -9,14 +10,16 @@ import java.io.IOException;
public class EdgeIndexControl {
private final IndexServicesFactory servicesFactory;
private final EdgeIndexSearchSetsService searchSetsService;
@Inject
public EdgeIndexControl(IndexServicesFactory servicesFactory) {
public EdgeIndexControl(IndexServicesFactory servicesFactory, EdgeIndexSearchSetsService searchSetsService) {
this.servicesFactory = servicesFactory;
this.searchSetsService = searchSetsService;
}
public void regenerateIndex() throws IOException {
servicesFactory.convertIndex();
servicesFactory.convertIndex(searchSetsService.getDomainRankings());
System.gc();
}

View File

@ -13,13 +13,6 @@ public class EdgeIndexModule extends AbstractModule {
public void configure() {
if (Boolean.getBoolean("small-ram")) {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 27);
}
else {
bind(Long.class).annotatedWith(Names.named("edge-dictionary-hash-map-size")).toInstance(1L << 31);
}
}
@Provides

View File

@ -5,11 +5,11 @@ import com.google.inject.Singleton;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklist;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.postings.SearchIndex;
import nu.marginalia.wmsa.edge.index.postings.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexConverter;
@ -20,6 +20,7 @@ import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexConverter;
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPrioReader;
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexPriorityParameters;
import nu.marginalia.wmsa.edge.index.postings.reverse.ReverseIndexReader;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -33,7 +34,6 @@ import java.util.concurrent.Callable;
@Singleton
public class IndexServicesFactory {
private final Path tmpFileDir;
private final EdgeDomainBlacklist domainBlacklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -48,7 +48,6 @@ public class IndexServicesFactory {
private final PartitionedDataFile revPrioIndexWords;
private volatile static KeywordLexicon keywordLexicon;
private final Long dictionaryHashMapSize;
private final Path searchSetsBase;
@ -59,14 +58,10 @@ public class IndexServicesFactory {
public IndexServicesFactory(
@Named("tmp-file-dir") Path tmpFileDir,
@Named("partition-root-slow") Path partitionRootSlow,
@Named("partition-root-fast") Path partitionRootFast,
@Named("edge-dictionary-hash-map-size") Long dictionaryHashMapSize,
EdgeDomainBlacklist domainBlacklist
@Named("partition-root-fast") Path partitionRootFast
) throws IOException {
this.tmpFileDir = tmpFileDir;
this.dictionaryHashMapSize = dictionaryHashMapSize;
this.domainBlacklist = domainBlacklist;
this.writerIndexFile = new PartitionedDataFile(partitionRootSlow, "page-index.dat");
this.keywordLexiconFile = new RootDataFile(partitionRootSlow, "dictionary.dat");
@ -98,7 +93,7 @@ public class IndexServicesFactory {
public KeywordLexicon getKeywordLexicon() {
if (keywordLexicon == null) {
final var journal = new KeywordLexiconJournal(keywordLexiconFile.get());
keywordLexicon = new KeywordLexicon(journal, new DictionaryHashMap(dictionaryHashMapSize));
keywordLexicon = new KeywordLexicon(journal, DictionaryMap.create());
}
return keywordLexicon;
}
@ -109,15 +104,15 @@ public class IndexServicesFactory {
}
public void convertIndex() throws IOException {
convertForwardIndex();
convertFullReverseIndex();
convertPriorityReverseIndex();
public void convertIndex(DomainRankings domainRankings) throws IOException {
convertForwardIndex(domainRankings);
convertFullReverseIndex(domainRankings);
convertPriorityReverseIndex(domainRankings);
}
private void convertFullReverseIndex() throws IOException {
private void convertFullReverseIndex(DomainRankings domainRankings) throws IOException {
logger.info("Converting full reverse index");
@ -125,6 +120,7 @@ public class IndexServicesFactory {
var journalReader = new SearchIndexJournalReaderSingleFile(longArray);
var converter = new ReverseIndexConverter(tmpFileDir,
journalReader,
domainRankings,
revIndexWords.get(NEXT_PART).toPath(),
revIndexDoc.get(NEXT_PART).toPath());
@ -133,7 +129,7 @@ public class IndexServicesFactory {
tryGc();
}
private void convertPriorityReverseIndex() throws IOException {
private void convertPriorityReverseIndex(DomainRankings domainRankings) throws IOException {
logger.info("Converting priority reverse index");
@ -143,6 +139,7 @@ public class IndexServicesFactory {
var converter = new ReverseIndexConverter(tmpFileDir,
journalReader,
domainRankings,
revPrioIndexWords.get(NEXT_PART).toPath(),
revPrioIndexDoc.get(NEXT_PART).toPath());
@ -151,13 +148,14 @@ public class IndexServicesFactory {
tryGc();
}
private void convertForwardIndex() throws IOException {
private void convertForwardIndex(DomainRankings domainRankings) throws IOException {
logger.info("Converting forward index data");
new ForwardIndexConverter(tmpFileDir,
new ForwardIndexConverter(
writerIndexFile.get(0),
fwdIndexDocId.get(NEXT_PART).toPath(),
fwdIndexDocData.get(NEXT_PART).toPath())
fwdIndexDocData.get(NEXT_PART).toPath(),
domainRankings)
.convert();
tryGc();
@ -215,8 +213,8 @@ public class IndexServicesFactory {
}
}
public SearchIndex createIndexBucket() {
return new SearchIndex(this, new EdgeIndexControl(this));
public SearchIndex createIndexBucket(EdgeIndexSearchSetsService searchSetsService) {
return new SearchIndex(this, new EdgeIndexControl(this, searchSetsService));
}
public SearchIndexReader getSearchIndexReader() throws IOException {

View File

@ -3,7 +3,8 @@ package nu.marginalia.wmsa.edge.index.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
@ -32,14 +33,9 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
@Inject
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
long hashMapSize = 1L << 31;
if (Boolean.getBoolean("small-ram")) {
hashMapSize = 1L << 27;
}
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
lexicon = new KeywordLexicon(lexiconJournal, DictionaryMap.create());
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
}
@ -72,7 +68,7 @@ public class EdgeIndexLocalService implements EdgeIndexWriterClient {
String word = words[i];
long id = lexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
ids[putIdx++] = meta[i];
}

View File

@ -4,7 +4,6 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import io.prometheus.client.Gauge;
import lombok.SneakyThrows;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.DictionaryMap;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import org.slf4j.Logger;
@ -55,7 +54,7 @@ public class KeywordLexicon implements AutoCloseable {
private int getOrInsert(byte[] bytes) {
if (bytes.length >= Byte.MAX_VALUE) {
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
return DictionaryHashMap.NO_VALUE;
return DictionaryMap.NO_VALUE;
}
final long key = hashFunction.hashBytes(bytes).padToLong();

View File

@ -8,7 +8,8 @@ import java.util.Set;
import static java.lang.Math.max;
import static java.lang.Math.min;
public record EdgePageDocumentsMetadata(int encSize,
public record EdgePageDocumentsMetadata(int rank,
int encSize,
int topology,
int year,
int sets,
@ -16,9 +17,13 @@ public record EdgePageDocumentsMetadata(int encSize,
byte flags) {
public static final long RANK_MASK = 0xFFL;
public static final int RANK_SHIFT = 48;
public static final long ENCSIZE_MASK = 0xFFL;
public static final int ENCSIZE_SHIFT = 48;
public static final int ENCSIZE_SHIFT = 40;
public static final int ENCSIZE_MULTIPLIER = 50;
public static final long TOPOLOGY_MASK = 0xFFL;
public static final int TOPOLOGY_SHIFT = 32;
@ -39,7 +44,7 @@ public record EdgePageDocumentsMetadata(int encSize,
this(defaultValue());
}
public EdgePageDocumentsMetadata(int topology, int year, int sets, int quality, EnumSet<EdgePageDocumentFlags> flags) {
this(0, topology, year, sets, quality, encodeFlags(flags));
this(0, 0, topology, year, sets, quality, encodeFlags(flags));
}
public EdgePageDocumentsMetadata withSize(int size) {
@ -49,7 +54,7 @@ public record EdgePageDocumentsMetadata(int encSize,
final int encSize = (int) Math.min(ENCSIZE_MASK, Math.max(1, size / ENCSIZE_MULTIPLIER));
return new EdgePageDocumentsMetadata(encSize, topology, year, sets, quality, flags);
return new EdgePageDocumentsMetadata(rank, encSize, topology, year, sets, quality, flags);
}
private static byte encodeFlags(Set<EdgePageDocumentFlags> flags) {
@ -63,7 +68,8 @@ public record EdgePageDocumentsMetadata(int encSize,
}
public EdgePageDocumentsMetadata(long value) {
this( (int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
this( (int) ((value >>> RANK_SHIFT) & RANK_MASK),
(int) ((value >>> ENCSIZE_SHIFT) & ENCSIZE_MASK),
(int) ((value >>> TOPOLOGY_SHIFT) & TOPOLOGY_MASK),
(int) ((value >>> YEAR_SHIFT) & YEAR_MASK),
(int) ((value >>> SETS_SHIFT) & SETS_MASK),
@ -84,12 +90,13 @@ public record EdgePageDocumentsMetadata(int encSize,
ret |= min(YEAR_MASK, max(0, year)) << YEAR_SHIFT;
ret |= min(TOPOLOGY_MASK, max(0, topology)) << TOPOLOGY_SHIFT;
ret |= min(ENCSIZE_MASK, max(0, encSize)) << ENCSIZE_SHIFT;
ret |= min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
return ret;
}
public boolean isEmpty() {
return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0;
return encSize == 0 && topology == 0 && sets == 0 && quality == 0 && year == 0 && flags == 0 && rank == 0;
}
public static int decodeQuality(long encoded) {
@ -112,6 +119,12 @@ public record EdgePageDocumentsMetadata(int encSize,
return ENCSIZE_MULTIPLIER * (int) ((encoded >>> ENCSIZE_SHIFT) & ENCSIZE_MASK);
}
public static int decodeRank(long encoded) {
return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
}
public static long encodeRank(long encoded, int rank) {
return encoded | min(RANK_MASK, max(0, rank)) << RANK_SHIFT;
}
}
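
The hunk above carves an 8-bit rank field out of bits 48-55 of the packed document metadata word, moving the encoded-size field down to bits 40-47 to make room. A standalone sketch of the same packing arithmetic, using the RANK_MASK/RANK_SHIFT values from the diff (an illustrative class, not the project's record):

class RankPackingSketch {
    static final long RANK_MASK = 0xFFL;
    static final int RANK_SHIFT = 48;

    static long encodeRank(long encoded, int rank) {
        // clamp to 0..255 and place the value in bits 48..55
        long clamped = Math.min(RANK_MASK, Math.max(0, rank));
        return encoded | (clamped << RANK_SHIFT);
    }

    static int decodeRank(long encoded) {
        return (int) ((encoded >>> RANK_SHIFT) & RANK_MASK);
    }

    public static void main(String[] args) {
        long meta = encodeRank(0L, 17);
        System.out.println(decodeRank(meta)); // prints 17
    }
}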

View File

@ -0,0 +1,4 @@
package nu.marginalia.wmsa.edge.index.model;
public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
}

View File

@ -3,5 +3,10 @@ package nu.marginalia.wmsa.edge.index.model;
public enum QueryStrategy {
SENTENCE,
TOPIC,
REQUIRE_FIELD_SITE,
REQUIRE_FIELD_TITLE,
REQUIRE_FIELD_SUBJECT,
AUTO
}

View File

@ -0,0 +1,43 @@
package nu.marginalia.wmsa.edge.index.postings;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.lang.Math.max;
import static java.lang.Math.min;
public class DomainRankings {
private final Int2ShortOpenHashMap rankings;
private final int MAX_MEANINGFUL_RANK = 50_000;
private final int MAX_RANK_VALUE = 255;
private final int MIN_RANK_VALUE = 1;
private final double RANK_SCALING_FACTOR = (double) MAX_RANK_VALUE / MAX_MEANINGFUL_RANK;
public DomainRankings() {
rankings = new Int2ShortOpenHashMap();
}
public DomainRankings(Int2IntOpenHashMap values) {
rankings = new Int2ShortOpenHashMap(values.size());
values.forEach(this::putRanking);
}
private void putRanking(int domainId, int value) {
rankings.put(domainId, scaleRank(value));
}
private short scaleRank(int value) {
double rankScaled = RANK_SCALING_FACTOR * value;
return (short) min(MAX_RANK_VALUE, max(MIN_RANK_VALUE, rankScaled));
}
public int getRanking(int domainId) {
return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
}
public int size() {
return rankings.size();
}
}
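
DomainRankings above squeezes raw rank positions (meaningful up to 50_000) into a single byte between 1 and 255, with 255 doubling as the fallback for domains missing from the table. The same arithmetic as a standalone sketch:

class RankScalingSketch {
    // mirrors scaleRank in DomainRankings: 255/50_000 scaling, clamped to 1..255
    static short scaleRank(int value) {
        double scaled = (255.0 / 50_000) * value;
        return (short) Math.min(255, Math.max(1, scaled));
    }

    public static void main(String[] args) {
        System.out.println(scaleRank(0));       // 1   (best possible)
        System.out.println(scaleRank(25_000));  // 127 (middle of the meaningful range)
        System.out.println(scaleRank(60_000));  // 255 (clamped, same as an unknown domain)
    }
}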

View File

@ -7,6 +7,8 @@ import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordFlags;
import nu.marginalia.wmsa.edge.index.model.EdgePageWordMetadata;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.index.query.IndexQueryParams;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultKeywordScore;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@ -17,6 +19,7 @@ import java.util.Objects;
public class IndexResultValuator {
private final IndexMetadataService metadataService;
private final List<List<String>> searchTermVariants;
private final IndexQueryParams queryParams;
private final int[] termIdsAll;
private final TLongHashSet resultsWithPriorityTerms;
@ -24,9 +27,10 @@ public class IndexResultValuator {
private final TObjectIntHashMap<String> termToId = new TObjectIntHashMap<>(10, 0.75f, -1);
private final TermMetadata termMetadata;
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries) {
public IndexResultValuator(SearchIndexControl indexes, TLongList results, List<EdgeSearchSubquery> subqueries, IndexQueryParams queryParams) {
this.metadataService = new IndexMetadataService(indexes);
this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
this.queryParams = queryParams;
var lexiconReader = Objects.requireNonNull(indexes.getLexiconReader());
IntArrayList termIdsList = new IntArrayList();
@ -114,10 +118,15 @@ public class IndexResultValuator {
docMetadata,
resultsWithPriorityTerms.contains(searchResult.combinedId)
);
searchResult.scores.add(score);
setScore += score.termValue();
if (!filterRequired(metadata, queryParams.queryStrategy())) {
setScore += 1000;
}
if (termIdx == 0) {
setScore += score.documentValue();
}
@ -130,6 +139,19 @@ public class IndexResultValuator {
return setScore/setSize;
}
private boolean filterRequired(long metadata, QueryStrategy queryStrategy) {
if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) {
return EdgePageWordFlags.Site.isPresent(metadata);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) {
return EdgePageWordFlags.Subjects.isPresent(metadata);
}
else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) {
return EdgePageWordFlags.Title.isPresent(metadata);
}
return true;
}
private double calculateTermCoherencePenalty(int urlId, TObjectIntHashMap<String> termToId, List<String> termList) {
long maskDirectGenerous = ~0;
long maskDirectRaw = ~0;
@ -139,6 +161,9 @@ public class IndexResultValuator {
| EdgePageWordFlags.Subjects.asBit()
| EdgePageWordFlags.Synthetic.asBit();
int termCount = 0;
double tfIdfSum = 1.;
for (String term : termList) {
var meta = termMetadata.getTermMetadata(termToId.get(term), urlId);
long positions;
@ -156,18 +181,22 @@ public class IndexResultValuator {
maskDirectGenerous &= positions;
}
termCount++;
tfIdfSum += EdgePageWordMetadata.decodeTfidf(meta);
}
double avgTfIdf = termCount / tfIdfSum;
if (maskAdjacent == 0) {
return 40;
return Math.max(-2, 40 - 0.5 * avgTfIdf);
}
if (maskDirectGenerous == 0) {
return 20;
return Math.max(-1, 20 - 0.3 * avgTfIdf);
}
if (maskDirectRaw == 0) {
return 2;
return Math.max(-1, 15 - 0.2 * avgTfIdf);
}
return Long.numberOfTrailingZeros(maskDirectGenerous)/5. - Long.bitCount(maskDirectGenerous);

View File

@ -92,7 +92,8 @@ public class SearchIndex {
SearchIndexReader.IndexQueryBuilder query =
switch(params.queryStrategy()) {
case SENTENCE -> indexReader.findWordAsSentence(orderedIncludes);
case TOPIC -> indexReader.findWordAsTopic(orderedIncludes);
case TOPIC, REQUIRE_FIELD_SITE, REQUIRE_FIELD_TITLE, REQUIRE_FIELD_SUBJECT
-> indexReader.findWordAsTopic(orderedIncludes);
case AUTO -> indexReader.findWordTopicDynamicMode(orderedIncludes);
};

View File

@ -6,6 +6,7 @@ import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexiconReadOnlyView;
import nu.marginalia.wmsa.edge.index.postings.journal.writer.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexSearchSetsService;
import nu.marginalia.wmsa.edge.index.svc.EdgeOpsLockService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -26,13 +27,14 @@ public class SearchIndexControl {
@Inject
public SearchIndexControl(IndexServicesFactory servicesFactory,
EdgeOpsLockService opsLockService) {
EdgeOpsLockService opsLockService,
EdgeIndexSearchSetsService searchSetsService) {
this.servicesFactory = servicesFactory;
this.primaryIndexWriter = servicesFactory.getIndexWriter(0);
this.secondaryIndexWriter = servicesFactory.getIndexWriter(1);
index = servicesFactory.createIndexBucket();
index = servicesFactory.createIndexBucket(searchSetsService);
this.opsLockService = opsLockService;
}

View File

@ -3,6 +3,8 @@ package nu.marginalia.wmsa.edge.index.postings.forward;
import com.upserve.uppend.blobs.NativeIO;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.util.array.LongArray;
import nu.marginalia.wmsa.edge.index.model.EdgePageDocumentsMetadata;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReaderSingleFile;
import org.roaringbitmap.IntConsumer;
@ -18,26 +20,26 @@ import java.nio.file.Path;
import static nu.marginalia.wmsa.edge.index.postings.forward.ForwardIndexParameters.*;
public class ForwardIndexConverter {
private static final int RWF_BIN_SIZE = 10_000_000;
private final Path tmpFileDir;
private final File inputFile;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Path outputFileDocsId;
private final Path outputFileDocsData;
private final DomainRankings domainRankings;
public ForwardIndexConverter(Path tmpFileDir,
public ForwardIndexConverter(
File inputFile,
Path outputFileDocsId,
Path outputFileDocsData
Path outputFileDocsData,
DomainRankings domainRankings
) {
this.tmpFileDir = tmpFileDir;
this.inputFile = inputFile;
this.outputFileDocsId = outputFileDocsId;
this.outputFileDocsData = outputFileDocsData;
this.domainRankings = domainRankings;
}
public void convert() throws IOException {
@ -50,6 +52,8 @@ public class ForwardIndexConverter {
logger.info("Converting {} {}",inputFile, journalReader.fileHeader);
logger.info("Domain Rankings size = {}", domainRankings.size());
try {
LongArray docsFileId = getDocIds(outputFileDocsId, journalReader);
@ -68,7 +72,10 @@ public class ForwardIndexConverter {
journalReader.forEach(entry -> {
long entryOffset = (long) ENTRY_SIZE * docIdToIdx.get(entry.urlId());
docFileData.set(entryOffset + METADATA_OFFSET, entry.docMeta());
int ranking = domainRankings.getRanking(entry.domainId());
long meta = EdgePageDocumentsMetadata.encodeRank(entry.docMeta(), ranking);
docFileData.set(entryOffset + METADATA_OFFSET, meta);
docFileData.set(entryOffset + DOMAIN_OFFSET, entry.domainId());
});

View File

@ -29,20 +29,30 @@ public class ForwardIndexReader {
logger.info("Switching forward index");
ids = loadIds(idsFile);
data = loadData(dataFile);
}
private static TLongIntHashMap loadIds(Path idsFile) throws IOException {
var idsArray = LongArray.mmapRead(idsFile);
idsArray.advice(NativeIO.Advice.Sequential);
ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1);
// This hash table should be of the same size as the number of documents, so typically less than 1 Gb
idsArray.forEach(0, idsArray.size(), (pos, val) -> {
ids.put(val, (int) pos);
});
data = LongArray.mmapRead(dataFile);
return ids;
}
private static LongArray loadData(Path dataFile) throws IOException {
var data = LongArray.mmapRead(dataFile);
data.advice(NativeIO.Advice.Random);
return data;
}
private int idxForDoc(long docId) {
@ -55,6 +65,7 @@ public class ForwardIndexReader {
return data.get(ENTRY_SIZE * offset + METADATA_OFFSET);
}
public int getDomainId(long docId) {
long offset = idxForDoc(docId);
if (offset < 0) return 0;

View File

@ -16,7 +16,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
@Override
public boolean test(long docId) {
var post = forwardIndexReader.docPost(docId);
var post = forwardIndexReader.docPost(docId & 0xFFFF_FFFFL);
if (!validateDomain(post)) {
return false;
@ -33,6 +33,11 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
if (!validateSize(post)) {
return false;
}
if (!validateRank(post)) {
return false;
}
return true;
}
@ -51,6 +56,7 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
return limit.test(quality);
}
private boolean validateYear(ForwardIndexReader.DocPost post) {
if (params.year().type() == SpecificationLimitType.NONE)
return true;
@ -69,6 +75,15 @@ public class ParamMatchingQueryFilter implements QueryFilterStepIf {
return params.size().test(postVal);
}
private boolean validateRank(ForwardIndexReader.DocPost post) {
if (params.rank().type() == SpecificationLimitType.NONE)
return true;
int postVal = EdgePageDocumentsMetadata.decodeRank(post.meta());
return params.rank().test(postVal);
}
@Override
public double cost() {
return 32;

View File

@ -8,6 +8,7 @@ import nu.marginalia.util.array.functional.LongBinaryIOOperation;
import nu.marginalia.util.array.functional.LongIOTransformer;
import nu.marginalia.util.array.functional.LongTransformer;
import nu.marginalia.util.btree.BTreeWriter;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.postings.journal.model.SearchIndexJournalStatistics;
import nu.marginalia.wmsa.edge.index.postings.journal.reader.SearchIndexJournalReader;
@ -32,18 +33,22 @@ public class ReverseIndexConverter {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final SearchIndexJournalReaderSingleFile journalReader;
private final DomainRankings domainRankings;
private final Path outputFileWords;
private final Path outputFileDocs;
private final SortingContext sortingContext;
public ReverseIndexConverter(Path tmpFileDir,
SearchIndexJournalReaderSingleFile journalReader,
DomainRankings domainRankings,
Path outputFileWords,
Path outputFileDocs) {
this.tmpFileDir = tmpFileDir;
this.journalReader = journalReader;
this.domainRankings = domainRankings;
this.outputFileWords = outputFileWords;
this.outputFileDocs = outputFileDocs;
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
}
public void convert() throws IOException {
@ -56,7 +61,7 @@ public class ReverseIndexConverter {
final SearchIndexJournalStatistics statistics = journalReader.getStatistics();
final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
SortingContext sortingContext = new SortingContext(tmpFileDir, 64_000);
try {
final long wordsFileSize = statistics.highestWord() + 1;
@ -187,7 +192,7 @@ public class ReverseIndexConverter {
}
}
private static class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record>, AutoCloseable {
private class IntermediateIndexConstructor implements SearchIndexJournalReaderSingleFile.LongObjectConsumer<SearchIndexJournalEntry.Record>, AutoCloseable {
private final LongArray wordRangeEnds;
private final IntArray wordRangeOffset;
@ -205,12 +210,26 @@ public class ReverseIndexConverter {
@Override
public void accept(long docId, SearchIndexJournalEntry.Record record) {
final long urlId = docId & 0xFFFF_FFFFL;
final int wordId = record.wordId();
/* Encode the ID as
*
* 32 bits 32 bits
* [ ranking | url-id ]
*
* in order to get low-ranking documents to be considered first
* when sorting the items.
*/
int domainId = (int) (docId >>> 32);
long rankingId = (long) domainRankings.getRanking(domainId) << 32;
int urlId = (int) (docId & 0xFFFF_FFFFL);
long rankEncodedId = rankingId | urlId;
final int wordId = record.wordId();
long offset = startOfRange(wordId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), urlId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata());
}
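
The block comment above describes the new 64-bit document id layout: the domain's rank value occupies the upper 32 bits and the url id the lower 32, so an ascending sort of the ids visits documents from domains with low rank values (the better-ranked domains) first. A minimal sketch of the encoding, and of recovering the url id with the same 0xFFFF_FFFFL mask used elsewhere in this change:

class RankEncodedIdSketch {
    static long encode(int ranking, int urlId) {
        return ((long) ranking << 32) | (urlId & 0xFFFF_FFFFL);
    }

    static int urlId(long encoded) {
        return (int) (encoded & 0xFFFF_FFFFL);  // same mask the query filter applies
    }

    public static void main(String[] args) {
        long a = encode(1, 123);    // well-ranked domain
        long b = encode(200, 456);  // poorly-ranked domain
        System.out.println(a < b);      // true: a sorts before b
        System.out.println(urlId(a));   // 123
    }
}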

View File

@ -47,18 +47,6 @@ public class ReverseIndexPrioReader {
return new ReverseIndexEntrySource(createReaderNew(offset), ReverseIndexEntrySourceBehavior.DO_PREFER);
}
public int numDocuments(int wordId) {
if (wordId < 0)
return 0;
long offset = words.get(wordId);
if (offset < 0)
return 0;
return createReaderNew(offset).numEntries();
}
private BTreeReader createReaderNew(long offset) {
return new BTreeReader(documents, ReverseIndexParameters.bTreeContext, offset);
}

View File

@ -53,6 +53,11 @@ public class ReverseIndexReader {
}
public EntrySource documents(int wordId, ReverseIndexEntrySourceBehavior behavior) {
if (null == words) {
logger.warn("Reverse index is not ready, dropping query");
return new EmptyEntrySource();
}
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
long offset = words.get(wordId);

View File

@ -7,6 +7,7 @@ import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
public record IndexQueryParams(SpecificationLimit qualityLimit,
SpecificationLimit year,
SpecificationLimit size,
SpecificationLimit rank,
SearchSet searchSet,
QueryStrategy queryStrategy
)

View File

@ -1,21 +1,21 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TIntObjectHashMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import org.roaringbitmap.RoaringBitmap;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;
import java.util.function.IntToDoubleFunction;
import java.util.stream.IntStream;
import java.util.function.Supplier;
import static java.lang.Math.min;
public abstract class RankingAlgorithm {
protected final TIntObjectHashMap<RankingDomainData> domainsById = new TIntObjectHashMap<>();
@ -85,6 +85,10 @@ public abstract class RankingAlgorithm {
logger.info("Origin Domains: {}", originDomainIds.size());
}
public RankingDomainData getDomainData(int id) {
return domainsById.get(id);
}
public void addPeripheralNodes() {
int newNodesIdxCutoff = domainIdToIndex.size();
@ -133,29 +137,7 @@ public abstract class RankingAlgorithm {
return domainsById.size();
}
public RankVector pageRankVector() {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
RankVector newRank = createNewRankVector(rank);
double oldNorm = rank.norm();
double newNorm = newRank.norm();
double dNorm = oldNorm - newNorm ;
if (i < iter_max-1) {
adjustRankVector(newRank, dNorm, oldNorm);
}
rank = newRank;
}
return rank;
}
public RoaringBitmap pageRank(int resultCount) {
public <T> T pageRank(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
@ -174,10 +156,10 @@ public abstract class RankingAlgorithm {
}
return rank.getRanking(resultCount);
return rank.getRanking(resultCount, accumulatorP).get();
}
public RoaringBitmap pageRankWithPeripheralNodes(int resultCount) {
public <T> T pageRankWithPeripheralNodes(int resultCount, Supplier<RankingResultAccumulator<T>> accumulatorP) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
@ -201,32 +183,11 @@ public abstract class RankingAlgorithm {
logger.info("PRWPN iteration done");
return rank.getRanking(resultCount);
return rank.getRanking(resultCount, accumulatorP).get();
}
abstract void adjustRankVector(RankVector vector, double dNorm, double oldNorm);
public TIntList pageRank(IntToDoubleFunction weight, int resultCount) {
RankVector rank = new RankVector(1.d / domainsById.size());
int iter_max = 100;
for (int i = 0; i < iter_max; i++) {
RankVector newRank = createNewRankVector(rank);
double oldNorm = rank.norm();
double newNorm = newRank.norm();
double dNorm = oldNorm - newNorm ;
if (i < iter_max-1) {
adjustRankVector(newRank, dNorm, oldNorm);
}
rank = newRank;
}
return rank.getRanking(weight, resultCount);
}
abstract RankVector createNewRankVector(RankVector rank);
public boolean includeInRanking(RankingDomainData data) {
@ -245,9 +206,9 @@ public abstract class RankingAlgorithm {
public void setMaxKnownUrls(int maxKnownUrls) {
this.maxKnownUrls = maxKnownUrls;
}
public class RankVector {
private final double[] rank;
public RankVector(double defaultValue) {
rank = new double[domainIndexToId.size()];
if (defaultValue != 0.) {
@ -271,9 +232,8 @@ public abstract class RankingAlgorithm {
public double norm() {
double v = 0.;
for (int i = 0; i < rank.length; i++) {
if (rank[i] > 0) { v+=rank[i]; }
else { v -= rank[i]; }
for (double value : rank) {
v += Math.abs(value);
}
return v;
}
@ -281,74 +241,39 @@ public abstract class RankingAlgorithm {
public double norm(RankVector other) {
double v = 0.;
for (int i = 0; i < rank.length; i++) {
double dv = rank[i] - other.get(i);
if (dv > 0) { v+=dv; }
else { v -= dv; }
v += Math.abs(rank[i] - other.get(i));
}
return v;
}
public TIntList getRanking(IntToDoubleFunction other, int numResults) {
TIntArrayList list = new TIntArrayList(numResults);
public <T> RankingResultAccumulator<T> getRanking(int numResults, Supplier<RankingResultAccumulator<T>> accumulatorP) {
Comparator<Integer> comparator = Comparator.comparing(i -> Math.sqrt(other.applyAsDouble(domainIdToIndex.get(i)) * rank[i]));
IntStream.range(0, rank.length)
.boxed()
.sorted(comparator.reversed())
.map(domainIndexToId::get)
.limit(numResults)
.forEach(list::add);
return list;
}
public RoaringBitmap getRanking(int numResults) {
if (numResults < 0) {
numResults = domainIdToIndex.size();
}
if (numResults >= rank.length) {
numResults = rank.length;
}
numResults = min(numResults, min(domainIdToIndex.size(), rank.length));
RoaringBitmap list = new RoaringBitmap();
int[] nodes = sortOrder(rank);
var accumulator = accumulatorP.get();
int[] nodes = new int[rank.length];
Arrays.setAll(nodes, i->i);
IntComparator comp = (i,j) -> (int) Math.signum(rank[j] - rank[i]);
IntArrays.quickSort(nodes, comp);
int i;
for (i = 0; i < numResults; i++) {
for (int i = 0; i < numResults; i++) {
int id = domainIndexToId.get(nodes[i]);
if (includeInRanking(domainsById.get(id)))
list.add(id);
accumulator.add(id, i);
}
for (; i < nodes.length && domainsById.size() < numResults; i++) {
int id = domainIndexToId.get(nodes[i]);
return accumulator;
}
private static int[] sortOrder(double[] values) {
if (includeInRanking(domainsById.get(id)))
list.add(id);
}
int[] ret = new int[values.length];
Arrays.setAll(ret, i->i);
IntArrays.quickSort(ret, (i,j) -> (int) Math.signum(values[j] - values[i]));
return list;
return ret;
}
public void incrementAll(double v) {
for (int i = 0; i < rank.length; i++) {
rank[i]+=v;
}
}
int size() {
return domainsById.size();
}
}
}

View File

@ -1,10 +1,12 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
public class BetterReversePageRank extends RankingAlgorithm {
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
public class ReversePageRank extends RankingAlgorithm {
public BetterReversePageRank(RankingDomainFetcher domains, String... origins) {
public ReversePageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@ -20,8 +22,6 @@ public class BetterReversePageRank extends RankingAlgorithm {
double newRankValue = 0;
if (links != null && links.size() > 0) {
for (int j = 0; j < links.size(); j++) {
var revLinks = linkDataDest2Src[links.getQuick(j)];
newRankValue += rank.get(links.getQuick(j)) / revLinks.size();

View File

@ -1,9 +1,11 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking;
public class BetterStandardPageRank extends RankingAlgorithm {
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
public BetterStandardPageRank(RankingDomainFetcher domains, String... origins) {
public class StandardPageRank extends RankingAlgorithm {
public StandardPageRank(RankingDomainFetcher domains, String... origins) {
super(domains, origins);
}
@ -38,8 +40,7 @@ public class BetterStandardPageRank extends RankingAlgorithm {
@Override
void adjustRankVector(RankVector vector, double dNorm, double oldNorm) {
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() /* dNorm/originDomainIds.size() */ ));
// vector.incrementAll(0.14*dNorm/vector.size());
originDomainIds.forEach(id -> vector.increment(id, 0.15 / originDomainIds.size() ));
}
}

View File

@ -0,0 +1,6 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
public interface RankingResultAccumulator<T> {
void add(int domainId, int rank);
T get();
}

View File

@ -0,0 +1,17 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
import org.roaringbitmap.RoaringBitmap;
public class RankingResultBitSetAccumulator implements RankingResultAccumulator<RoaringBitmap> {
private final RoaringBitmap result = new RoaringBitmap();
@Override
public void add(int domainId, int rank) {
result.add(domainId);
}
@Override
public RoaringBitmap get() {
return result;
}
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
public class RankingResultHashMapAccumulator implements RankingResultAccumulator<Int2IntOpenHashMap> {
private final Int2IntOpenHashMap result;
public RankingResultHashMapAccumulator(int size) {
result = new Int2IntOpenHashMap(size);
}
@Override
public void add(int domainId, int rank) {
result.put(domainId, rank);
}
@Override
public Int2IntOpenHashMap get() {
return result;
}
}

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.edge.index.ranking.accumulator;
import gnu.trove.list.array.TIntArrayList;
public class RankingResultListAccumulator implements RankingResultAccumulator<TIntArrayList> {
private final TIntArrayList result;
public RankingResultListAccumulator(int size) {
result = new TIntArrayList(size);
}
public RankingResultListAccumulator() {
result = new TIntArrayList(10_000);
}
@Override
public void add(int domainId, int rank) {
result.add(domainId);
}
@Override
public TIntArrayList get() {
return result;
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking.data;
import lombok.AllArgsConstructor;
import lombok.Data;
@ -10,7 +10,7 @@ public class RankingDomainData {
public final int id;
public final String name;
private int alias;
private EdgeDomainIndexingState state;
public EdgeDomainIndexingState state;
public final int knownUrls;
public int resolveAlias() {

View File

@ -1,6 +1,7 @@
package nu.marginalia.util.ranking;
package nu.marginalia.wmsa.edge.index.ranking.data;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
@ -11,12 +12,13 @@ import java.sql.SQLException;
import java.util.function.Consumer;
import java.util.function.IntConsumer;
@Singleton
public class RankingDomainFetcher {
private final HikariDataSource dataSource;
private final EdgeDomainBlacklistImpl blacklist;
private final Logger logger = LoggerFactory.getLogger(getClass());
protected final HikariDataSource dataSource;
protected final EdgeDomainBlacklistImpl blacklist;
protected final Logger logger = LoggerFactory.getLogger(getClass());
private final boolean getNames = false;
protected boolean getNames = false;
@Inject
public RankingDomainFetcher(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
@ -24,6 +26,10 @@ public class RankingDomainFetcher {
this.blacklist = blacklist;
}
public void retainNames() {
this.getNames = true;
}
public void getDomains(Consumer<RankingDomainData> consumer) {
String query;
if (getNames) {
@ -49,14 +55,19 @@ public class RankingDomainFetcher {
getDomains(query, consumer);
}
private void getDomains(String query, Consumer<RankingDomainData> consumer) {
protected void getDomains(String query, Consumer<RankingDomainData> consumer) {
try (var conn = dataSource.getConnection(); var stmt = conn.prepareStatement(query)) {
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int id = rsp.getInt(1);
if (!blacklist.isBlacklisted(id)) {
consumer.accept(new RankingDomainData(id, rsp.getString(2), rsp.getInt(3), EdgeDomainIndexingState.valueOf(rsp.getString(4)), rsp.getInt(5)));
consumer.accept(
new RankingDomainData(id,
rsp.getString(2),
rsp.getInt(3),
EdgeDomainIndexingState.valueOf(rsp.getString(4)),
rsp.getInt(5)));
}
}
}

View File

@ -0,0 +1,103 @@
package nu.marginalia.wmsa.edge.index.ranking.data;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.function.Consumer;
@Singleton
public class RankingDomainFetcherForSimilarityData extends RankingDomainFetcher {
final boolean hasData;
@Inject
public RankingDomainFetcherForSimilarityData(HikariDataSource dataSource, EdgeDomainBlacklistImpl blacklist) {
super(dataSource, blacklist);
hasData = isDomainNeighborTablePopulated(dataSource);
}
private static boolean isDomainNeighborTablePopulated(HikariDataSource dataSource) {
try (var conn = dataSource.getConnection();
var stmt = conn.createStatement();
var rs = stmt.executeQuery("SELECT DOMAIN_ID FROM EC_DOMAIN_NEIGHBORS_2 LIMIT 1")) {
return rs.next();
}
catch (SQLException ex) {
LoggerFactory
.getLogger(RankingDomainFetcherForSimilarityData.class)
.error("Failed to get count from EC_DOMAIN_NEIGHBORS_2", ex);
return false;
}
}
public boolean hasData() {
return hasData;
}
public void eachDomainLink(DomainLinkConsumer consumer) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS FROM EC_DOMAIN_NEIGHBORS_2"))
{
stmt.setFetchSize(10000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
int src = rsp.getInt(1);
int dst = rsp.getInt(2);
// these "links" are bidirectional
consumer.accept(src, dst);
consumer.accept(dst, src);
}
}
catch (SQLException ex) {
logger.error("Failed to fetch domain links", ex);
}
}
public void getDomains(Consumer<RankingDomainData> consumer) {
// String query =
// """
// SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
// FROM EC_DOMAIN
// LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
// INNER JOIN EC_DOMAIN_LINK ON DEST_DOMAIN_ID=EC_DOMAIN.ID
// WHERE SOURCE_DOMAIN_ID!=DEST_DOMAIN_ID
// GROUP BY EC_DOMAIN.ID
// HAVING COUNT(SOURCE_DOMAIN_ID)>5
// """;
String query;
if (getNames) {
query =
"""
SELECT EC_DOMAIN.ID,DOMAIN_NAME,DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
}
else {
query =
"""
SELECT EC_DOMAIN.ID,"",DOMAIN_ALIAS,STATE,COALESCE(KNOWN_URLS, 0)
FROM EC_DOMAIN
INNER JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
GROUP BY EC_DOMAIN.ID
""";
}
getDomains(query, consumer);
}
public void getPeripheralDomains(Consumer<RankingDomainData> consumer) {
// This is not relevant for this variant of pagerank since it is bidirectional
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking.old;
package nu.marginalia.wmsa.edge.index.ranking.old;
import com.zaxxer.hikari.HikariDataSource;

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking.old;
package nu.marginalia.wmsa.edge.index.ranking.old;
import com.zaxxer.hikari.HikariDataSource;
@ -125,7 +125,6 @@ public class StandardPageRank {
final TIntArrayList empty = new TIntArrayList();
double rankNorm = rank.norm();
RankVector newRank = new RankVector(0);
for (DomainData domain : domains.valueCollection()) {
@ -176,8 +175,6 @@ public class StandardPageRank {
}
});
}
TIntHashSet deadEnds = new TIntHashSet(domains.size());
}
private class RankVector {

View File

@ -1,43 +1,30 @@
package nu.marginalia.util.ranking.tool;
package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
public class UpdateDomainRanksTool {
public class CreateBrowseDomainRanksTool {
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
private static final Logger logger = LoggerFactory.getLogger(CreateBrowseDomainRanksTool.class);
public Set<String> originDomains = new HashSet<>();
public Set<Integer> originDomainIds = new HashSet<>();
public final long domainIdMax = -1;
public int domainCount;
private volatile static int rankMax;
public int maxId() {
return (int) domainIdMax;
}
public int domainCount() {
return domainCount;
}
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
volatile static boolean running = true;
@SneakyThrows
public static void main(String... args) {
org.mariadb.jdbc.Driver driver = new Driver();
Driver driver = new Driver();
var conn = new DatabaseModule().provideConnection();
long start = System.currentTimeMillis();
@ -45,20 +32,21 @@ public class UpdateDomainRanksTool {
logger.info("Ranking");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var spr = new BuggyStandardPageRank(domains, "memex.marginalia.nu");
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new StandardPageRank(domains, args);
rankMax = spr.size()*2;
uploader.start();
var rankData = spr.pageRankWithPeripheralNodes(rankMax);
for (int i : rankData) {
var rankData = rpr.pageRankWithPeripheralNodes(1000, RankingResultListAccumulator::new);
rankData.forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return true;
});
long end = System.currentTimeMillis();
running = false;
@ -68,24 +56,14 @@ public class UpdateDomainRanksTool {
}
public static void uploadThread(HikariDataSource dataSource) {
int i = 0;
try (var conn = dataSource.getConnection()) {
logger.info("Resetting rank");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=1")) {
stmt.executeUpdate();
}
logger.info("Updating ranks");
try (var stmt = conn.prepareStatement("UPDATE EC_DOMAIN SET RANK=? WHERE ID=?")) {
try (var stmt = conn.prepareStatement("INSERT IGNORE INTO EC_RANDOM_DOMAINS(DOMAIN_SET, DOMAIN_ID) VALUES (3, ?)")) {
while (running || (!running && !uploadQueue.isEmpty())) {
var job = uploadQueue.take();
stmt.setDouble(1, i++ / (double) rankMax);
stmt.setInt(2, job);
stmt.setInt(1, job);
stmt.executeUpdate();
}
}
} catch (SQLException | InterruptedException throwables) {
throwables.printStackTrace();
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.util.ranking.tool;
package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
@ -10,9 +10,9 @@ import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntComparator;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.RankingAlgorithm;
import nu.marginalia.util.ranking.RankingDomainData;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.RankingAlgorithm;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainData;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import org.jetbrains.annotations.NotNull;
@ -33,8 +33,6 @@ public class PerusePageRankV2 {
TIntArrayList[] linkDataSrc2Dest;
TIntArrayList[] linkDataDest2Src;
private static final boolean getNames = true;
private final Logger logger = LoggerFactory.getLogger(getClass());
static final LinkedBlockingQueue<LinkAdjacencies> uploadQueue = new LinkedBlockingQueue<>(10);

View File

@ -0,0 +1,67 @@
package nu.marginalia.wmsa.edge.index.ranking.tool;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
public class PrintDomainRanksTool {
private static final Logger logger = LoggerFactory.getLogger(PrintDomainRanksTool.class);
private volatile static int rankMax;
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
volatile static boolean running = true;
@SneakyThrows
public static void main(String... args) {
Driver driver = new Driver();
var conn = new DatabaseModule().provideConnection();
long start = System.currentTimeMillis();
logger.info("Ranking");
var ds = new DatabaseModule().provideConnection();
RankingDomainFetcher domains;
if (Boolean.getBoolean("use-link-data")) {
domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
domains.retainNames();
}
else {
domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
domains.retainNames();
}
var rpr = new StandardPageRank(domains, args);
rankMax = rpr.size();
var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
AtomicInteger cnt = new AtomicInteger();
rankData.forEach(i -> {
var data = rpr.getDomainData(i);
System.out.printf("%d %s %s\n", cnt.getAndIncrement(), data.name, data.state);
return true;
});
long end = System.currentTimeMillis();
running = false;
logger.info("Done in {}", (end - start)/1000.0);
}
}
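
Note: PrintDomainRanksTool relies on the new accumulator-based pageRankWithPeripheralNodes API, which this PR also exercises with RankingResultBitSetAccumulator and RankingResultHashMapAccumulator elsewhere. The accumulator interface itself is not among the shown hunks; judging from the call sites (a supplier is passed in, and the result is iterated with forEach here, handed to RankingSearchSet as a RoaringBitmap, or to DomainRankings as a map), its shape is presumably something like the sketch below. Names and signatures are inferred, not quoted from the source.

    // Presumed contract behind RankingResultListAccumulator and friends (a sketch).
    public interface RankingResultAccumulator<T> {
        void add(int domainId, int rank); // invoked once per domain as the ranking converges
        T get();                          // the finished result: a list, bit set, or map
    }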

View File

@ -1,11 +1,12 @@
package nu.marginalia.util.ranking.tool;
package nu.marginalia.wmsa.edge.index.ranking.tool;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultListAccumulator;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.dbcommon.EdgeDomainBlacklistImpl;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import org.mariadb.jdbc.Driver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -13,12 +14,10 @@ import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.util.concurrent.LinkedBlockingQueue;
public class UpdateDomainRanksTool2 {
public class UpdateDomainRanksTool {
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool2.class);
private static final Logger logger = LoggerFactory.getLogger(UpdateDomainRanksTool.class);
public final long domainIdMax = -1;
public int domainCount;
private volatile static int rankMax;
static final LinkedBlockingQueue<Integer> uploadQueue = new LinkedBlockingQueue<>(10);
@ -34,21 +33,22 @@ public class UpdateDomainRanksTool2 {
logger.info("Ranking");
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new BetterReversePageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com", "%neocities.org");
var domains = new RankingDomainFetcherForSimilarityData(ds, new EdgeDomainBlacklistImpl(ds));
var rpr = new StandardPageRank(domains, "memex.marginalia.nu", "bikobatanari.art", "sadgrl.online", "wiki.xxiivv.com");
var rankVector = rpr.pageRankVector();
rankMax = rpr.size();
uploader.start();
var rankData = rpr.pageRankWithPeripheralNodes(rankMax);
for (int i : rankData) {
var rankData = rpr.pageRankWithPeripheralNodes(rankMax, RankingResultListAccumulator::new);
rankData.forEach(i -> {
try {
uploadQueue.put(i);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
return true;
});
long end = System.currentTimeMillis();
running = false;

View File

@ -5,7 +5,7 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import io.prometheus.client.Histogram;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.postings.SearchIndexControl;
import nu.marginalia.wmsa.edge.index.query.IndexResultDomainDeduplicator;
@ -101,7 +101,7 @@ public class EdgeIndexDomainQueryService {
private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.KeywordListChunker;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
@ -51,7 +51,7 @@ public class EdgeIndexLexiconService {
final int wordId = lr.get(word);
if (DictionaryHashMap.NO_VALUE == wordId) {
if (OffHeapDictionaryHashMap.NO_VALUE == wordId) {
response.status(404);
return "";
}
@ -110,7 +110,7 @@ public class EdgeIndexLexiconService {
String word = words[i];
long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
if (id != OffHeapDictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
ids[putIdx++] = meta[i];
}
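
Note: the lexicon service packs each keyword as two consecutive longs, the word id followed by its metadata, into one flat array. A consumer of that layout walks it in steps of two, roughly as below; the reader shown here is illustrative, since the index-writer side is not part of this diff.

    // Sketch: iterating the interleaved (wordId, metadata) pairs built above.
    static void forEachKeyword(long[] ids, int len) {
        for (int i = 0; i + 1 < len; i += 2) {
            long wordId   = ids[i];
            long metadata = ids[i + 1];
            // ... hand the pair to the index writer
            System.out.printf("word=%d meta=%x%n", wordId, metadata);
        }
    }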

View File

@ -12,7 +12,7 @@ import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.util.array.buffer.LongQueryBuffer;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.util.dict.OffHeapDictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.index.postings.EdgeIndexQuerySearchTerms;
import nu.marginalia.wmsa.edge.index.postings.IndexResultValuator;
@ -115,11 +115,13 @@ public class EdgeIndexQueryService {
TLongHashSet consideredUrlIds;
public SearchQuery(EdgeSearchSpecification specsSet) {
this.fetchSize = specsSet.fetchSize;
this.budget = new IndexSearchBudget(specsSet.timeoutMs);
var limits = specsSet.queryLimits;
this.fetchSize = limits.fetchSize();
this.budget = new IndexSearchBudget(limits.timeoutMs());
this.subqueries = specsSet.subqueries;
this.limitByDomain = specsSet.limitByDomain;
this.limitTotal = specsSet.limitTotal;
this.limitByDomain = limits.resultsByDomain();
this.limitTotal = limits.resultsTotal();
this.consideredUrlIds = new TLongHashSet(fetchSize * 4);
@ -127,6 +129,7 @@ public class EdgeIndexQueryService {
specsSet.quality,
specsSet.year,
specsSet.size,
specsSet.rank,
getSearchSet(specsSet),
specsSet.queryStrategy);
}
@ -151,7 +154,7 @@ public class EdgeIndexQueryService {
}
}
final var evaluator = new IndexResultValuator(indexes, results, subqueries);
final var evaluator = new IndexResultValuator(indexes, results, subqueries, queryParams);
ArrayList<EdgeSearchResultItem> items = new ArrayList<>(results.size());
ArrayList<EdgeSearchResultItem> refusedItems = new ArrayList<>(results.size());
@ -293,7 +296,7 @@ public class EdgeIndexQueryService {
private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
if (ret == OffHeapDictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);
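
Note: the loose limitByDomain/limitTotal/timeoutMs/fetchSize fields of EdgeSearchSpecification are consolidated into a single QueryLimits value here. Its definition is not among the shown files, but the constructor call in QueryFactory, new QueryLimits(domainLimit, 100, 250, 4096), combined with the accessors used above suggests a record along these lines (an inferred sketch, not the verbatim source):

    package nu.marginalia.wmsa.edge.index.model;

    // Field order inferred from the QueryFactory call site and the accessors
    // fetchSize(), timeoutMs(), resultsByDomain() and resultsTotal() used in
    // EdgeIndexQueryService.
    public record QueryLimits(int resultsByDomain, int resultsTotal, int timeoutMs, int fetchSize) {
    }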

View File

@ -2,20 +2,20 @@ package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import lombok.SneakyThrows;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.BetterStandardPageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.ReversePageRank;
import nu.marginalia.wmsa.edge.index.ranking.StandardPageRank;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultHashMapAccumulator;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcher;
import nu.marginalia.wmsa.edge.index.ranking.accumulator.RankingResultBitSetAccumulator;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.model.RankingSettings;
import nu.marginalia.wmsa.edge.index.postings.DomainRankings;
import nu.marginalia.wmsa.edge.index.ranking.data.RankingDomainFetcherForSimilarityData;
import nu.marginalia.wmsa.edge.index.svc.searchset.RankingSearchSet;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSet;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetAny;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
import org.roaringbitmap.RoaringBitmap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -23,137 +23,47 @@ import java.io.IOException;
@Singleton
public class EdgeIndexSearchSetsService {
private final HikariDataSource dataSource;
private RankingDomainFetcher rankingDomains;
private final RankingSettings rankingSettings;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final RankingDomainFetcher rankingDomains;
private final RankingDomainFetcher similarityDomains;
private final RankingSettings rankingSettings;
private final SearchSet anySet = new SearchSetAny();
// Below are binary indices that are used to constrain a search
private volatile RankingSearchSet retroSet;
private volatile RankingSearchSet smallWebSet;
private volatile RankingSearchSet academiaSet;
private final SearchSet anySet = new SearchSetAny();
// The ranking value of the domains used in sorting the domains
private volatile DomainRankings domainRankings = new DomainRankings();
@Inject
public EdgeIndexSearchSetsService(HikariDataSource dataSource,
RankingDomainFetcher rankingDomains,
public EdgeIndexSearchSetsService(RankingDomainFetcher rankingDomains,
RankingDomainFetcherForSimilarityData similarityDomains,
RankingSettings rankingSettings,
IndexServicesFactory servicesFactory) throws IOException {
this.dataSource = dataSource;
this.rankingDomains = rankingDomains;
if (similarityDomains.hasData()) {
this.similarityDomains = similarityDomains;
}
else {
// on test environments the cosine similarity graph may not be present
logger.info("Domain similarity is not present, falling back on link graph");
this.similarityDomains = rankingDomains;
}
this.rankingSettings = rankingSettings;
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, servicesFactory.getSearchSetsBase().resolve("small-web.dat"));
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, servicesFactory.getSearchSetsBase().resolve("academia.dat"));
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, servicesFactory.getSearchSetsBase().resolve("retro.dat"));
logger.info("SearchIndexDao ranking settings = {}", rankingSettings);
}
public void recalculateAll() {
updateAcademiaDomains();
updateRetroDomains();
updateSmallWebDomains();
}
@SneakyThrows
public RoaringBitmap goodUrls() {
RoaringBitmap domains = new RoaringBitmap();
RoaringBitmap urls = new RoaringBitmap();
try (var connection = dataSource.getConnection()) {
try (var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_ALIAS IS NULL AND IS_ALIVE")) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
domains.add(rsp.getInt(1));
}
}
// For some reason, doing this "INNER JOIN" in Java is significantly faster than doing it in SQL
try (var stmt = connection.prepareStatement("SELECT ID,DOMAIN_ID FROM EC_URL WHERE VISITED AND EC_URL.STATE='OK'")) {
stmt.setFetchSize(10_000);
var rsp = stmt.executeQuery();
while (rsp.next()) {
if (domains.contains(rsp.getInt(2))) {
urls.add(rsp.getInt(1));
}
}
}
}
return urls;
}
@SneakyThrows
public void updateRetroDomains() {
var spr = new BetterStandardPageRank(rankingDomains,rankingSettings.retro.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2);
synchronized (this) {
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
retroSet.write();
}
}
@SneakyThrows
public void updateSmallWebDomains() {
var rpr = new BetterReversePageRank(rankingDomains, rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750);
var data = rpr.pageRankWithPeripheralNodes(rpr.size());
synchronized (this) {
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
smallWebSet.write();
}
}
@SneakyThrows
public void updateAcademiaDomains() {
var spr = new BetterStandardPageRank(rankingDomains, rankingSettings.academia.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size()/2);
synchronized (this) {
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
academiaSet.write();
}
}
@SneakyThrows
public TIntList getStandardDomains() {
TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement(
"""
SELECT ID FROM EC_DOMAIN
WHERE INDEXED>0
AND STATE='ACTIVE'
AND DOMAIN_ALIAS IS NULL
ORDER BY ID ASC
""");
) {
var rs = stmt.executeQuery();
while (rs.next()) {
results.add(rs.getInt(1));
}
}
return results;
}
@SneakyThrows
public TIntList getSpecialDomains() {
TIntArrayList results = new TIntArrayList();
try (var connection = dataSource.getConnection();
var stmt = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE STATE='SPECIAL'")
) {
var rs = stmt.executeQuery();
while (rs.next()) {
results.add(rs.getInt(1));
}
}
return results;
public DomainRankings getDomainRankings() {
return domainRankings;
}
public SearchSet getSearchSetByName(SearchSetIdentifier searchSetIdentifier) {
@ -167,4 +77,54 @@ public class EdgeIndexSearchSetsService {
case SMALLWEB -> smallWebSet;
};
}
public void recalculateAll() {
updateAcademiaDomainsSet();
updateRetroDomainsSet();
updateSmallWebDomainsSet();
updateDomainRankings();
}
private void updateDomainRankings() {
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
var ranks = spr.pageRankWithPeripheralNodes(spr.size() / 2, () -> new RankingResultHashMapAccumulator(100_000));
synchronized (this) {
domainRankings = new DomainRankings(ranks);
}
}
@SneakyThrows
public void updateRetroDomainsSet() {
var spr = new StandardPageRank(similarityDomains, rankingSettings.retro.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);
synchronized (this) {
retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
retroSet.write();
}
}
@SneakyThrows
public void updateSmallWebDomainsSet() {
var rpr = new ReversePageRank(similarityDomains, rankingSettings.small.toArray(String[]::new));
rpr.setMaxKnownUrls(750);
var data = rpr.pageRankWithPeripheralNodes(rpr.size(), RankingResultBitSetAccumulator::new);
synchronized (this) {
smallWebSet = new RankingSearchSet(SearchSetIdentifier.SMALLWEB, smallWebSet.source, data);
smallWebSet.write();
}
}
@SneakyThrows
public void updateAcademiaDomainsSet() {
var spr = new StandardPageRank(similarityDomains, rankingSettings.academia.toArray(String[]::new));
var data = spr.pageRankWithPeripheralNodes(spr.size()/2, RankingResultBitSetAccumulator::new);
synchronized (this) {
academiaSet = new RankingSearchSet(SearchSetIdentifier.ACADEMIA, academiaSet.source, data);
academiaSet.write();
}
}
}
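
Note: the update methods and updateDomainRankings all follow the same compute-then-swap shape: the expensive PageRank pass runs outside any lock, and only the field assignment happens under synchronized, so concurrent queries keep seeing the previous, complete set while a new one is being built. Reduced to its essentials (a sketch, with one set standing in for all of them):

    class SwapSketch {
        private volatile RankingSearchSet retroSet; // readers dereference this without locking

        void update(StandardPageRank spr) throws IOException {
            // long-running work, no lock held
            var data = spr.pageRankWithPeripheralNodes(spr.size() / 2, RankingResultBitSetAccumulator::new);

            synchronized (this) {
                retroSet = new RankingSearchSet(SearchSetIdentifier.RETRO, retroSet.source, data);
                retroSet.write();
            }
        }
    }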

View File

@ -9,21 +9,37 @@ import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
/** A serializable bit map of domains
*
* @see SearchSetIdentifier
*
* */
public class RankingSearchSet implements SearchSet {
private final RoaringBitmap set;
public final SearchSetIdentifier identifier;
public final Path source;
public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
this.identifier = identifier;
this.source = source;
this.set = set;
}
public RankingSearchSet(SearchSetIdentifier identifier, Path source) throws IOException {
this.identifier = identifier;
this.source = source;
set = new RoaringBitmap();
if (!Files.exists(source)) {
return;
set = new RoaringBitmap();
}
else {
set = load(source);
}
}
private static RoaringBitmap load(Path source) throws IOException {
var set = new RoaringBitmap();
try (var ds = new DataInputStream(Files.newInputStream(source, StandardOpenOption.READ))) {
for (;;) {
try {
@ -32,12 +48,7 @@ public class RankingSearchSet implements SearchSet {
catch (IOException ex) { break; }
}
}
}
public RankingSearchSet(SearchSetIdentifier identifier, Path source, RoaringBitmap set) {
this.identifier = identifier;
this.source = source;
this.set = set;
return set;
}
@Override
@ -46,7 +57,11 @@ public class RankingSearchSet implements SearchSet {
}
public void write() throws IOException {
try (var ds = new DataOutputStream(Files.newOutputStream(source, StandardOpenOption.WRITE, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))) {
try (var ds = new DataOutputStream(Files.newOutputStream(source,
StandardOpenOption.WRITE,
StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING)))
{
for (var iter = set.getIntIterator(); iter.hasNext();) {
ds.writeInt(iter.next());
}
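
Note: as a usage sketch (a fragment; the file name is illustrative and the usual Path and RoaringBitmap imports are assumed), a set can be built from a RoaringBitmap, persisted as a flat stream of ints, and later reloaded through the load-on-construct path added above:

    RoaringBitmap data = new RoaringBitmap();
    data.add(14);
    data.add(2711);

    var retro = new RankingSearchSet(SearchSetIdentifier.RETRO, Path.of("retro.dat"), data);
    retro.write(); // writes each id with DataOutputStream.writeInt

    var reloaded = new RankingSearchSet(SearchSetIdentifier.RETRO, Path.of("retro.dat"));
    // reloaded.contains(2711) == true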

View File

@ -1,5 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.searchset;
import nu.marginalia.wmsa.edge.search.model.EdgeSearchProfile;
/** Identifies a RankingSearchSet, associated with an EdgeSearchProfile
*
* @see RankingSearchSet
* @see EdgeSearchProfile
* */
public enum SearchSetIdentifier {
NONE,
RETRO,

View File

@ -13,8 +13,8 @@ public class SmallSearchSet implements SearchSet {
}
@Override
public boolean contains(int urlId) {
return entries.contains(urlId);
public boolean contains(int domainId) {
return entries.contains(domainId);
}
public String toString() {

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.integration.stackoverflow;
import com.google.inject.Inject;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;

View File

@ -1,7 +1,7 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.KeywordMetadata;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;

View File

@ -11,8 +11,6 @@ import java.util.regex.Pattern;
@Getter @Setter @Builder
public class EdgeDomain {
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
@Nonnull
public final String subDomain;
@ -27,7 +25,7 @@ public class EdgeDomain {
var dot = host.lastIndexOf('.');
if (dot < 0 || ipPatternTest.test(host)) { // IPV6 >.>
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
subDomain = "";
domain = host;
}
@ -38,7 +36,7 @@ public class EdgeDomain {
domain = host;
}
else {
if (govListTest.test(host))
if (looksLikeGovTld(host))
{ // Capture .ac.jp, .co.uk
int dot3 = host.substring(0, dot2).lastIndexOf('.');
if (dot3 >= 0) {
@ -59,6 +57,35 @@ public class EdgeDomain {
}
}
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
private boolean looksLikeGovTld(String host) {
if (host.length() < 8)
return false;
int cnt = 0;
for (int i = host.length() - 7; i < host.length(); i++) {
if (host.charAt(i) == '.')
cnt++;
}
return cnt >= 2 && govListTest.test(host);
}
private static final Predicate<String> ipPatternTest = Pattern.compile("[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}").asMatchPredicate();
private boolean looksLikeAnIp(String host) {
if (host.length() < 7)
return false;
char firstChar = host.charAt(0);
int lastChar = host.charAt(host.length() - 1);
return Character.isDigit(firstChar)
&& Character.isDigit(lastChar)
&& ipPatternTest.test(host);
}
public EdgeUrl toRootUrl() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
return new EdgeUrl("http", this, null, "/", null);
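
Note: the two new helpers exist to keep the regexes off the hot path: a cheap length-and-dot-count check (for the TLD case) or first/last-digit check (for the IP case) rejects most hosts before the pattern is ever evaluated. Restated standalone with a couple of worked inputs (the method below is a simplified sketch, not the class's own code):

    static boolean looksLikeGovTld(String host) {
        if (host.length() < 8)
            return false;
        int cnt = 0;
        for (int i = host.length() - 7; i < host.length(); i++) {
            if (host.charAt(i) == '.')
                cnt++;
        }
        // the regex is only consulted when the tail has at least two dots
        return cnt >= 2 && host.matches(".*\\.(ac|co|org|gov|edu|com)\\.[a-z]{2}");
    }
    // looksLikeGovTld("example.co.uk") -> tail "e.co.uk" has two dots, regex matches -> true
    // looksLikeGovTld("example.com")   -> tail "ple.com" has one dot, regex never runs -> false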

View File

@ -24,6 +24,11 @@ public record EdgeSearchResultKeywordScore(int set,
sum += 20;
}
int rank = EdgePageDocumentsMetadata.decodeRank(encodedDocMetadata) - 13;
if (rank < 0)
sum += rank / 2;
else
sum += rank / 4;
return sum;
}
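
Note: the new term folds the document's domain rank into the keyword score. Isolated, the adjustment is the piecewise expression below; the worked values assume decodeRank() yields a small non-negative number where lower means a better-ranked domain, and that lower sums sort first, so a negative adjustment acts as a boost (both are assumptions, not spelled out in the hunk).

    // Sketch of just the rank term added above.
    static int rankAdjustment(int decodedRank) {
        int rank = decodedRank - 13;
        return rank < 0 ? rank / 2 : rank / 4;
    }
    // rankAdjustment(5)  == -4   well-ranked domain: score improves
    // rankAdjustment(13) ==  0   neutral point
    // rankAdjustment(33) ==  5   poorly ranked domain: mild penalty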

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.model.search;
import lombok.*;
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.index.svc.searchset.SearchSetIdentifier;
import nu.marginalia.wmsa.edge.model.search.domain.SpecificationLimit;
@ -9,23 +10,18 @@ import java.util.List;
@ToString @Getter @Builder @With @AllArgsConstructor
public class EdgeSearchSpecification {
public List<EdgeSearchSubquery> subqueries;
public List<Integer> domains;
public SearchSetIdentifier searchSetIdentifier;
public final int limitByDomain;
public final int limitTotal;
public final String humanQuery;
public final int timeoutMs;
public final int fetchSize;
public final SpecificationLimit quality;
public final SpecificationLimit year;
public final SpecificationLimit size;
public final SpecificationLimit rank;
public final QueryLimits queryLimits;
public final QueryStrategy queryStrategy;
}

View File

@ -6,6 +6,7 @@ import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.QueryLimits;
import nu.marginalia.wmsa.edge.index.model.QueryStrategy;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSubquery;
@ -84,6 +85,8 @@ public class QueryFactory {
List<String> problems = new ArrayList<>();
String domain = null;
QueryStrategy queryStrategy = QueryStrategy.AUTO;
var basicQuery = queryParser.parse(query);
if (basicQuery.size() >= 8) {
@ -94,6 +97,7 @@ public class QueryFactory {
SpecificationLimit qualityLimit = profile.getQualityLimit();
SpecificationLimit year = profile.getYearLimit();
SpecificationLimit size = profile.getSizeLimit();
SpecificationLimit rank = SpecificationLimit.none();
for (Token t : basicQuery) {
if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) {
@ -113,6 +117,12 @@ public class QueryFactory {
if (t.type == TokenType.SIZE_TERM) {
size = parseSpecificationLimit(t.str);
}
if (t.type == TokenType.RANK_TERM) {
rank = parseSpecificationLimit(t.str);
}
if (t.type == TokenType.QS_TERM) {
queryStrategy = parseQueryStrategy(t.str);
}
}
var queryPermutations = queryParser.permuteQueriesNew(basicQuery);
@ -148,6 +158,8 @@ public class QueryFactory {
case QUALITY_TERM:
case YEAR_TERM:
case SIZE_TERM:
case RANK_TERM:
case QS_TERM:
break; //
case NEAR_TERM:
near = t.str;
@ -179,25 +191,25 @@ public class QueryFactory {
}
}
int domainLimit;
if (domain != null) {
domainLimit = 100;
} else {
domainLimit = 2;
}
EdgeSearchSpecification.EdgeSearchSpecificationBuilder specsBuilder = EdgeSearchSpecification.builder()
.subqueries(subqueries)
.limitTotal(100)
.queryLimits(new QueryLimits(domainLimit, 100, 250, 4096))
.humanQuery(query)
.timeoutMs(250)
.fetchSize(4096)
.quality(qualityLimit)
.year(year)
.size(size)
.rank(rank)
.domains(domains)
.queryStrategy(QueryStrategy.AUTO)
.queryStrategy(queryStrategy)
.searchSetIdentifier(profile.searchSetIdentifier);
if (domain != null) {
specsBuilder = specsBuilder.limitByDomain(100);
} else {
specsBuilder = specsBuilder.limitByDomain(2);
}
EdgeSearchSpecification specs = specsBuilder.build();
return new EdgeSearchQuery(specs, searchTermsHuman, domain);
@ -210,10 +222,10 @@ public class QueryFactory {
if (startChar == '=') {
return SpecificationLimit.equals(val);
}
else if (startChar == '<'){
else if (startChar == '<') {
return SpecificationLimit.lessThan(val);
}
else if (startChar == '>'){
else if (startChar == '>') {
return SpecificationLimit.greaterThan(val);
}
else {
@ -221,6 +233,17 @@ public class QueryFactory {
}
}
private QueryStrategy parseQueryStrategy(String str) {
return switch (str.toUpperCase()) {
case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE;
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT;
case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE;
case "SENTENCE" -> QueryStrategy.SENTENCE;
case "TOPIC" -> QueryStrategy.TOPIC;
default -> QueryStrategy.AUTO;
};
}
private String normalizeDomainName(String str) {
return str.toLowerCase();
}
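
Note: together with the QueryParser change that follows, this adds two user-facing filters. For a query such as linear algebra rank>50 qs=RF_TITLE (an illustrative example), the parser tags rank>50 as a RANK_TERM carrying ">50" and qs=RF_TITLE as a QS_TERM carrying "RF_TITLE", and the factory resolves them roughly as below, assuming parseSpecificationLimit reads the integer after the comparator:

    SpecificationLimit rank = SpecificationLimit.greaterThan(50);   // from parseSpecificationLimit(">50")
    QueryStrategy strategy  = QueryStrategy.REQUIRE_FIELD_TITLE;    // from parseQueryStrategy("RF_TITLE")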

View File

@ -93,6 +93,10 @@ public class QueryParser {
entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) {
entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) {
entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr));
} else if (t.str.startsWith("qs=")) {
entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr));
} else if (t.str.contains(":")) {
entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr));
}
@ -506,8 +510,11 @@ enum TokenType implements Predicate<Token> {
QUALITY_TERM,
YEAR_TERM,
SIZE_TERM,
RANK_TERM,
NEAR_TERM,
QS_TERM,
QUOT,
MINUS,
QMARK,

View File

@ -8,7 +8,7 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.KeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.sentence.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;
@ -25,12 +25,12 @@ public class QueryVariants {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final KeywordExtractor keywordExtractor;
private final SentenceExtractor sentenceExtractor;
private final TermFrequencyDict dict;
private final PorterStemmer ps = new PorterStemmer();
private final NGramBloomFilter nGramBloomFilter;
private final EnglishDictionary englishDictionary;
private final ThreadLocal<SentenceExtractor> sentenceExtractor;
@Inject
public QueryVariants(LanguageModels lm,
@ -40,7 +40,7 @@ public class QueryVariants {
this.nGramBloomFilter = nGramBloomFilter;
this.englishDictionary = englishDictionary;
this.keywordExtractor = new KeywordExtractor();
this.sentenceExtractor = new SentenceExtractor(lm);
this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm));
this.dict = dict;
}
@ -78,10 +78,8 @@ public class QueryVariants {
final TreeMap<Integer, List<WordSpan>> byStart = new TreeMap<>();
logger.debug("Q: {}", query);
logger.debug("QAS: {}", joinedQuery);
var sentence = sentenceExtractor.extractSentence(joinedQuery.joinedQuery);
var se = sentenceExtractor.get();
var sentence = se.extractSentence(joinedQuery.joinedQuery);
for (int i = 0; i < sentence.posTags.length; i++) {
if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) {

Some files were not shown because too many files have changed in this diff.