Merge pull request 'Merge changes from experimental-22-08 into master' (#109) from master into release

Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/109
Viktor Lofgren 2022-09-12 10:55:36 +02:00
commit afb0c78e4d
193 changed files with 4478 additions and 2409 deletions

View File

@ -58,7 +58,7 @@ jmhJar {
}
dependencies {
implementation project(':third_party')
implementation project(':protocol')
implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
@ -157,6 +157,7 @@ dependencies {
jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
}
configurations {

View File

@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
}
@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertNotEquals(List.of("Bird"), getTitlesFromSearchResults(html));
assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js"));
}
@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());
var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js"));
}
@ -240,7 +240,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
assertEquals(List.of("Frog", "Amphibian"), getTitlesFromSearchResults(html));
}
@Test

View File

@ -1,6 +1,10 @@
package nu.marginalia.util;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class DenseBitMap {
public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
@ -15,6 +19,31 @@ public class DenseBitMap {
this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
}
public static DenseBitMap loadFromFile(Path file) throws IOException {
long size = Files.size(file);
var dbm = new DenseBitMap(size/8);
try (var bc = Files.newByteChannel(file)) {
while (dbm.buffer.position() < dbm.buffer.capacity()) {
bc.read(dbm.buffer);
}
}
dbm.buffer.clear();
return dbm;
}
public void writeToFile(Path file) throws IOException {
try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
while (buffer.position() < buffer.capacity()) {
bc.write(buffer);
}
}
buffer.clear();
}
public boolean get(long pos) {
return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
}
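A minimal sketch (illustrative, not part of this commit) of the bit addressing that get() above relies on: pos >>> 3 selects the byte in the buffer and pos & 7 selects the bit within that byte. The class name and the sample position are made up.
public class BitAddressingDemo {
    public static void main(String[] args) {
        long pos = 42;                        // bit index into the map
        int byteIndex = (int) (pos >>> 3);    // 42 / 8  -> byte 5
        int bitInByte = (int) (pos & 7);      // 42 % 8  -> bit 2
        int mask = 1 << bitInByte;            // 0b100
        System.out.println(byteIndex + " " + bitInByte + " " + Integer.toBinaryString(mask));
    }
}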

View File

@ -25,14 +25,16 @@ public class CachingBTreeReader {
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
}
public Cache prepareCache() {
return new Cache();
public BTreeCachedIndex prepareCache(BTreeHeader header) {
return new BTreeCachedIndex(header);
}
/**
*
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(BTreeHeader header, Cache cache, final long keyRaw) {
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
BTreeHeader header = cache.header;
final int blockSize = ctx.BLOCK_SIZE_WORDS();
final long key = keyRaw & ctx.equalityMask();
@ -46,7 +48,7 @@ public class CachingBTreeReader {
numEntries = header.numEntries();
}
else {
cache.load(header);
cache.load();
long dataLayerOffset = searchIndex(header, cache, key);
if (dataLayerOffset < 0) {
@ -60,7 +62,7 @@ public class CachingBTreeReader {
return dataSearcher.binarySearch(key, searchStart, numEntries);
}
private long searchIndex(BTreeHeader header, Cache cache, long key) {
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
long layerOffset = 0;
@ -77,11 +79,22 @@ public class CachingBTreeReader {
return layerOffset;
}
public class Cache {
/** A cache for the BTree index data that will drastically reduce the number of disk reads
* for repeated queries against the same tree. The memory consumption is typically very low,
* and the disk access pattern for reading the entire index is relatively cheap.
*/
public class BTreeCachedIndex {
long[] indexData;
final BTreeHeader header;
public void load(BTreeHeader header) {
final int indexedDataSize;
public BTreeCachedIndex(BTreeHeader header) {
this.header = header;
indexedDataSize = header.numEntries();
}
public void load() {
if (indexData != null)
return;
@ -107,5 +120,17 @@ public class CachingBTreeReader {
}
return low;
}
public long sizeBytes() {
return isLoaded() ? 8L*indexData.length : 0;
}
public int getIndexedDataSize() {
return indexedDataSize;
}
public boolean isLoaded() {
return indexData != null;
}
}
}
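A minimal usage sketch (illustrative, not part of this commit), assuming the reader and header objects are obtained the same way as before the change. The point of BTreeCachedIndex is that one instance is prepared per tree and reused across lookups, so the upper index layers are read from disk at most once.
// Sketch: requires the project's CachingBTreeReader and BTreeHeader on the classpath.
static void lookUpAll(CachingBTreeReader reader, BTreeHeader header, long[] keys) {
    var cache = reader.prepareCache(header);          // one cached index per tree
    for (long key : keys) {
        long offset = reader.findEntry(cache, key);   // file offset of the entry, negative if absent
        if (offset >= 0) {
            System.out.println(key + " -> " + offset);
        }
    }
    System.out.println(cache.getIndexedDataSize() + " entries indexed, "
            + cache.sizeBytes() + " bytes held in memory");
}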

View File

@ -1,18 +1,18 @@
package nu.marginalia.util.dict;
import nu.marginalia.util.SeekDictionary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
public class DictionaryData {
private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
private final ArrayList<DictionaryDataBank> banks = new ArrayList(100);
public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
@ -20,12 +20,8 @@ public class DictionaryData {
banks.add(new DictionaryDataBank(0, bankSize));
}
public int size() {
return banks.end();
}
public int add(long key) {
var activeBank = banks.last();
var activeBank = banks.get(banks.size()-1);
int rb = activeBank.add(key);
if (rb == -1) {
@ -42,10 +38,10 @@ public class DictionaryData {
public long getKey(int offset) {
return banks.bankForOffset(offset).getKey(offset);
return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
}
public boolean keyEquals(int offset, long otherKey) {
return banks.bankForOffset(offset).keyEquals(offset, otherKey);
return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
}
private static class DictionaryDataBank {
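A small sketch (illustrative, not part of this commit) of the arithmetic that replaces the SeekDictionary lookup: with fixed-size banks, the bank that owns an offset is found by plain integer division. The bank size of 64 is made up for the sake of the numbers.
public class BankLookupDemo {
    public static void main(String[] args) {
        int bankSize = 64;            // stands in for DICTIONARY_BANK_SIZE
        int offset = 130;
        System.out.println("offset " + offset + " -> bank " + (offset / bankSize)); // bank 2
    }
}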

View File

@ -8,7 +8,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.jsoup.nodes.Document;
import java.io.FileNotFoundException;
@ -30,7 +30,7 @@ public class DocumentDebugger {
Path tempDir;
public DocumentDebugger(LanguageModels lm) throws IOException {
se = new SentenceExtractor(lm);
var dict = new NGramDict(lm);
var dict = new TermFrequencyDict(lm);
ke = new KeywordExtractor();
kc = new KeywordCounter(dict, ke);
@ -69,7 +69,7 @@ public class DocumentDebugger {
Set<String> reps = new HashSet<>();
// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {

View File

@ -19,7 +19,12 @@ public class WordPatterns {
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");
public static final Pattern singleWordAdditionalPattern =
Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");
public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
@ -58,7 +63,7 @@ public class WordPatterns {
if (word.isBlank()) {
return false;
}
if (hasMoreThanTwo(word, '-', 2)) {
if (hasMoreThanTwo(word, '-', 4)) {
return false;
}
if (hasMoreThanTwo(word, '+', 2)) {
@ -75,7 +80,7 @@ public class WordPatterns {
if (Character.isDigit(word.charAt(i))) {
numDigits++;
}
if (numDigits > 6)
if (numDigits > 16)
return false;
}
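A short sketch (illustrative, not part of this commit) of what the new singleWordQualitiesPredicate accepts: a short alphanumeric head followed by up to four ./-/_/: separated tails. The class name and test strings are made up.
import java.util.function.Predicate;
import java.util.regex.Pattern;

public class SingleWordDemo {
    public static void main(String[] args) {
        Predicate<String> p = Pattern
                .compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}")
                .asMatchPredicate();

        System.out.println(p.test("tcp"));          // true
        System.out.println(p.test("3.14.159"));     // true  (dot-separated tails)
        System.out.println(p.test("foo_bar-baz"));  // true  (mixed separators)
        System.out.println(p.test("hello world"));  // false (space not allowed)
        System.out.println(p.test("a".repeat(20))); // false (head longer than 15)
    }
}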

View File

@ -6,8 +6,9 @@ import java.nio.file.Path;
@AllArgsConstructor
public class LanguageModels {
public final Path ngramDictionary;
public final Path ngramFrequency;
public final Path ngramBloomFilter;
public final Path termFrequencies;
public final Path openNLPSentenceDetectionData;
public final Path posRules;
public final Path posDict;

View File

@ -5,8 +5,8 @@ import java.util.regex.Pattern;
public class AsciiFlattener {
private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$");
private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();
public static String flattenUnicode(String s) {
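A tiny sketch (illustrative, not part of this commit) of what the added \- in both patterns changes: hyphenated ASCII tokens now count as already-plain ASCII and skip the flattening path.
import java.util.regex.Pattern;

public class HyphenDemo {
    public static void main(String[] args) {
        var oldPlain = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$").asMatchPredicate();
        var newPlain = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$").asMatchPredicate();

        System.out.println(oldPlain.test("e-mail"));  // false: hyphen forced the flattening path
        System.out.println(newPlain.test("e-mail"));  // true: now treated as plain ASCII
    }
}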

View File

@ -1,99 +1,164 @@
package nu.marginalia.util.language.processing;
import com.google.common.collect.Sets;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import org.jetbrains.annotations.NotNull;
import javax.inject.Inject;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
public class DocumentKeywordExtractor {
private final KeywordExtractor keywordExtractor;
private final KeywordCounter tfIdfCounter;
private final NameCounter nameCounter;
private final LongNameCounter longNameCounter;
private final SubjectCounter subjectCounter;
private final NGramDict dict;
private final TermFrequencyDict dict;
private final double docCount;
@Inject
public DocumentKeywordExtractor(NGramDict dict) {
public DocumentKeywordExtractor(TermFrequencyDict dict) {
this.dict = dict;
docCount = dict.docCount();
keywordExtractor = new KeywordExtractor();
tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
nameCounter = new NameCounter(keywordExtractor);
longNameCounter = new LongNameCounter(dict, keywordExtractor);
subjectCounter = new SubjectCounter(keywordExtractor);
}
public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
Collection<String> artifacts = getArtifacts(documentLanguageData);
return new EdgePageWordSet(
createWords(IndexBlock.Subjects, subjects),
createWords(IndexBlock.Title, titleWords),
createWords(IndexBlock.NamesWords, wordsNamesAll),
createWords(IndexBlock.Tfidf_Top, topKeywords),
createWords(IndexBlock.Tfidf_Middle, midKeywords),
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);
}
public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
List<WordRep> titleWords = extractTitleWords(documentLanguageData);
List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);
int totalSize = wordsTfIdf.size();
List<WordRep> lowKeywords = new ArrayList<>(totalSize / 2);
List<WordRep> midKeywords = new ArrayList<>(totalSize / 2);
List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);
for(var v : wordsTfIdf) {
if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
else lowKeywords.add(v);
}
var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
var words = getSimpleWords(documentLanguageData);
for (var w : wordsLongName)
words.add(w.word);
for (var w : lowKeywords)
words.remove(w.word);
for (var w : midKeywords)
words.remove(w.word);
for (var w : topKeywords)
words.remove(w.word);
List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
Collection<String> artifacts = getArtifacts(documentLanguageData);
var wordSet = new EdgePageWordSet(
createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
createWords(IndexBlock.Topic, subjects),
createWords(IndexBlock.Subjects, subjects),
createWords(IndexBlock.Title, titleWords),
createWords(IndexBlock.NamesWords, wordsNamesAll),
createWords(IndexBlock.Top, topKeywords),
createWords(IndexBlock.Middle, midKeywords),
createWords(IndexBlock.Low, lowKeywords),
createWords(IndexBlock.Tfidf_Top, topKeywords),
createWords(IndexBlock.Tfidf_Middle, midKeywords),
createWords(IndexBlock.Tfidf_Lower, lowKeywords),
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);
wordSet.append(IndexBlock.Words, words);
getSimpleWords(wordSet, documentLanguageData,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
return wordSet;
}
private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
int start = 0;
int lengthGoal = 32;
for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
IndexBlock block = blocks[blockIdx];
Set<String> words = new HashSet<>(lengthGoal+100);
int pos;
int length = 0;
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
length += sent.length();
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
words.add(w);
}
}
}
}
wordSet.append(block, words);
start = pos;
lengthGoal+=32;
}
if (start < documentLanguageData.sentences.length) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}
Set<String> lastSet;
if (counts.size() < 1024) {
lastSet = counts.keySet();
}
else {
lastSet = counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = docCount; // Number of documents in term freq dictionary
// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
}))
.map(Map.Entry::getKey)
.limit(1024)
.collect(Collectors.toCollection(LinkedHashSet::new));
}
wordSet.append(blocks[blocks.length - 1], lastSet);
}
}
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
@ -123,57 +188,7 @@ public class DocumentKeywordExtractor {
.collect(Collectors.toList());
}
private Collection<WordRep> joinWordLists(List<WordRep>... words) {
int size = 0;
for (var lst : words) {
size += lst.size();
}
if (size == 0)
return Collections.emptyList();
final LinkedHashSet<WordRep> ret = new LinkedHashSet<>(size);
for (var lst : words) {
ret.addAll(lst);
}
return ret;
}
@NotNull
private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (var sent : documentLanguageData.sentences) {
for (int i = 0; i < sent.length(); i++) {
if (!sent.isStopWord(i)) {
String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}
return counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary
// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
}))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
}
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
}
private Set<WordRep> overlappingStems(Collection<WordRep> wordsA, Collection<WordRep> wordsB) {
Set<String> stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
Set<String> stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
Set<String> stemmedIntersect = Sets.intersection(stemmedA, stemmedB);
return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet());
}
}
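A worked sketch (illustrative, not part of this commit) of the comparator used to pick the final 1024 simple words: it sorts by a *negated* tf-idf-style score, so rare-but-repeated terms sort first and very common terms sort last. The counts below are made up; the 11820118 figure is the fallback document count used elsewhere in this commit.
public class ScoreDemo {
    static double score(int countInDoc, long docsContainingTerm, double totalDocs) {
        return (1 + Math.log(countInDoc)) * Math.log((1. + docsContainingTerm) / totalDocs);
    }
    public static void main(String[] args) {
        double n = 11_820_118.;
        System.out.println(score(3, 50, n));        // rare term, repeated 3x  -> about -26, sorts first
        System.out.println(score(3, 5_000_000, n)); // very common term        -> about -1.8, sorts last
    }
}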

View File

@ -1,65 +1,92 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import java.util.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class KeywordCounter {
private final KeywordExtractor keywordExtractor;
private final NGramDict dict;
private final TermFrequencyDict dict;
private final double docCount;
public KeywordCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
this.dict = dict;
this.keywordExtractor = keywordExtractor;
this.docCount = (double) dict.docCount();
}
public List<WordRep> count(DocumentLanguageData dld) {
HashMap<String, Double> counts = new HashMap<>(1000);
public WordHistogram countHisto(DocumentLanguageData dld) {
HashMap<String, Integer> counts = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (span.size() == 1 &&
WordPatterns.isStopWord(sent.words[span.start]))
continue;
String stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1., Double::sum);
counts.merge(stemmed, 1, Integer::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}
return counts.entrySet().stream()
.filter(e -> e.getValue() > 1)
.sorted(Comparator.comparing(this::getTermValue))
.map(Map.Entry::getKey)
.flatMap(w -> instances.get(w).stream())
.filter(w -> w.word.length() > 1)
.limit(150)
.collect(Collectors.toList());
double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
Set<WordRep> h5 = new HashSet<>();
Set<WordRep> h10 = new HashSet<>();
Set<WordRep> h15 = new HashSet<>();
int doubleWordCount = 0;
for (var entry : counts.entrySet()) {
double value = getTermValue(entry, maxC);
double avgCnt = entry.getValue();
String wordStemmed = entry.getKey();
Set<WordRep> histogram;
if (value < -3 && avgCnt>1) histogram = h15;
else if (value < -1.75 && avgCnt>1) histogram = h10;
else if (value < -1 &&
(!wordStemmed.contains("_") || doubleWordCount++ < 50))
histogram = h5;
else continue;
histogram.addAll(instances.get(wordStemmed));
}
return new WordHistogram(h5, h10, h15);
}
private static final Pattern separator = Pattern.compile("_");
public double getTermValue(Map.Entry<String, Double> e) {
public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
String[] parts = separator.split(e.getKey());
double totalValue = 0.;
for (String part : parts) {
totalValue += value(part, e.getValue());
totalValue += value(part, e.getValue(), maxValue);
}
return totalValue / Math.sqrt(parts.length);
return totalValue / parts.length;
}
double value(String key, double value) {
double value(String key, double value, double maxValue) {
double freq = dict.getTermFreqStemmed(key);
if (freq < 1) {
freq = 10;
freq = 1;
}
return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
}
public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
}
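A simplified sketch (illustrative, not part of this commit) of how a stemmed term lands in the top/mid/lower buckets of the new WordHistogram, for a single-word term (skipping the underscore splitting and the doubleWordCount cap in the code above). The counts are made up; the thresholds (-3, -1.75, -1) are the ones used above.
public class HistoDemo {
    static double value(long termFreq, double count, double maxCount, double docCount) {
        double freq = termFreq < 1 ? 1 : termFreq;
        return (0.1 + 0.9 * count / maxCount) * Math.log(freq / docCount);
    }
    public static void main(String[] args) {
        double docCount = 11_820_118;
        double v = value(200, 4, 8, docCount);   // rare term, appears 4 times, most frequent term appears 8 times
        String bucket = v < -3 ? "top" : v < -1.75 ? "mid" : v < -1 ? "lower" : "dropped";
        System.out.println(v + " -> " + bucket); // roughly -6.0 -> top
    }
}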

View File

@ -1,93 +1,18 @@
package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.language.WordPatterns;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;
public class KeywordExtractor {
public boolean isLegacy() {
return legacy;
}
public void setLegacy(boolean legacy) {
this.legacy = legacy;
}
private boolean legacy;
public WordSpan[] getNameLikes(DocumentSentence sentence) {
var direct = IntStream.range(0, sentence.length())
.filter(i -> sentence.posTags[i].startsWith("N"))
.mapToObj(i -> new WordSpan(i, i+1))
;
var two = IntStream.range(1, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i -1, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-1, i+1))
;
var a_in_b = IntStream.range(2, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1))
.filter(i -> isProperNoun(i-2, sentence))
.mapToObj(i -> new WordSpan(i-2, i+1))
;
var a_in_det_b = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1))
.filter(i -> sentence.posTags[i-2].equals("DT"))
.filter(i -> isProperNoun(i-3, sentence))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
var a_in_in_b = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
.filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence))
.filter(i -> isProperNoun(i-3, sentence))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
var three = IntStream.range(2, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i-1, sentence, Collections.emptySet()))
.filter(i -> isName(i-2, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-2, i+1))
;
var four = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE
&& sentence.separators[i-3] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i - 1, sentence, Collections.emptySet()))
.filter(i -> isName(i - 2, sentence, Collections.emptySet()))
.filter(i -> isName(i - 3, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity())
.toArray(WordSpan[]::new);
}
public WordSpan[] getNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());
@ -214,7 +139,7 @@ public class KeywordExtractor {
}
String word = sentence.constructWordFromSpan(w);
if (word.isBlank() || WordPatterns.isStopWord(word)) return false;
if (word.isBlank() || !WordPatterns.filter(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;
@ -377,4 +302,6 @@ public class KeywordExtractor {
return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
}
}

View File

@ -3,7 +3,7 @@ package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import java.util.*;
import java.util.regex.Pattern;
@ -11,10 +11,11 @@ import java.util.stream.Collectors;
public class LongNameCounter {
private final KeywordExtractor keywordExtractor;
private final NGramDict dict;
public LongNameCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
private final TermFrequencyDict dict;
private final double docCount;
public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
this.dict = dict;
docCount = (double) dict.docCount();
this.keywordExtractor = keywordExtractor;
}

View File

@ -22,6 +22,9 @@ public class NameCounter {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;
var stemmed = sent.constructStemmedWordFromSpan(span);
counts.merge(stemmed, 1., Double::sum);

View File

@ -2,11 +2,11 @@ package nu.marginalia.wmsa.api;
import com.google.common.base.Strings;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.api.model.ApiLicense;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.server.*;
import nu.marginalia.wmsa.edge.search.client.EdgeSearchClient;
import org.slf4j.Logger;
@ -20,7 +20,7 @@ import java.util.concurrent.ConcurrentHashMap;
public class ApiService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private final Gson gson = GsonFactory.get();
private final EdgeSearchClient searchClient;
private final HikariDataSource dataSource;
private final ConcurrentHashMap<String, ApiLicense> licenseCache = new ConcurrentHashMap<>();

View File

@ -1,12 +1,11 @@
package nu.marginalia.wmsa.client;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.protobuf.GeneratedMessageV3;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
import lombok.SneakyThrows;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.wmsa.client.exception.LocalException;
import nu.marginalia.wmsa.client.exception.NetworkException;
import nu.marginalia.wmsa.client.exception.RemoteException;
@ -17,8 +16,6 @@ import org.apache.http.HttpHost;
import org.apache.logging.log4j.ThreadContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@ -32,9 +29,7 @@ import java.util.zip.GZIPOutputStream;
public abstract class AbstractClient implements AutoCloseable {
public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request";
private final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
private final Gson gson = GsonFactory.get();
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -186,6 +181,31 @@ public abstract class AbstractClient implements AutoCloseable {
.doFinally(() -> ThreadContext.remove("outbound-request"));
}
@SneakyThrows
protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {
ensureAlive();
RequestBody body = RequestBody.create(
MediaType.parse("application/protobuf"),
data.toByteArray());
var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
var call = client.newCall(req);
logInbound(call);
ThreadContext.put("outbound-request", url + endpoint);
try (var rsp = call.execute()) {
logOutbound(rsp);
int code = rsp.code();
return validateStatus(code, req).map(HttpStatusCode::new);
}
finally {
ThreadContext.remove("outbound-request");
}
}
@SneakyThrows
protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {

View File

@ -0,0 +1,29 @@
package nu.marginalia.wmsa.client;
import com.google.gson.*;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import java.net.URISyntaxException;
public class GsonFactory {
public static Gson get() {
return new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
try {
return new EdgeUrl(json.getAsString());
} catch (URISyntaxException e) {
throw new JsonParseException("URL Parse Exception", e);
}
})
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
.create();
}
}
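A minimal usage sketch (illustrative, not part of this commit), assuming the Edge* model classes behave as the adapters above imply (EdgeUrl has a String constructor that may throw URISyntaxException, and toString() yields the URL). Every service that swapped new GsonBuilder().create() for GsonFactory.get() now serializes these types as plain JSON primitives.
import com.google.gson.Gson;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

public class GsonFactoryDemo {
    public static void main(String[] args) throws Exception {
        Gson gson = GsonFactory.get();
        String json = gson.toJson(new EdgeUrl("https://search.marginalia.nu/"));
        System.out.println(json);                          // a plain JSON string primitive
        EdgeUrl back = gson.fromJson(json, EdgeUrl.class); // round-trips through the deserializer
        System.out.println(back);
    }
}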

View File

@ -2,10 +2,7 @@ package nu.marginalia.wmsa.configuration;
import nu.marginalia.wmsa.api.ApiMain;
import nu.marginalia.wmsa.auth.AuthMain;
import nu.marginalia.wmsa.configuration.command.Command;
import nu.marginalia.wmsa.configuration.command.ListCommand;
import nu.marginalia.wmsa.configuration.command.StartCommand;
import nu.marginalia.wmsa.configuration.command.VersionCommand;
import nu.marginalia.wmsa.configuration.command.*;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
@ -82,6 +79,9 @@ public enum ServiceDescriptor {
MainMapLookup.setMainArguments(args);
Map<String, Command> functions = Stream.of(new ListCommand(),
new StartCommand(),
new ConvertCommand(),
new LoadCommand(),
new ReindexCommand(),
new VersionCommand()
).collect(Collectors.toMap(c -> c.name, c -> c));

View File

@ -87,7 +87,7 @@ public class WmsaHome {
final Path home = getHomePath();
return new LanguageModels(
home.resolve("model/ngrams-generous-emstr.bin"),
home.resolve("model/ngrams.bin"),
home.resolve("model/tfreq-new-algo3.bin"),
home.resolve("model/opennlp-sentence.bin"),
home.resolve("model/English.RDR"),
@ -95,4 +95,8 @@ public class WmsaHome {
home.resolve("model/opennlp-tok.bin"));
}
private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
public static boolean isDebug() {
return debugMode;
}
}
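A tiny sketch (illustrative, not part of this commit): the new flag is a plain JVM system property read via Boolean.getBoolean, so it is toggled at launch rather than through configuration.
import nu.marginalia.wmsa.configuration.WmsaHome;

public class DebugFlagDemo {
    public static void main(String[] args) {
        // toggled at launch, e.g.:  java -Dwmsa-debug=true ...
        System.out.println("debug mode: " + WmsaHome.isDebug());
    }
}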

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.ConverterMain;
import java.util.Arrays;
public class ConvertCommand extends Command {
public ConvertCommand() {
super("convert");
}
@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: convert plan.yaml");
System.exit(255);
}
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
ConverterMain.main(args2);
}
}

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.LoaderMain;
import java.util.Arrays;
public class LoadCommand extends Command {
public LoadCommand() {
super("load");
}
@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: load plan.yaml");
System.exit(255);
}
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
LoaderMain.main(args2);
}
}

View File

@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.ReindexTriggerMain;
import java.util.Arrays;
public class ReindexCommand extends Command {
public ReindexCommand() {
super("reindex");
}
@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: reindex host");
System.exit(255);
}
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
ReindexTriggerMain.main(args2);
}
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.configuration.command;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import java.util.Arrays;
@ -14,6 +15,12 @@ public class StartCommand extends Command {
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: start service-descriptor");
System.err.println();
System.err.println("Available services:");
System.err.println();
for (var d : ServiceDescriptor.values()) {
System.err.println("\t"+d.name);
}
System.exit(255);
}
var mainMethod = getKind(args[1]).mainClass.getMethod("main", String[].class);

View File

@ -84,6 +84,7 @@ public class DatabaseModule extends AbstractModule {
config.addDataSourceProperty("cachePrepStmts", "true");
config.addDataSourceProperty("prepStmtCacheSize", "250");
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
config.setMaximumPoolSize(100);
config.setMinimumIdle(10);
return new HikariDataSource(config);

View File

@ -1,10 +1,10 @@
package nu.marginalia.wmsa.edge.assistant;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
@ -22,7 +22,7 @@ import spark.Spark;
public class EdgeAssistantService extends Service {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private final Gson gson = GsonFactory.get();
private final Units units;
private final MathParser mathParser;
private final Suggestions suggestions;

View File

@ -0,0 +1,93 @@
package nu.marginalia.wmsa.edge.assistant.dict;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
public class NGramBloomFilter {
private final DenseBitMap bitMap;
private static final PorterStemmer ps = new PorterStemmer();
private static final HashFunction hasher = Hashing.murmur3_128(0);
private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);
@Inject
public NGramBloomFilter() throws IOException {
this(WmsaHome.getLanguageModels());
}
public NGramBloomFilter(LanguageModels lm) throws IOException {
this(loadSafely(lm.ngramBloomFilter));
}
private static DenseBitMap loadSafely(Path path) throws IOException {
if (Files.isRegularFile(path)) {
return DenseBitMap.loadFromFile(path);
}
else {
logger.warn("NGrams file missing " + path);
return new DenseBitMap(1);
}
}
public NGramBloomFilter(DenseBitMap bitMap) {
this.bitMap = bitMap;
}
public boolean isKnownNGram(String word) {
long bit = bitForWord(word, bitMap.cardinality);
return bitMap.get(bit);
}
public static void main(String... args) throws IOException {
var filter = convertFromDictionaryFile(new File(args[0]));
filter.bitMap.writeToFile(Path.of(args[1]));
}
public static NGramBloomFilter load(Path file) throws IOException {
return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
}
public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
AtomicInteger popCount = new AtomicInteger();
try (var f = new KeywordLexiconJournalFile(file)) {
f.loadFile(data -> {
long bit = bitForWord(new String(data), bitMap.cardinality);
if (!bitMap.set(bit))
popCount.incrementAndGet();
});
}
System.out.println("popcount = " + popCount.get());
return new NGramBloomFilter(bitMap);
}
private static final Pattern underscore = Pattern.compile("_");
private static long bitForWord(String s, long n) {
String[] parts = underscore.split(s);
long hc = 0;
for (String part : parts) {
hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
}
return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
}
}
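A minimal usage sketch (illustrative, not part of this commit), assuming WmsaHome points at a populated model directory; if the filter file is missing, the constructor above falls back to an empty 1-bit map and logs a warning. With a single hash function the usual bloom-filter semantics still hold: a clear bit is definitive, a set bit may be a collision.
import java.io.IOException;
import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

public class NGramFilterDemo {
    public static void main(String[] args) throws IOException {
        var filter = new NGramBloomFilter();                  // loads ngramBloomFilter from the LanguageModels

        System.out.println(filter.isKnownNGram("new_york"));  // true if the stemmed n-gram's bit is set
        System.out.println(filter.isKnownNGram("qzx_vvqk"));  // false means definitively absent
    }
}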

View File

@ -1,137 +0,0 @@
package nu.marginalia.wmsa.edge.assistant.dict;
import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.util.language.conf.LanguageModels;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Singleton
public class NGramDict {
private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Pattern separator = Pattern.compile("[_ ]+");
private static final PorterStemmer ps = new PorterStemmer();
private static long fileSize(Path p) throws IOException {
return Files.size(p);
}
@Inject
public NGramDict(@Nullable LanguageModels models) {
if (models == null) {
return;
}
if (models.ngramFrequency != null) {
try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.ngramFrequency.toFile())))) {
wordRates.ensureCapacity((int)(fileSize(models.ngramFrequency)/16));
for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + models.ngramFrequency, e);
}
}
logger.info("Read {} N-grams frequencies", wordRates.size());
}
public static void main(String... args) {
if (args.length != 2) {
System.err.println("Expected arguments: in-file out-file");
}
String inFile = args[0];
String outFile = args[1];
var wordPattern = Pattern.compile("\\w+(_\\w+)*").asMatchPredicate();
try (var linesStr = Files.lines(Path.of(inFile));
var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile)))
) {
linesStr
.filter(wordPattern)
.mapToLong(NGramDict::getStringHash).forEach(l ->
{
try {
dos.writeLong(l);
} catch (IOException e) {
e.printStackTrace();
}
});
} catch (IOException e) {
e.printStackTrace();
}
}
public static long getStringHash(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
byte[][] parts = new byte[strings.length][];
for (int i = 0; i < parts.length; i++) {
parts[i] = ps.stemWord(strings[i]).getBytes();
}
return longHash(parts);
}
else {
return longHash(s.getBytes());
}
}
public long getTermFreqHash(long hash) {
return wordRates.get(hash);
}
public long getTermFreq(String s) {
return wordRates.get(getStringHash(s));
}
public long getTermFreqStemmed(String s) {
return wordRates.get(longHash(s.getBytes()));
}
public static String getStemmedString(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
}
else {
return s;
}
}
public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;
// https://cp-algorithms.com/string/string-hashing.html
int p = 127;
long m = (1L<<61)-1;
long p_power = 1;
long hash_val = 0;
for (byte[] bytes: bytesSets) {
for (byte element : bytes) {
hash_val = (hash_val + (element + 1) * p_power) % m;
p_power = (p_power * p) % m;
}
}
return hash_val;
}
}

View File

@ -0,0 +1,221 @@
package nu.marginalia.wmsa.edge.assistant.dict;
import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@Singleton
public class TermFrequencyDict {
private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Pattern separator = Pattern.compile("[_ ]+");
private static final PorterStemmer ps = new PorterStemmer();
private static final long DOC_COUNT_KEY = ~0L;
private static long fileSize(Path p) throws IOException {
return Files.size(p);
}
@Inject
public TermFrequencyDict(@Nullable LanguageModels models) {
if (models == null) {
return;
}
if (models.termFrequencies != null) {
try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {
wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));
for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + models.termFrequencies, e);
}
}
logger.info("Read {} N-grams frequencies", wordRates.size());
}
public int docCount() {
int cnt = wordRates.get(DOC_COUNT_KEY);
if (cnt == 0) {
cnt = 11820118; // legacy
}
return cnt;
}
public static void main(String... args) throws IOException, InterruptedException {
if (args.length != 2) {
System.err.println("Expected arguments: plan.yaml out-file");
}
String outFile = args[1];
var plan = new CrawlPlanLoader().load(Path.of(args[0]));
ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
LanguageFilter lf = new LanguageFilter();
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
ForkJoinPool fjp = new ForkJoinPool(24);
AtomicInteger docCount = new AtomicInteger();
for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
if (domain.doc == null)
continue;
fjp.execute(() -> {
for (var doc : domain.doc) {
if (doc.documentBody == null)
continue;
docCount.incrementAndGet();
Document parsed = Jsoup.parse(doc.documentBody);
parsed.body().filter(new DomPruningFilter(0.5));
DocumentLanguageData dld = se.get().extractSentences(parsed);
if (lf.dictionaryAgreement(dld) < 0.1) {
return;
}
Set<String> words = new HashSet<>(10_000);
for (var sent : dld.sentences) {
for (var word : sent) {
words.add(word.stemmed());
}
}
fjp.execute(() -> {
synchronized (counts) {
for (var word : words) {
counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
}
}
});
}
});
}
fjp.shutdown();
fjp.awaitTermination(10, TimeUnit.SECONDS);
try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
synchronized (counts) {
counts.put(DOC_COUNT_KEY, docCount.get());
counts.forEachEntry((hash, cnt) -> {
try {
dos.writeLong(hash);
dos.writeLong(cnt);
} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
});
}
}
System.out.println(docCount.get());
//
// counts.forEachEntry((w,c) -> {
// if (c > 3L) {
// System.out.println(w + ":" + c);
// }
// return true;
// });
}
public static long getStringHash(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
byte[][] parts = new byte[strings.length][];
for (int i = 0; i < parts.length; i++) {
parts[i] = ps.stemWord(strings[i]).getBytes();
}
return longHash(parts);
}
else {
return longHash(s.getBytes());
}
}
public long getTermFreqHash(long hash) {
return wordRates.get(hash);
}
public long getTermFreq(String s) {
return wordRates.get(getStringHash(s));
}
public long getTermFreqStemmed(String s) {
return wordRates.get(longHash(s.getBytes()));
}
public static String getStemmedString(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
}
else {
return s;
}
}
public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;
// https://cp-algorithms.com/string/string-hashing.html
int p = 127;
long m = (1L<<61)-1;
long p_power = 1;
long hash_val = 0;
for (byte[] bytes: bytesSets) {
for (byte element : bytes) {
hash_val = (hash_val + (element + 1) * p_power) % m;
p_power = (p_power * p) % m;
}
}
return hash_val;
}
}
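A minimal usage sketch (illustrative, not part of this commit), assuming the term-frequency file from the WmsaHome change above is present. The document count now comes from the file itself (stored under the reserved ~0L key), with 11820118 kept as a fallback for older files.
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;

public class TermFreqDemo {
    public static void main(String[] args) throws Exception {
        var dict = new TermFrequencyDict(WmsaHome.getLanguageModels()); // reads model/tfreq-new-algo3.bin

        System.out.println("N = " + dict.docCount());
        System.out.println("df(\"search engine\") = " + dict.getTermFreq("search engine"));
    }
}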

View File

@ -6,7 +6,7 @@ import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.assistant.suggest;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.slf4j.Logger;
@ -21,7 +21,7 @@ import java.util.stream.Stream;
public class Suggestions {
private final PatriciaTrie<String> suggestionsTrie;
private final NGramDict nGramDict;
private final TermFrequencyDict termFrequencyDict;
private final SpellChecker spellChecker;
private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@ -31,12 +31,12 @@ public class Suggestions {
@Inject
public Suggestions(@Named("suggestions-file") Path suggestionsFile,
SpellChecker spellChecker,
NGramDict dict
TermFrequencyDict dict
) {
this.spellChecker = spellChecker;
suggestionsTrie = loadSuggestions(suggestionsFile);
nGramDict = dict;
termFrequencyDict = dict;
logger.info("Loaded {} suggestions", suggestionsTrie.size());
}
@ -138,7 +138,7 @@ public class Suggestions {
}
Map<String, Long> scach = new HashMap<>(512);
Function<String, Long> valr = s -> -nGramDict.getTermFreqHash(scach.computeIfAbsent(s, NGramDict::getStringHash));
Function<String, Long> valr = s -> -termFrequencyDict.getTermFreqHash(scach.computeIfAbsent(s, TermFrequencyDict::getStringHash));
return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
.takeWhile(s -> s.startsWith(prefix))

View File

@ -22,7 +22,7 @@ import java.util.List;
public class ConverterMain {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final CrawledInstructionWriter instructionWriter;
private final LoadInstructionWriter instructionWriter;
public static void main(String... args) throws IOException {
@ -47,12 +47,12 @@ public class ConverterMain {
Gson gson
) throws Exception {
instructionWriter = new CrawledInstructionWriter(plan.process.getDir(), gson);
instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);
logger.info("Starting pipe");
try (WorkLog processLog = plan.createProcessWorkLog()) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {
var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {
@Override
protected ProcessingInstructions onProcess(CrawledDomain domainData) {
@ -73,12 +73,7 @@ public class ConverterMain {
};
plan.forEachCrawledDomain(domain -> {
if (!processLog.isJobFinished(domain.id)) {
logger.info("{} - {}", domain.domain, domain.id);
pipe.accept(domain);
}
});
plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);
pipe.join();
}

View File

@ -1,16 +1,17 @@
package nu.marginalia.wmsa.edge.converting;
import com.google.gson.*;
import com.google.gson.Gson;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexLocalService;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import java.net.URISyntaxException;
import java.nio.file.Path;
public class ConverterModule extends AbstractModule {
@ -31,24 +32,20 @@ public class ConverterModule extends AbstractModule {
bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
if (null != System.getProperty("local-index-path")) {
bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path")));
bind(EdgeIndexWriterClient.class).to(EdgeIndexLocalService.class);
}
else {
bind(EdgeIndexWriterClient.class).to(EdgeIndexClient.class);
}
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
}
private Gson createGson() {
return new GsonBuilder()
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
try {
return new EdgeUrl(json.getAsString());
} catch (URISyntaxException e) {
throw new JsonParseException("URL Parse Exception", e);
}
})
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
return GsonFactory.get();
}
}

View File

@ -1,62 +0,0 @@
package nu.marginalia.wmsa.edge.converting;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
public class CrawledInstructionWriter {
private final Path outputDir;
private final Gson gson;
private static final Logger logger = LoggerFactory.getLogger(CrawledInstructionWriter.class);
public CrawledInstructionWriter(Path outputDir, Gson gson) {
this.outputDir = outputDir;
this.gson = gson;
if (!Files.isDirectory(outputDir)) {
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
}
}
public String accept(String id, List<Instruction> instructionList) throws IOException {
Path outputFile = getOutputFile(id);
if (Files.exists(outputFile)) {
Files.delete(outputFile);
}
try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
logger.info("Writing {} - {}", id, instructionList.size());
for (var instr : instructionList) {
outputStream.append(instr.tag().name());
outputStream.append(' ');
gson.toJson(instr, outputStream);
outputStream.append('\n');
}
}
return outputFile.getFileName().toString();
}
private Path getOutputFile(String id) throws IOException {
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = outputDir.resolve(first).resolve(second);
if (!Files.exists(destDir)) {
Files.createDirectories(destDir);
}
return destDir.resolve(id + ".pzstd");
}
}

View File

@ -2,11 +2,10 @@ package nu.marginalia.wmsa.edge.converting;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import java.io.IOException;
import java.nio.file.Files;
@ -76,9 +75,8 @@ public class LinkKeywordLoaderMain {
// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
).blockingSubscribe();
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId),
new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0);
}
lastLine = urlKeyword.url;

View File

@ -0,0 +1,121 @@
package nu.marginalia.wmsa.edge.converting;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
public class LoadInstructionWriter {
private final Path outputDir;
private final Gson gson;
private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
public LoadInstructionWriter(Path outputDir, Gson gson) {
this.outputDir = outputDir;
this.gson = gson;
if (!Files.isDirectory(outputDir)) {
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
}
}
public String accept(String id, List<Instruction> instructionList) throws IOException {
Path outputFile = getOutputFile(id);
if (Files.exists(outputFile)) {
Files.delete(outputFile);
}
try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
SummarizingInterpreter summary = new SummarizingInterpreter(instructionList);
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
for (var instr : instructionList) {
outputStream.append(instr.tag().name());
outputStream.append(' ');
gson.toJson(instr, outputStream);
outputStream.append('\n');
}
}
return outputFile.getFileName().toString();
}
private Path getOutputFile(String id) throws IOException {
String first = id.substring(0, 2);
String second = id.substring(2, 4);
Path destDir = outputDir.resolve(first).resolve(second);
if (!Files.exists(destDir)) {
Files.createDirectories(destDir);
}
return destDir.resolve(id + ".pzstd");
}
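// Applies each instruction to itself once to build a short summary (domain name plus ok/error document counts) for the log line above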
private static class SummarizingInterpreter implements Interpreter {
private SummarizingInterpreter(List<Instruction> instructions) {
for (var i : instructions) {
i.apply(this);
}
}
private String domainName;
private int ok = 0;
private int error = 0;
public String toString() {
return String.format("%s - %d %d", domainName, ok, error);
}
@Override
public void loadUrl(EdgeUrl[] url) {}
@Override
public void loadDomain(EdgeDomain[] domain) {}
@Override
public void loadRssFeed(EdgeUrl[] rssFeed) {}
@Override
public void loadDomainLink(DomainLink[] links) {}
@Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
this.domainName = domain.toString();
}
@Override
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
ok++;
}
@Override
public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
error++;
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
@Override
public void loadDomainRedirect(DomainLink link) {}
}
}

View File

@ -27,7 +27,6 @@ public class LoaderMain {
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
private final Path processDir;
private final EdgeCrawlPlan plan;
private final ConvertedDomainReader instructionsReader;
private final LoaderFactory loaderFactory;
@ -59,7 +58,6 @@ public class LoaderMain {
LoaderFactory loaderFactory,
EdgeIndexClient indexClient) {
this.processDir = plan.process.getDir();
this.plan = plan;
this.instructionsReader = instructionsReader;
this.loaderFactory = loaderFactory;
@ -106,7 +104,12 @@ public class LoaderMain {
public void run() {
long startTime = System.currentTimeMillis();
for (var i : instructionList) {
i.apply(loader);
try {
i.apply(loader);
}
catch (Exception ex) {
logger.error("Failed to load instruction {}", i);
}
}
loader.finish();

View File

@ -6,7 +6,7 @@ import lombok.SneakyThrows;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.apache.logging.log4j.util.Strings;
@ -36,7 +36,7 @@ public class AnchorTextExtractor {
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
private final TermFrequencyDict ngramDict = new TermFrequencyDict(WmsaHome.getLanguageModels());
public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
Predicate<EdgeUrl> includeUrlPredicate,

View File

@ -4,23 +4,22 @@ import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
public class IndexLoadKeywords implements Runnable {
private final EdgeIndexClient client;
private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class);
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
private record InsertTask(int urlId, int domainId, EdgePageWordSet wordSet) {}
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
private final EdgeIndexWriterClient client;
private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {}
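// Keyword batches are queued here and written to the index by the dedicated background thread started below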
private final Thread runThread;
private volatile boolean canceled = false;
@ -28,7 +27,7 @@ public class IndexLoadKeywords implements Runnable {
private static final int index = Integer.getInteger("keyword-index", 1);
@Inject
public IndexLoadKeywords(EdgeIndexClient client) {
public IndexLoadKeywords(EdgeIndexWriterClient client) {
this.client = client;
runThread = new Thread(this, getClass().getSimpleName());
runThread.start();
@ -39,7 +38,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index);
}
}
}
@ -53,15 +52,13 @@ public class IndexLoadKeywords implements Runnable {
int domainId = loaderData.getDomainId(url.domain);
int urlId = loaderData.getUrlId(url);
if (urlId < 0 || domainId < 0) {
if (urlId <= 0 || domainId <= 0) {
logger.warn("Failed to get IDs for {} -- d={},u={}", url, domainId, urlId);
return;
}
var ws = new EdgePageWordSet();
for (var doc : words) {
ws.append(doc.block(), Arrays.asList(doc.keywords()));
for (var ws : words) {
insertQueue.put(new InsertTask(urlId, domainId, ws));
}
insertQueue.put(new InsertTask(urlId, domainId, ws));
}
}

View File

@ -27,6 +27,9 @@ public class Loader implements Interpreter {
private final List<LoadProcessedDocument> processedDocumentList;
private final List<LoadProcessedDocumentWithError> processedDocumentWithErrorList;
private final List<EdgeDomain> deferredDomains = new ArrayList<>();
private final List<EdgeUrl> deferredUrls = new ArrayList<>();
public final LoaderData data;
public Loader(int sizeHint,
@ -72,28 +75,54 @@ public class Loader implements Interpreter {
@Override
public void loadDomainLink(DomainLink[] links) {
logger.debug("loadDomainLink({})", links, null);
sqlLoadDomainLinks.load(links);
sqlLoadDomainLinks.load(data, links);
}
@Override
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
sqlLoadProcessedDomain.load(data, domain, state, ip);
}
@Override
public void loadProcessedDocument(LoadProcessedDocument document) {
deferralCheck(document.url());
processedDocumentList.add(document);
}
@Override
public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) {
deferralCheck(document.url());
processedDocumentWithErrorList.add(document);
}
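// Remember domains and URLs that have not been assigned database IDs yet; they are loaded just before their keywords (see loadKeywords below)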
private void deferralCheck(EdgeUrl url) {
if (data.getDomainId(url.domain) <= 0)
deferredDomains.add(url.domain);
if (data.getUrlId(url) <= 0)
deferredUrls.add(url);
}
@Override
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {
logger.debug("loadKeywords(#{})", words.length);
// This is a bit of a bandaid safeguard against a bug
// in the converter; shouldn't be necessary in the future
if (!deferredDomains.isEmpty()) {
loadDomain(deferredDomains.toArray(EdgeDomain[]::new));
deferredDomains.clear();
}
if (!deferredUrls.isEmpty()) {
loadUrl(deferredUrls.toArray(EdgeUrl[]::new));
deferredUrls.clear();
}
try {
indexLoadKeywords.load(data, url, words);
} catch (InterruptedException e) {

View File

@ -40,13 +40,21 @@ public class SqlLoadDomainLinks {
}
}
public void load(DomainLink[] links) {
public void load(LoaderData data, DomainLink[] links) {
try (var connection = dataSource.getConnection();
var nukeExistingLinksForDomain =
connection.prepareStatement("""
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
""");
var stmt =
connection.prepareCall("CALL INSERT_LINK(?,?)"))
{
connection.setAutoCommit(false);
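// Drop the domain's previously stored outbound links before inserting the new set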
nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from()));
nukeExistingLinksForDomain.executeUpdate();
for (DomainLink link : links) {
stmt.setString(1, link.from().toString());
stmt.setString(2, link.to().toString());
@ -60,6 +68,10 @@ public class SqlLoadDomainLinks {
logger.warn("load({}) -- bad row count {}", links[rv], ret[rv]);
}
}
connection.commit();
connection.setAutoCommit(true);
}
catch (SQLException ex) {
logger.warn("SQL error inserting domain links", ex);

View File

@ -41,16 +41,18 @@ public class SqlLoadDomains {
try (var connection = dataSource.getConnection()) {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
connection.setAutoCommit(false);
insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.domain);
insertCall.addBatch();
var ret = insertCall.executeUpdate();
connection.commit();
if (ret < 0) {
logger.warn("load({}) -- bad row count {}", domain, ret);
logger.warn("load({}) -- bad return status {}", domain, ret);
}
findIdForTargetDomain(connection, data);
findIdForDomain(connection, data, domain);
connection.setAutoCommit(true);
}
}
catch (SQLException ex) {
@ -67,30 +69,48 @@ public class SqlLoadDomains {
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
int cnt = 0; int batchOffset = 0;
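// Flush and commit the insert batch every 1000 domains to keep transactions manageable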
for (var domain : domains) {
insertCall.setString(1, domain.toString());
insertCall.setString(2, domain.domain);
insertCall.addBatch();
}
var ret = insertCall.executeBatch();
for (int rv = 0; rv < domains.length; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", domains[rv], ret[rv]);
if (++cnt == 1000) {
var ret = insertCall.executeBatch();
connection.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]);
}
}
cnt = 0;
batchOffset += 1000;
}
}
if (cnt > 0) {
var ret = insertCall.executeBatch();
connection.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]);
}
}
}
}
connection.commit();
connection.setAutoCommit(true);
findIdForTargetDomain(connection, data);
findIdForDomain(connection, data, domains);
}
catch (SQLException ex) {
logger.warn("SQL error inserting domains", ex);
}
}
void findIdForTargetDomain(Connection connection, LoaderData data) {
void findIdForDomain(Connection connection, LoaderData data, EdgeDomain... domains) {
if (data.getTargetDomain() == null || data.getDomainId(data.getTargetDomain()) > 0) {
return;
}
@ -98,14 +118,39 @@ public class SqlLoadDomains {
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
{
var targetDomain = data.getTargetDomain();
query.setString(1, targetDomain.toString());
var rsp = query.executeQuery();
if (rsp.next()) {
data.addDomain(targetDomain, rsp.getInt(1));
for (var domain : domains) {
if (data.getDomainId(domain) > 0)
continue;
query.setString(1, domain.toString());
var rsp = query.executeQuery();
if (rsp.next()) {
data.addDomain(domain, rsp.getInt(1));
} else {
logger.warn("load() -- could not find ID for target domain {}", domain);
}
}
else {
logger.warn("load() -- could not find ID for target domain {}", targetDomain);
}
catch (SQLException ex) {
logger.warn("SQL error finding id for domain", ex);
}
}
void loadAdditionalDomains(Connection connection, LoaderData data, EdgeDomain[] domains) {
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
{
for (var domain : domains) {
if (data.getDomainId(domain) == 0) continue;
query.setString(1, domain.toString());
var rsp = query.executeQuery();
if (rsp.next()) {
data.addDomain(domain, rsp.getInt(1));
} else {
logger.warn("load() -- could not find ID for target domain {}", domain);
}
}
}
catch (SQLException ex) {

View File

@ -60,13 +60,15 @@ public class SqlLoadProcessedDocument {
}
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
conn.setAutoCommit(false);
int cnt = 0; int batchOffset = 0;
for (var doc : documents) {
int urlId = data.getUrlId(doc.url());
if (urlId < 0) {
if (urlId <= 0) {
logger.warn("Failed to resolve ID for URL {}", doc.url());
return;
}
@ -81,25 +83,46 @@ public class SqlLoadProcessedDocument {
stmt.setDouble(8, doc.quality());
stmt.setInt(9, (int) doc.hash());
stmt.addBatch();
}
var ret = stmt.executeBatch();
for (int rv = 0; rv < documents.size(); rv++) {
if (ret[rv] < 1 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
if (++cnt == 100) {
var ret = stmt.executeBatch();
conn.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
}
}
cnt = 0;
batchOffset += 100;
}
}
if (cnt > 0) {
var ret = stmt.executeBatch();
conn.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
}
}
}
conn.commit();
conn.setAutoCommit(true);
} catch (SQLException ex) {
logger.warn("SQL error inserting document", ex);
}
}
public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT_BAD(?, ?)")) {
conn.setAutoCommit(false);
int cnt = 0; int batchOffset = 0;
for (var doc : documents) {
int urlId = data.getUrlId(doc.url());
if (urlId < 0) {
@ -110,13 +133,32 @@ public class SqlLoadProcessedDocument {
stmt.setInt(1, urlId);
stmt.setString(2, doc.state().name());
stmt.addBatch();
}
var ret = stmt.executeBatch();
for (int rv = 0; rv < documents.size(); rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
if (++cnt == 100) {
var ret = stmt.executeBatch();
conn.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
}
}
cnt = 0;
batchOffset += 100;
}
}
if (cnt > 0) {
var ret = stmt.executeBatch();
conn.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
}
}
}
conn.setAutoCommit(true);
} catch (SQLException ex) {
logger.warn("SQL error inserting failed document", ex);
}

View File

@ -14,6 +14,7 @@ public class SqlLoadProcessedDomain {
private final HikariDataSource dataSource;
private final SqlLoadDomains loadDomains;
private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class);
@Inject
public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) {
this.dataSource = dataSource;
@ -54,6 +55,7 @@ public class SqlLoadProcessedDomain {
initCall.setInt(3, data.getDomainId(domain));
initCall.setString(4, ip);
int rc = initCall.executeUpdate();
conn.commit();
if (rc < 1) {
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
}
@ -75,6 +77,7 @@ public class SqlLoadProcessedDomain {
stmt.setString(1, link.to().toString());
stmt.setString(2, link.from().toString());
int rc = stmt.executeUpdate();
conn.commit();
if (rc != 1) {
logger.warn("loadAlias({}) - unexpected row count {}", link, rc);
}

View File

@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -11,6 +12,8 @@ import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.sql.Types;
import java.util.HashSet;
import java.util.Set;
import static java.sql.Statement.SUCCESS_NO_INFO;
@ -46,17 +49,22 @@ public class SqlLoadUrls {
}
public void load(LoaderData data, EdgeUrl[] urls) {
Set<EdgeDomain> affectedDomains = new HashSet<>();
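// Track every domain touched in this batch so URL IDs can be resolved for all of them afterwards, not only the target domain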
try (var conn = dataSource.getConnection();
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
)
{
conn.setAutoCommit(false);
int cnt = 0; int batchOffset = 0;
for (var url : urls) {
if (url.path.length() >= 255) {
logger.warn("Skipping bad URL {}", url);
continue;
}
affectedDomains.add(url.domain);
insertCall.setString(1, url.proto);
insertCall.setString(2, url.domain.toString());
@ -70,30 +78,48 @@ public class SqlLoadUrls {
insertCall.setString(5, url.param);
insertCall.setLong(6, hashPath(url.path, url.param));
insertCall.addBatch();
if (cnt++ == 250) {
var ret = insertCall.executeBatch();
conn.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]);
}
}
batchOffset += cnt;
cnt = 0;
}
}
var ret = insertCall.executeBatch();
for (int rv = 0; rv < ret.length; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]);
if (cnt > 0) {
var ret = insertCall.executeBatch();
conn.commit();
for (int rv = 0; rv < cnt; rv++) {
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]);
}
}
}
conn.commit();
conn.setAutoCommit(true);
var targetDomain = data.getTargetDomain();
queryCall.setInt(1, data.getDomainId(targetDomain));
for (var domain : affectedDomains) {
queryCall.setInt(1, data.getDomainId(domain));
var rsp = queryCall.executeQuery();
rsp.setFetchSize(1000);
var rsp = queryCall.executeQuery();
while (rsp.next()) {
int urlId = rsp.getInt(1);
String proto = rsp.getString(2);
String path = rsp.getString(3);
String param = rsp.getString(4);
while (rsp.next()) {
int urlId = rsp.getInt(1);
String proto = rsp.getString(2);
String path = rsp.getString(3);
String param = rsp.getString(4);
data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId);
}
}
}

View File

@ -17,6 +17,7 @@ public class DisqualifiedException extends Exception {
LANGUAGE,
STATUS,
QUALITY,
ACCEPTABLE_ADS
ACCEPTABLE_ADS,
FORBIDDEN
}
}

View File

@ -15,6 +15,7 @@ public class ProcessedDocument {
public EdgePageWordSet words;
public EdgeUrlState state;
public String stateReason;
public OptionalDouble quality() {
if (details != null) {

View File

@ -70,11 +70,22 @@ public class DocumentProcessor {
this.summaryExtractor = summaryExtractor;
}
public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
ProcessedDocument ret = new ProcessedDocument();
try {
ret.state = EdgeUrlState.DISQUALIFIED;
ret.url = getDocumentUrl(crawledDocument);
}
catch (Exception ex) {}
return ret;
}
public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
ProcessedDocument ret = new ProcessedDocument();
try {
ret.url = new EdgeUrl(crawledDocument.url);
ret.url = getDocumentUrl(crawledDocument);
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
if (ret.state == EdgeUrlState.OK) {
@ -86,10 +97,6 @@ public class DocumentProcessor {
if (isAcceptedContentType(crawledDocument)) {
var detailsWords = createDetails(crawledDomain, crawledDocument);
if (detailsWords.details().quality < minDocumentQuality) {
throw new DisqualifiedException(DisqualificationReason.QUALITY);
}
ret.details = detailsWords.details();
ret.words = detailsWords.words();
}
@ -103,17 +110,31 @@ public class DocumentProcessor {
}
catch (DisqualifiedException ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
ret.stateReason = ex.reason.toString();
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
}
catch (Exception ex) {
ret.state = EdgeUrlState.DISQUALIFIED;
logger.info("Failed to convert " + ret.url, ex);
logger.info("Failed to convert " + crawledDocument.url, ex);
ex.printStackTrace();
}
return ret;
}
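// Prefer the document's canonical URL when it is present and parseable, falling back to the crawled URL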
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
throws URISyntaxException
{
if (crawledDocument.canonicalUrl != null) {
try {
return new EdgeUrl(crawledDocument.canonicalUrl);
}
catch (URISyntaxException ex) { /* fallthrough */ }
}
return new EdgeUrl(crawledDocument.url);
}
public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
if (crawledDocument.contentType == null) {
return false;
@ -141,27 +162,44 @@ public class DocumentProcessor {
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
throws DisqualifiedException, URISyntaxException {
var doc = Jsoup.parse(crawledDocument.documentBody);
Document doc = Jsoup.parse(crawledDocument.documentBody);
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}
if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
}
var dld = sentenceExtractor.extractSentences(doc.clone());
Document prunedDoc = doc.clone();
prunedDoc.body().filter(new DomPruningFilter(0.5));
var dld = sentenceExtractor.extractSentences(prunedDoc);
checkDocumentLanguage(dld);
var ret = new ProcessedDocumentDetails();
ret.description = getDescription(doc);
ret.length = getLength(doc);
ret.standard = getHtmlStandard(doc);
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
ret.features = featureExtractor.getFeatures(crawledDomain, doc);
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
var words = getWords(dld);
final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
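// Low-quality documents are no longer disqualified outright; they get a minimal keyword pass and skip feature extraction and summarization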
EdgePageWordSet words;
if (doSimpleProcessing) {
ret.features = Set.of(HtmlFeature.UNKNOWN);
words = keywordExtractor.extractKeywordsMinimal(dld);
ret.description = "";
}
else {
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
words = keywordExtractor.extractKeywords(dld);
ret.description = getDescription(doc);
}
var url = new EdgeUrl(crawledDocument.url);
addMetaWords(ret, url, crawledDomain, words);
@ -192,7 +230,6 @@ public class DocumentProcessor {
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
}
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@ -208,12 +245,11 @@ public class DocumentProcessor {
if (linkParser.shouldIndexLink(atag)) {
linkOpt.ifPresent(lp::accept);
}
else if (linkOpt.isPresent()) {
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
linkOpt.ifPresent(lp::acceptNonIndexable);
}
else {
linkOpt
.filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase()))
.ifPresent(lp::acceptNonIndexable);
}
}
for (var frame : doc.getElementsByTag("frame")) {
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
@ -233,26 +269,24 @@ public class DocumentProcessor {
linkTerms.add("links:"+fd.toString().toLowerCase());
linkTerms.add("links:"+fd.getDomain().toLowerCase());
}
words.append(IndexBlock.Meta, linkTerms);
Set<String> fileKeywords = new HashSet<>(100);
for (var link : lp.getNonIndexableUrls()) {
if (!Objects.equals(domain, link.domain)) {
if (!domain.hasSameTopDomain(link.domain)) {
continue;
}
synthesizeFilenameKeyword(fileKeywords, link);
}
words.append(IndexBlock.Artifacts, fileKeywords);
}
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
if (pFilename == null) return;
@ -289,10 +323,6 @@ public class DocumentProcessor {
return htmlStandard;
}
private EdgePageWordSet getWords(DocumentLanguageData dld) {
return keywordExtractor.extractKeywords(dld);
}
private String getDescription(Document doc) {
return summaryExtractor.extractSummary(doc);
}

View File

@ -1,23 +1,29 @@
package nu.marginalia.wmsa.edge.converting.processor;
import com.google.common.base.Strings;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.*;
import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;
public class DomainProcessor {
private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor();
private final DocumentProcessor documentProcessor;
private final Double minAvgDocumentQuality;
@Inject
public DomainProcessor(DocumentProcessor documentProcessor,
@Named("min-avg-document-quality") Double minAvgDocumentQuality
@ -39,10 +45,39 @@ public class DomainProcessor {
if (crawledDomain.doc != null) {
ret.documents = new ArrayList<>(crawledDomain.doc.size());
fixBadCanonicalTags(crawledDomain.doc);
DocumentDisqualifier disqualifier = new DocumentDisqualifier();
for (var doc : crawledDomain.doc) {
var processedDoc = documentProcessor.process(doc, crawledDomain);
if (processedDoc.url != null) {
ret.documents.add(processedDoc);
if (disqualifier.isQualified()) {
var processedDoc = documentProcessor.process(doc, crawledDomain);
if (processedDoc.url != null) {
ret.documents.add(processedDoc);
processedDoc.quality().ifPresent(disqualifier::offer);
}
else if ("LANGUAGE".equals(processedDoc.stateReason)) {
disqualifier.offer(-100);
}
}
else { // Short-circuit processing if quality is too low
var stub = documentProcessor.makeDisqualifiedStub(doc);
if (stub.url != null) {
ret.documents.add(stub);
}
}
}
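// Promote keywords that recur across the site's documents into each document's Site block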
Set<String> commonSiteWords = new HashSet<>(10);
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
if (!commonSiteWords.isEmpty()) {
for (var doc : ret.documents) {
if (doc.words != null) {
doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
}
}
}
}
@ -50,30 +85,60 @@ public class DomainProcessor {
ret.documents = Collections.emptyList();
}
double averageQuality = getAverageQuality(ret.documents);
if (averageQuality < minAvgDocumentQuality) {
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
}
ret.state = getState(crawledDomain.crawlerStatus);
return ret;
}
private double getAverageQuality(List<ProcessedDocument> documents) {
int n = 0;
double q = 0.;
for (var doc : documents) {
if (doc.quality().isPresent()) {
n++;
q += doc.quality().getAsDouble();
private void fixBadCanonicalTags(List<CrawledDocument> docs) {
Map<String, Set<String>> seenCanonicals = new HashMap<>();
Set<String> seenUrls = new HashSet<>();
// Sometimes sites set a blanket canonical link to their root page
// this removes such links from consideration
for (var document : docs) {
if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) {
seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
}
seenUrls.add(document.url);
}
for (var document : docs) {
if (!Strings.isNullOrEmpty(document.canonicalUrl)
&& !Objects.equals(document.canonicalUrl, document.url)
&& seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
if (seenUrls.add(document.canonicalUrl)) {
document.canonicalUrl = document.url;
}
else {
document.crawlerStatus = BAD_CANONICAL.name();
}
}
}
if (n > 0) {
return q / n;
for (var document : docs) {
if (!Strings.isNullOrEmpty(document.canonicalUrl)
&& !Objects.equals(document.canonicalUrl, document.url)
&& seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
document.canonicalUrl = document.url;
}
}
// Ignore canonical URL if it points to a different domain
// ... this confuses the hell out of the loader
for (var document : docs) {
if (Strings.isNullOrEmpty(document.canonicalUrl))
continue;
Optional<EdgeUrl> cUrl = EdgeUrl.parse(document.canonicalUrl);
Optional<EdgeUrl> dUrl = EdgeUrl.parse(document.url);
if (cUrl.isPresent() && dUrl.isPresent() && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) {
document.canonicalUrl = document.url;
}
}
return -5.;
}
private EdgeDomainIndexingState getState(String crawlerStatus) {
@ -84,4 +149,20 @@ public class DomainProcessor {
default -> EdgeDomainIndexingState.ERROR;
};
}
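// Short-circuits processing of low-quality domains: after 25 documents, full processing continues only while at least 10% of the documents seen meet the quality threshold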
class DocumentDisqualifier {
int count;
int goodCount;
void offer(double quality) {
count++;
if (quality > minAvgDocumentQuality) {
goodCount++;
}
}
boolean isQualified() {
return count < 25 || goodCount*10 >= count;
}
}
}

View File

@ -26,7 +26,6 @@ public class InstructionsCompiler {
}
if (domain.redirect != null) {
compileRedirect(ret, domain.domain, domain.redirect);
}
return ret;

View File

@ -0,0 +1,71 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import ca.rmen.porterstemmer.PorterStemmer;
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import java.util.*;
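// Extracts up to five stemmed keywords that occur in roughly a quarter or more of a site's documents, returning all their unstemmed variants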
public class CommonKeywordExtractor {
private final PorterStemmer ps = new PorterStemmer();
private static final int MIN_REQUIRED_DOCUMENTS = 25;
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
return Collections.emptyList();
final Map<String, String> wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10);
final Map<String, Integer> topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10);
final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
int qualifiedDocCount = 0;
for (var doc : ret.documents) {
if (doc.words == null)
continue;
qualifiedDocCount++;
for (var block : sourceBlocks) {
for (var word : doc.words.get(block).words) {
String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord);
// Count by negative values to sort by Map.Entry.comparingByValue() in reverse
topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum);
stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word);
}
}
}
int totalValue = 0;
for (int value : topStemmedKeywordCount.values()) {
totalValue += value;
}
if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION)
return Collections.emptyList();
List<String> topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT);
double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD;
topStemmedKeywordCount.entrySet().stream()
.filter(e -> e.getValue() < qualifyingValue)
.sorted(Map.Entry.comparingByValue())
.limit(MAX_SITE_KEYWORDS_TO_EXTRACT)
.forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey())));
return topWords;
}
}

View File

@ -0,0 +1,105 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;
import java.util.HashMap;
import java.util.Map;
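// Jsoup NodeFilter that removes DOM subtrees whose text is dominated by navigational noise (a, nav, footer, header elements) relative to the configured signal threshold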
public class DomPruningFilter implements NodeFilter {
private final double pruneThreshold;
private final Map<Node, NodeData> data = new HashMap<>();
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
public DomPruningFilter(double pruneThreshold) {
this.pruneThreshold = pruneThreshold;
}
@Override
public FilterResult head(Node node, int depth) {
return FilterResult.CONTINUE;
}
@Override
public FilterResult tail(Node node, int depth) {
final NodeData dataForNode;
if (node instanceof TextNode tn) {
dataForNode = new NodeData(depth, tn.text().length(), 0);
}
else if (isSignal(node)) {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.add(data.getOrDefault(childNode, dummy));
}
}
else {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
}
}
data.put(node, dataForNode);
if (dataForNode.depth <= 1)
return FilterResult.CONTINUE;
if (dataForNode.signalNodeSize == 0)
return FilterResult.REMOVE;
if (dataForNode.noiseNodeSize > 0
&& dataForNode.signalRate() < pruneThreshold
&& dataForNode.treeSize > 3)
return FilterResult.REMOVE;
return FilterResult.CONTINUE;
}
public boolean isSignal(Node node) {
if (node instanceof Element e) {
if ("a".equalsIgnoreCase(e.tagName()))
return false;
if ("nav".equalsIgnoreCase(e.tagName()))
return false;
if ("footer".equalsIgnoreCase(e.tagName()))
return false;
if ("header".equalsIgnoreCase(e.tagName()))
return false;
}
return true;
}
}
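// Accumulated signal/noise text lengths and subtree size for the node this entry belongs to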
class NodeData {
int signalNodeSize;
int noiseNodeSize;
int treeSize = 1;
int depth;
NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
this.depth = depth;
this.signalNodeSize = signalNodeSize;
this.noiseNodeSize = noiseNodeSize;
}
public void add(NodeData other) {
signalNodeSize += other.signalNodeSize;
noiseNodeSize += other.noiseNodeSize;
treeSize += other.treeSize;
}
public void addAsNoise(NodeData other) {
noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
treeSize += other.treeSize;
}
public double signalRate() {
return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
}
}

View File

@ -2,7 +2,11 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@ -35,14 +39,20 @@ public class FeatureExtractor {
"d31qbv1cthcecs.cloudfront.net",
"linkedin.com");
private AdblockSimulator adblockSimulator;
private final AdblockSimulator adblockSimulator;
private final RecipeDetector recipeDetector;
private final TextileCraftDetector textileCraftDetector;
private final WoodworkingDetector woodworkingDetector;
@Inject
public FeatureExtractor(AdblockSimulator adblockSimulator) {
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
this.adblockSimulator = adblockSimulator;
this.recipeDetector = recipeDetector;
this.textileCraftDetector = textileCraftDetector;
this.woodworkingDetector = woodworkingDetector;
}
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
final Set<HtmlFeature> features = new HashSet<>();
final Elements scriptTags = doc.getElementsByTag("script");
@ -81,9 +91,14 @@ public class FeatureExtractor {
}
}
if (!domain.cookies.isEmpty()) {
if (!domain.cookies.isEmpty())
features.add(HtmlFeature.COOKIES);
}
if (recipeDetector.testP(dld) > 0.5)
features.add(HtmlFeature.CATEGORY_FOOD);
// these should be mutually exclusive
else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
features.add(HtmlFeature.CATEGORY_CRAFTS);
return features;
}

View File

@ -12,6 +12,10 @@ public enum HtmlFeature {
CATEGORY_FOOD("category:food"),
ADVERTISEMENT("special:ads"),
CATEGORY_CRAFTS("category:crafts"),
UNKNOWN("special:uncategorized")
;
private final String keyword;

View File

@ -19,10 +19,14 @@ import java.util.regex.Pattern;
public class LinkParser {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final List<String> blockPrefixList = List.of(
"mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
private final List<String> blockSuffixList = List.of(
private final List<String> binarySuffixList = List.of(
".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
".com", ".bat", ".sh",
".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
@ -33,7 +37,7 @@ public class LinkParser {
return Optional.of(l)
.filter(this::shouldIndexLink)
.map(this::getUrl)
.map(link -> resolveUrl(relativeBaseUrl, link))
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -44,7 +48,7 @@ public class LinkParser {
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
return Optional.of(l)
.map(this::getUrl)
.map(link -> resolveUrl(relativeBaseUrl, link))
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -74,7 +78,7 @@ public class LinkParser {
@Contract(pure=true)
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
return Optional.of(str)
.map(link -> resolveUrl(baseUrl, link))
.map(link -> resolveRelativeUrl(baseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -85,7 +89,7 @@ public class LinkParser {
public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
return Optional.of(frame)
.map(l -> l.attr("src"))
.map(link -> resolveUrl(baseUrl, link))
.map(link -> resolveRelativeUrl(baseUrl, link))
.flatMap(this::createURI)
.map(URI::normalize)
.map(this::renormalize)
@ -95,10 +99,10 @@ public class LinkParser {
@SneakyThrows
private URI renormalize(URI uri) {
if (uri.getPath() == null) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment()));
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
}
if (uri.getPath().startsWith("/../")) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment()));
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
}
return uri;
}
@ -117,10 +121,10 @@ public class LinkParser {
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
@SneakyThrows
private String resolveUrl(EdgeUrl baseUrl, String s) {
private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {
// url looks like http://www.marginalia.nu/
if (isAbsoluteDomain(s)) {
if (doesUrlStringHaveProtocol(s)) {
return s;
}
@ -154,8 +158,15 @@ public class LinkParser {
return url.path.substring(0, lastSlash+1);
}
private boolean isAbsoluteDomain(String s) {
return s.matches("^[a-zA-Z]+:.*$");
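// Replaces the regex check: a URL string carries a protocol if it starts with a non-empty alphabetic run immediately followed by ':'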
private boolean doesUrlStringHaveProtocol(String s) {
int i = 0;
for (; i < s.length(); i++) {
if (!Character.isAlphabetic(s.charAt(i)))
break;
}
if (i == 0 || i == s.length())
return false;
return ':' == s.charAt(i);
}
public boolean shouldIndexLink(Element link) {
@ -168,26 +179,29 @@ public class LinkParser {
return !"noindex".equalsIgnoreCase(rel);
}
public boolean hasBinarySuffix(String href) {
return blockSuffixList.stream().anyMatch(href::endsWith);
}
private boolean isUrlRelevant(String href) {
if (null == href || "".equals(href)) {
return false;
}
if (href.length() > 128) {
return false;
}
href = href.toLowerCase();
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
return false;
}
if (hasBinarySuffix(href)) {
return false;
}
if (href.length() > 128) {
return false;
}
return true;
}
public boolean hasBinarySuffix(String str) {
return binarySuffixList.stream().anyMatch(str::endsWith);
}
@Nullable
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
var baseTags = parsed.getElementsByTag("base");
@ -196,7 +210,7 @@ public class LinkParser {
for (var tag : baseTags) {
String href = tag.attr("href");
if (!Strings.isNullOrEmpty(href)) {
return new EdgeUrl(resolveUrl(documentUrl, href));
return new EdgeUrl(resolveRelativeUrl(documentUrl, href));
}
}
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import java.util.HashMap;
@ -14,6 +15,7 @@ public class RecipeDetector {
private final Map<String, Double> termValues = new HashMap<>();
@Inject
public RecipeDetector() {
PorterStemmer ps = new PorterStemmer();

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import java.util.HashMap;
@ -14,6 +15,7 @@ public class TextileCraftDetector {
private final Map<String, Double> termValues = new HashMap<>();
@Inject
public TextileCraftDetector() {
PorterStemmer ps = new PorterStemmer();

View File

@ -1,6 +1,7 @@
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import java.util.HashMap;
@ -14,6 +15,7 @@ public class WoodworkingDetector {
private final Map<String, Double> termValues = new HashMap<>();
@Inject
public WoodworkingDetector() {
PorterStemmer ps = new PorterStemmer();

View File

@ -4,8 +4,8 @@ import com.github.luben.zstd.ZstdOutputStream;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@ -84,7 +84,7 @@ public class CrawlJobExtractorMain {
Driver driver = new Driver();
var outFile = Path.of(args[0]);
Gson gson = new GsonBuilder().create();
Gson gson = GsonFactory.get();
String[] targetDomains = Arrays.stream(args).skip(1).toArray(String[]::new);
@ -103,7 +103,7 @@ public class CrawlJobExtractorMain {
}
public static void writeSpec(Path outFile, String domain, List<String> urls) throws IOException {
Gson gson = new GsonBuilder().create();
Gson gson = GsonFactory.get();
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
var job = new CrawlingSpecification();

View File

@ -4,15 +4,15 @@ import com.github.luben.zstd.ZstdOutputStream;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.util.ranking.RankingDomainFetcher;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
import nu.marginalia.util.ranking.BetterReversePageRank;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.mariadb.jdbc.Driver;
import java.io.BufferedOutputStream;
@ -23,7 +23,7 @@ import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.*;
import java.util.ArrayList;
public class CrawlJobExtractorPageRankMain {
@ -72,7 +72,7 @@ public class CrawlJobExtractorPageRankMain {
Driver driver = new Driver();
var outFile = Path.of(args[0]);
Gson gson = new GsonBuilder().create();
Gson gson = GsonFactory.get();
var ds = new DatabaseModule().provideConnection();
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
@ -13,9 +13,13 @@ import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
public class CrawledDomainReader {
private final Gson gson = new GsonBuilder().create();
private final Gson gson = GsonFactory.get();
private final ForkJoinPool pool = new ForkJoinPool(4);
public CrawledDomainReader() {
}
@ -43,7 +47,12 @@ public class CrawledDomainReader {
if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
domain = gson.fromJson(nextLine, CrawledDomain.class);
} else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
docs.add(gson.fromJson(nextLine, CrawledDocument.class));
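// Offload document deserialization to the worker pool; the shared docs list is guarded with synchronized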
pool.execute(() -> {
var doc = gson.fromJson(nextLine, CrawledDocument.class);
synchronized (docs) {
docs.add(doc);
}
});
}
} else if (line.charAt(0) == '{') {
domain = gson.fromJson(line, CrawledDomain.class);
@ -52,6 +61,8 @@ public class CrawledDomainReader {
}
}
pool.awaitQuiescence(10, TimeUnit.SECONDS);
if (domain == null) {
return null;
}

View File

@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
import com.github.luben.zstd.ZstdOutputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -16,7 +16,7 @@ import java.nio.file.Path;
public class CrawledDomainWriter implements AutoCloseable {
private final Path outputDir;
private final Gson gson = new GsonBuilder().create();
private final Gson gson = GsonFactory.get();
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class);
private final Writer writer;
private final Path outputFile;

View File

@ -2,16 +2,19 @@ package nu.marginalia.wmsa.edge.crawling;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
import org.apache.logging.log4j.util.Strings;
import java.io.*;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.function.Consumer;
public class CrawlerSpecificationLoader {
private final static Gson gson = new GsonBuilder().create();
private final static Gson gson = GsonFactory.get();
public static void readInputSpec(Path inputSpec, Consumer<CrawlingSpecification> consumer) {
try (var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))))) {

View File

@ -1,7 +1,5 @@
package nu.marginalia.wmsa.edge.crawling.blocklist;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.inject.Singleton;
import com.opencsv.CSVReader;
import com.opencsv.exceptions.CsvValidationException;
@ -13,10 +11,7 @@ import org.slf4j.LoggerFactory;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.InetAddress;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import java.util.Set;
import java.util.TreeMap;

View File

@ -6,5 +6,7 @@ public enum CrawlerDocumentStatus {
BAD_CHARSET,
REDIRECT,
ROBOTS_TXT,
ERROR
ERROR,
BAD_CANONICAL,
Timeout
}

View File

@ -3,8 +3,9 @@ package nu.marginalia.wmsa.edge.data.dao;
import com.google.inject.ImplementedBy;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
@ -18,9 +19,9 @@ public interface EdgeDataStoreDao {
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist, int set);
List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId);
List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlId);
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);
EdgeDomain getDomain(EdgeId<EdgeDomain> id);

View File

@ -1,5 +1,6 @@
package nu.marginalia.wmsa.edge.data.dao;
import com.google.common.base.Strings;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.util.concurrent.UncheckedExecutionException;
@ -8,9 +9,10 @@ import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection;
import nu.marginalia.wmsa.edge.model.search.EdgePageScoreAdjustment;
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
@ -63,17 +65,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
}
private <T> String idList(List<EdgeId<T>> ids) {
private <T> String idList(EdgeIdCollection<EdgeUrl> ids) {
StringJoiner j = new StringJoiner(",", "(", ")");
for (var id : ids) {
j.add(Integer.toString(id.id()));
for (var id : ids.values()) {
j.add(Integer.toString(id));
}
return j.toString();
}
@SneakyThrows
@Override
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
public List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids) {
if (ids.isEmpty()) {
return Collections.emptyList();
}
@ -110,12 +112,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
rsp.getInt(11), // dataHash
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
Integer.MAX_VALUE, // rankingId
Double.MAX_VALUE, // termScore
0 // queryLength
Double.MAX_VALUE // termScore
);
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
result.add(val);
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
&& Strings.isNullOrEmpty(val.description)
&& val.url.path.length() > 1) {
continue;
}
result.add(val);
}
}
@ -267,7 +271,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
}
@Override
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlIds) {
public List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlIds) {
if (urlIds.isEmpty())
return Collections.emptyList();

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.data.dao.task;
import com.google.inject.ImplementedBy;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
@ImplementedBy(EdgeDomainBlacklistImpl.class)
public interface EdgeDomainBlacklist {

View File

@ -9,7 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;

View File

@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.dating;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
import java.util.LinkedList;

View File

@ -4,12 +4,14 @@ import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.reader.query.Query;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
@ -104,47 +106,49 @@ public class EdgeIndexBucket {
return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
}
public LongStream getQuery(IndexBlock block, LongPredicate filter, IndexSearchBudget budget, EdgeIndexSearchTerms searchTerms) {
public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (null == indexReader) {
logger.warn("Index reader not neady {}", block);
return LongStream.empty();
return new IndexQuery(Collections.emptyList());
}
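// Sort the include terms by ascending hit count so the most selective term is evaluated first.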
var orderedIncludes = searchTerms.includes
final int[] orderedIncludes = searchTerms.includes
.stream()
.sorted(Comparator.comparingLong(i -> indexReader.numHits(block, i)))
.sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
.distinct()
.mapToInt(Integer::intValue)
.toArray();
Query query;
IndexQueryFactory.IndexQueryBuilder query;
if (orderedIncludes.length == 1) {
query = indexReader.findUnderspecified(block, budget, filter, orderedIncludes[0]);
query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
if (query == null) {
return new IndexQuery(Collections.emptyList());
}
else {
query = indexReader.findWord(block, budget, filter, orderedIncludes[0]);
}
int i;
for (i = 1; (i < 3 && i < orderedIncludes.length) || i < orderedIncludes.length-1; i++) {
query = query.alsoCached(orderedIncludes[i]);
}
for (; i < orderedIncludes.length; i++) {
query.filter(filter);
for (int i = 1; i < orderedIncludes.length; i++) {
query = query.also(orderedIncludes[i]);
}
for (int term : searchTerms.excludes) {
query = query.not(term);
}
return query.stream();
for (int term : orderedIncludes) {
query.prioritize(term);
}
return query.build();
}
public IndexBlock getTermScore(int termId, long urlId) {
return indexReader.getBlockForResult(termId, urlId);
public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
return indexReader.getBlockForResult(cachePool, termId, urlId);
}
public boolean isTermInBucket(IndexBlock block, int termId, long urlId) {
return indexReader.isTermInBucket(block, termId, urlId);
public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
return indexReader.isTermInBucket(cachePool, block, termId, urlId);
}
}

View File

@ -1,76 +1,31 @@
package nu.marginalia.wmsa.edge.index;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.function.LongPredicate;
import java.util.stream.LongStream;
import static spark.Spark.get;
import static spark.Spark.halt;
public class EdgeIndexService extends Service {
private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
private final Logger logger = LoggerFactory.getLogger(getClass());
@NotNull
private final Initialization init;
private final SearchIndexes indexes;
private final KeywordLexicon keywordLexicon;
private final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
private static final Histogram wmsa_edge_index_query_time
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
private static final Counter wmsa_edge_index_query_count
= Counter.build().name("wmsa_edge_index_query_count").help("-").register();
private static final Histogram wmsa_edge_index_put_words_time
= Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
public static final int DYNAMIC_BUCKET_LENGTH = 7;
@ -81,71 +36,34 @@ public class EdgeIndexService extends Service {
Initialization init,
MetricsServer metricsServer,
SearchIndexes indexes,
IndexServicesFactory servicesFactory) {
EdgeIndexOpsService opsService,
EdgeIndexLexiconService lexiconService,
EdgeIndexQueryService indexQueryService)
{
super(ip, port, init, metricsServer);
final Gson gson = GsonFactory.get();
this.init = init;
this.indexes = indexes;
this.keywordLexicon = servicesFactory.getKeywordLexicon();
Spark.post("/words/", this::putWords);
Spark.post("/search/", this::search, gson::toJson);
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
Spark.post("/words/", lexiconService::putWords);
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
Spark.post("/search/", indexQueryService::search, gson::toJson);
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
Spark.post("/ops/repartition", this::repartitionEndpoint);
Spark.post("/ops/preconvert", this::preconvertEndpoint);
Spark.post("/ops/reindex/:id", this::reindexEndpoint);
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
Spark.post("/ops/repartition", opsService::repartitionEndpoint);
Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);
get("/is-blocked", this::isBlocked, gson::toJson);
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
}
private Object getWordId(Request request, Response response) {
final String word = request.splat()[0];
var dr = indexes.getDictionaryReader();
if (null == dr) {
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
return "";
}
final int wordId = dr.get(word);
if (DictionaryHashMap.NO_VALUE == wordId) {
response.status(404);
return "";
}
return wordId;
}
private Object repartitionEndpoint(Request request, Response response) {
if (!indexes.repartition()) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
private Object preconvertEndpoint(Request request, Response response) {
if (!indexes.preconvert()) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
private Object reindexEndpoint(Request request, Response response) {
int id = Integer.parseInt(request.params("id"));
if (!indexes.reindex(id)) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
private Object isBlocked(Request request, Response response) {
return indexes.isBusy() || !initialized;
}
@ -162,352 +80,7 @@ public class EdgeIndexService extends Service {
indexes.initialize(init);
}
private Object putWords(Request request, Response response) {
var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
synchronized (this) {
putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
putWordsRequest.wordSet, putWordsRequest.getIndex());
}
response.status(HttpStatus.SC_ACCEPTED);
return "";
}
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWordSet wordSet, int idx
) {
wmsa_edge_index_put_words_time.time(() -> {
for (EdgePageWords words : wordSet.values()) {
putWords(domainId, urlId, words, idx);
}
});
}
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWords words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
indexWriter.put(header, entry);
};
}
private long[] getOrInsertWordIds(List<String> words) {
return words.stream()
.filter(w -> w.getBytes().length < Byte.MAX_VALUE)
.mapToLong(keywordLexicon::getOrInsert)
.toArray();
}
private Object searchDomain(Request request, Response response) {
if (indexes.getDictionaryReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}
String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
final int wordId = keywordLexicon.getReadOnly(specsSet.keyword);
List<EdgeId<EdgeUrl>> urlIds = indexes
.getBucket(specsSet.bucket)
.findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
.mapToObj(lv -> new EdgeId<EdgeUrl>((int)(lv & 0xFFFF_FFFFL)))
.toList();
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}
private Object search(Request request, Response response) {
if (indexes.getDictionaryReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
long start = System.currentTimeMillis();
try {
if (specsSet.isStagger()) {
return new EdgeSearchResultSet(searchStaggered(specsSet));
}
else {
return new EdgeSearchResultSet(searchStraight(specsSet));
}
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
finally {
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
wmsa_edge_index_query_count.inc();
}
}
private Map<IndexBlock, List<EdgeSearchResults>> searchStaggered(EdgeSearchSpecification specsSet) {
int count = 0;
final Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
final TIntHashSet seenResults = new TIntHashSet();
final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] {
new DomainResultCountFilter(specsSet.limitByDomain),
new DomainResultCountFilter(specsSet.limitByDomain)
};
final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket);
for (int i = 0; i < specsSet.buckets.size(); i+=2) {
for (var sq : specsSet.subqueries) {
for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) {
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
if (searchTerms.isEmpty())
continue;
var result = performSearch(searchTerms.get(),
budget,
seenResults,
domainCountFilter[j],
sq,
List.of(specsSet.buckets.get(i+j)),
specsSet,
Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count)
);
if (logger.isDebugEnabled()) {
logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum());
}
int sz = result.size();
count += sz;
limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz);
if (sz > 0) {
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
}
}
}
}
return results;
}
@NotNull
private Map<IndexBlock, List<EdgeSearchResults>> searchStraight(EdgeSearchSpecification specsSet) {
Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
int count = 0;
TIntHashSet seenResults = new TIntHashSet();
final DomainResultCountFilter domainCountFilter = new DomainResultCountFilter(specsSet.limitByDomain);
IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
for (var sq : specsSet.subqueries) {
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
if (searchTerms.isEmpty())
continue;
var result = performSearch(searchTerms.get(),
budget, seenResults, domainCountFilter,
sq, specsSet.buckets, specsSet,
specsSet.limitTotal - count);
if (logger.isDebugEnabled()) {
logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size());
}
count += result.size();
if (result.size() > 0) {
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
}
}
return results;
}
private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms,
IndexSearchBudget budget,
TIntHashSet seenResults,
DomainResultCountFilter domainCountFilter,
EdgeSearchSubquery sq,
List<Integer> specBuckets,
EdgeSearchSpecification specs,
int limit)
{
if (limit <= 0) {
return new EdgeSearchResults();
}
final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);
for (int i : specBuckets) {
int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit)
break;
List<EdgeSearchResultItem> resultsForBucket = new ArrayList<>(specs.limitByBucket);
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
.filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri))
.limit(specs.limitTotal * 3L)
.distinct()
.limit(Math.min(specs.limitByBucket
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
.forEach(resultsForBucket::add);
for (var result : resultsForBucket) {
seenResults.add(result.url.id());
}
for (var result : resultsForBucket) {
for (var searchTerm : sq.searchTermsInclude) {
result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId()));
}
}
domainCountFilter.addAll(i, resultsForBucket);
if (!resultsForBucket.isEmpty()) {
results.put(i, resultsForBucket);
}
}
return new EdgeSearchResults(results);
}
private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) {
final int termId = indexes.getDictionaryReader().get(term);
var bucket = indexes.getBucket(bucketId);
return new EdgeSearchResultKeywordScore(term,
bucket.getTermScore(termId, urlId),
bucket.isTermInBucket(IndexBlock.Title, termId, urlId),
bucket.isTermInBucket(IndexBlock.Link, termId, urlId)
);
}
public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
int queryDepth, int minHitCount, int maxResults) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return LongStream.empty();
}
return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
}
private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return LongStream.empty();
}
return indexes.getBucket(bucket).getQuery(block, filter, budget, searchTerms);
}
static class DomainResultCountFilter {
final TLongIntMap resultsByDomain = new TLongIntHashMap(200, 0.75f, -1, 0);
final int limitByDomain;
DomainResultCountFilter(int limitByDomain) {
this.limitByDomain = limitByDomain;
}
public boolean filterRawValue(int bucket, long value) {
var domain = new EdgeId<EdgeDomain>((int)(value >>> 32));
if (domain.id() == Integer.MAX_VALUE) {
return true;
}
return resultsByDomain.get(getKey(bucket, domain)) <= limitByDomain;
}
long getKey(int bucket, EdgeId<EdgeDomain> id) {
return ((long)bucket) << 32 | id.id();
}
public boolean test(int bucket, EdgeSearchResultItem item) {
if (item.domain.id() == Integer.MAX_VALUE) {
return true;
}
return resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
}
int getCount(int bucket, EdgeSearchResultItem item) {
return resultsByDomain.get(getKey(bucket, item.domain));
}
public void addAll(int bucket, List<EdgeSearchResultItem> items) {
items.forEach(item -> {
resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1);
});
}
public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) {
if (item.domain.id() == Integer.MAX_VALUE) {
return true;
}
return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
}
}
private Optional<EdgeIndexSearchTerms> getSearchTerms(EdgeSearchSubquery request) {
final List<Integer> excludes = new ArrayList<>();
final List<Integer> includes = new ArrayList<>();
for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
return Optional.empty();
}
includes.add(word.getAsInt());
}
for (var exclude : request.searchTermsExclude) {
lookUpWord(exclude).ifPresent(excludes::add);
}
if (includes.isEmpty()) {
return Optional.empty();
}
return Optional.of(new EdgeIndexSearchTerms(includes, excludes));
}
private OptionalInt lookUpWord(String s) {
int ret = indexes.getDictionaryReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);
}
}

View File

@ -1,67 +1,65 @@
package nu.marginalia.wmsa.edge.index.client;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Singleton;
import io.prometheus.client.Summary;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.schedulers.Schedulers;
import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import javax.annotation.CheckReturnValue;
import java.util.List;
import java.util.concurrent.TimeUnit;
@Singleton
public class EdgeIndexClient extends AbstractDynamicClient {
private final Gson gson = new GsonBuilder()
.create();
private final Logger logger = LoggerFactory.getLogger(getClass());
public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
public EdgeIndexClient() {
super(ServiceDescriptor.EDGE_INDEX);
setTimeout(30);
}
@CheckReturnValue
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
EdgePageWordSet wordSet, int writer
)
@Override
public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
DocumentKeywords wordSet, int writer
)
{
EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);
return this.post(ctx, "/words/", request);
var keywordBuilder =
IndexPutKeywordsReq.newBuilder()
.setDomain(domain.id())
.setUrl(url.id())
.setIndex(writer);
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
wordSetBuilder.setIndex(wordSet.block().ordinal());
wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
keywordBuilder.addWordSet(wordSetBuilder.build());
var req = keywordBuilder.build();
this.post(ctx, "/words/", req).blockingSubscribe();
}
@CheckReturnValue
public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) {
return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst();
}
@CheckReturnValue
public List<EdgeSearchResultSet> multiQuery(Context ctx, EdgeSearchSpecification... specs) {
return Observable.fromArray(specs)
.concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class)
.subscribeOn(Schedulers.io())
.timeout(1, TimeUnit.SECONDS)
.onErrorComplete())
.toList()
.blockingGet();
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
return wmsa_search_index_api_time.time(
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
);
}
@CheckReturnValue

View File

@ -0,0 +1,88 @@
package nu.marginalia.wmsa.edge.index.client;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
@Singleton
public class EdgeIndexLocalService implements EdgeIndexWriterClient {
private final KeywordLexicon lexicon;
private final SearchIndexJournalWriterImpl indexWriter;
private static final Logger logger = LoggerFactory.getLogger(EdgeIndexLocalService.class);
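// Sizes the keyword lexicon hash map (reduced when the "small-ram" system property is set),
// then opens the lexicon journal (dictionary.dat) and the index journal (index.dat) under the given path.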
@Inject
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
long hashMapSize = 1L << 31;
if (Boolean.getBoolean("small-ram")) {
hashMapSize = 1L << 27;
}
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
}
public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
DocumentKeywords wordSet, int writer) {
if (wordSet.keywords().length == 0)
return;
if (domain.id() <= 0 || url.id() <= 0) {
logger.warn("Bad ID: {}:{}", domain, url);
return;
}
for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());
indexWriter.put(header, entry);
}
}
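// Maps each word to its lexicon id, dropping words the lexicon rejects
// (getOrInsert returns NO_VALUE for rejected entries, e.g. over-long keywords).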
private long[] getOrInsertWordIds(List<String> words) {
long[] ids = new long[words.size()];
int putId = 0;
for (String word : words) {
long id = lexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
ids[putId++] = id;
}
}
if (putId != words.size()) {
ids = Arrays.copyOf(ids, putId);
}
return ids;
}
@Override
public void close() throws Exception {
indexWriter.close();
lexicon.close();
}
}

View File

@ -0,0 +1,13 @@
package nu.marginalia.wmsa.edge.index.client;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
public interface EdgeIndexWriterClient extends AutoCloseable {
void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
DocumentKeywords wordSets, int writer);
}

View File

@ -15,10 +15,11 @@ import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
private final KeywordLexicon dictionaryWriter;
private final KeywordLexicon lexicon;
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Disposable writerTask;
@ -30,12 +31,14 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
private long pos;
@SneakyThrows
public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) {
this.dictionaryWriter = dictionaryWriter;
public SearchIndexJournalWriterImpl(KeywordLexicon lexicon, File indexFile) {
this.lexicon = lexicon;
initializeIndexFile(indexFile);
byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);
new Thread(this::journalWriterThread, "Journal Writer").start();
writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
}
@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}
}
private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);
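// put() only enqueues the entry; the dedicated "Journal Writer" thread drains the queue and performs the actual file writes.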
@Override
@SneakyThrows
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
writeQueue.put(new WriteJob(header, entryData));
}
byteBuffer.clear();
@SneakyThrows
public void journalWriterThread() {
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
while (true) {
var job = writeQueue.take();
entryData.write(byteBuffer);
writeEntry(job.header, job.entryData);
}
}
private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
try {
byteBuffer.clear();
while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
writePositionMarker();
entryData.write(byteBuffer);
byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);
writePositionMarker();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
@ -90,17 +113,15 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
@Override
public void flushWords() {
dictionaryWriter.commitToDisk();
lexicon.commitToDisk();
}
private void writePositionMarker() throws IOException {
var lock = channel.lock(0, 16, false);
pos = channel.size();
raf.seek(0);
raf.writeLong(pos);
raf.writeLong(dictionaryWriter.size());
raf.writeLong(lexicon.size());
raf.seek(pos);
lock.release();
}
public synchronized void close() throws IOException {

View File

@ -2,15 +2,22 @@ package nu.marginalia.wmsa.edge.index.journal.model;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
public record SearchIndexJournalEntryHeader(int entrySize, long documentId, IndexBlock block) {
public static final int HEADER_SIZE_LONGS = 2;
public SearchIndexJournalEntryHeader( EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block) {
this(-1, (long) domainId.id() << 32 | urlId.id(), block);
this(-1, combineIds(domainId, urlId), block);
}
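// Packs the domain id into the upper 32 bits and the url id into the lower 32 bits of the combined document id.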
private static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
long did = domainId.id();
long uid = urlId.id();
return (did << 32L) | uid;
}
}

View File

@ -53,7 +53,7 @@ public class KeywordLexicon implements AutoCloseable {
@SneakyThrows
private int getOrInsert(byte[] bytes) {
if (bytes.length >= Byte.MAX_VALUE) {
logger.warn("getOrInsert({}), illegal length {}", bytes, bytes.length);
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
return DictionaryHashMap.NO_VALUE;
}

View File

@ -15,7 +15,7 @@ import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Consumer;
public class KeywordLexiconJournalFile {
public class KeywordLexiconJournalFile implements AutoCloseable {
private final RandomAccessFile journalFileRAF;
private final File journalFile;
private final Logger logger = LoggerFactory.getLogger(getClass());

View File

@ -9,4 +9,8 @@ import java.util.List;
public class EdgeIndexSearchTerms {
public List<Integer> includes = new ArrayList<>();
public List<Integer> excludes = new ArrayList<>();
public boolean isEmpty() {
return includes.isEmpty();
}
}

View File

@ -4,17 +4,17 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
@AllArgsConstructor @Getter
@ToString
public class EdgePutWordsRequest {
public final EdgeId<EdgeDomain> domainId;
public final EdgeId<EdgeUrl> urlId;
public final double quality;
public EdgeId<EdgeDomain> domainId;
public EdgeId<EdgeUrl> urlId;
public double quality;
public final EdgePageWordSet wordSet;
public EdgePageWordSet wordSet;
private int index = 0;
}

View File

@ -1,23 +1,36 @@
package nu.marginalia.wmsa.edge.index.model;
public enum IndexBlock {
TitleKeywords(0, 0),
Title(1, 1),
Link(2, 1.25),
Top(3, 2),
Middle(4, 3),
Low(5, 4),
Words(6, 6),
Meta(7, 7),
PositionWords(8, 4.5),
NamesWords(9, 5),
Artifacts(10, 10),
Topic(11, 0.5);
TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),
Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),
Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),
Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
Meta(IndexBlockType.PAGE_DATA, 6, 7),
Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),
Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
Words_2(IndexBlockType.PAGE_DATA,11, 3.5),
Words_4(IndexBlockType.PAGE_DATA,12, 4.0),
Words_8(IndexBlockType.PAGE_DATA,13, 4.5),
Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0),
Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
;
public final IndexBlockType type;
public final int id;
public final double sortOrder;
IndexBlock(int id, double sortOrder) {
IndexBlock(IndexBlockType type, int id, double sortOrder) {
this.type = type;
this.sortOrder = sortOrder;
this.id = id;
}
@ -31,3 +44,5 @@ public enum IndexBlock {
throw new IllegalArgumentException("Bad block id");
}
}

View File

@ -0,0 +1,7 @@
package nu.marginalia.wmsa.edge.index.model;
public enum IndexBlockType {
QUALITY_SIGNAL,
TF_IDF,
PAGE_DATA
}

View File

@ -49,8 +49,8 @@ public class IndexWordsTable implements AutoCloseable {
}
public long positionForWord(int wordId) {
long offset = reader.findEntry(header, wordId);
if (offset < 0) {
return -1L;
}

View File

@ -0,0 +1,43 @@
package nu.marginalia.wmsa.edge.index.reader;
import java.util.Arrays;
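/**
 * Tiny fixed-size lookup cache with round-robin overwrite: set() writes at a cursor
 * position that wraps around, and get() scans linearly, returning BAD_VALUE on a miss.
 * The hit/miss/full counters are simple instrumentation.
 *
 * Hypothetical usage sketch (lookUpExpensively is a placeholder, not part of this code):
 *
 *   MicroCache cache = new MicroCache(8);
 *   long val = cache.get(wordId);
 *   if (val == MicroCache.BAD_VALUE) {
 *       val = lookUpExpensively(wordId);
 *       cache.set(wordId, val);
 *   }
 */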
public class MicroCache {
private final int[] keys;
private final long[] data;
private int pos = 0;
public int hit;
public int miss;
public int full;
public static final long BAD_VALUE = Long.MIN_VALUE;
public MicroCache(int size) {
keys = new int[size];
data = new long[size];
Arrays.fill(data, BAD_VALUE);
}
public long get(int key) {
for (int i = 0; i < keys.length && data[i] != BAD_VALUE; i++) {
if (keys[i] == key) {
hit++;
return data[i];
}
}
miss++;
return BAD_VALUE;
}
public void set(int key, long val) {
keys[pos] = key;
data[pos] = val;
if (++pos >= keys.length) {
full++;
pos = 0;
}
}
}

View File

@ -9,6 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.util.btree.model.BTreeHeader;
import nu.marginalia.util.multimap.MultimapFileLong;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -22,6 +25,7 @@ public class SearchIndex implements AutoCloseable {
private final MultimapFileLong urls;
private final IndexWordsTable words;
public final String name;
private final RandomAccessFile wordsFile;
private final BTreeReader bTreeReader;
private final CachingBTreeReader cachingBTreeReader;
@ -36,6 +40,7 @@ public class SearchIndex implements AutoCloseable {
throws IOException {
logger = LoggerFactory.getLogger(name);
this.name = name;
wordsFile = new RandomAccessFile(inWords, "r");
logger.info("{} : Loading {}", name, inUrls);
@ -65,26 +70,37 @@ public class SearchIndex implements AutoCloseable {
}
public long numUrls(int wordId) {
public long numUrls(IndexQueryCachePool pool, int wordId) {
int length = words.wordLength(wordId);
if (length < 0) return 0;
if (length > 0) return length;
return rangeForWord(wordId).numEntries();
return rangeForWord(pool, wordId).numEntries();
}
public UrlIndexTree rangeForWord(int wordId) {
return new UrlIndexTree(words.positionForWord(wordId));
public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
IndexBTreeRange range = pool.getRange(words, wordId);
if (range == null) {
range = new IndexBTreeRange(words.positionForWord(wordId));
pool.cacheRange(words, wordId, range);
}
return range;
}
public class UrlIndexTree {
final long dataOffset;
public IndexBTreeRange rangeForWord(int wordId) {
return new IndexBTreeRange(words.positionForWord(wordId));
}
public class IndexBTreeRange {
public final long dataOffset;
private BTreeHeader header;
public UrlIndexTree(long dataOffset) {
public IndexBTreeRange(long dataOffset) {
this.dataOffset = dataOffset;
}
public LongStream stream() {
public LongStream stream(int bufferSize) {
if (dataOffset < 0) {
return LongStream.empty();
}
@ -94,7 +110,7 @@ public class SearchIndex implements AutoCloseable {
long urlOffset = header.dataOffsetLongs();
long endOffset = header.dataOffsetLongs() + header.numEntries();
int stepSize = Math.min(1024, header.numEntries());
int stepSize = Math.min(bufferSize, header.numEntries());
long[] buffer = new long[stepSize];
@ -107,6 +123,19 @@ public class SearchIndex implements AutoCloseable {
});
}
public EntrySource asEntrySource() {
return new AsEntrySource();
}
public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
return new AsExcludeQueryFilterStep(pool);
}
public LongStream stream() {
return stream(1024);
}
public boolean isPresent() {
return dataOffset >= 0;
}
@ -122,35 +151,95 @@ public class SearchIndex implements AutoCloseable {
}
}
public boolean hasUrl(long url) {
if (header != null) {
return bTreeReader.findEntry(header, url) >= 0;
public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
if (dataOffset < 0) return false;
return cachingBTreeReader.findEntry(cache, url) >= 0;
}
public boolean hasUrl(IndexQueryCachePool pool, long url) {
if (dataOffset < 0)
return false;
CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
return cachingBTreeReader.findEntry(cache, url) >= 0;
}
public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
if (dataOffset < 0)
return null;
if (header == null) {
header = cachingBTreeReader.getHeader(dataOffset);
}
else if (dataOffset < 0) return false;
else {
header = bTreeReader.getHeader(dataOffset);
return bTreeReader.findEntry(header, url) >= 0;
return cachingBTreeReader.prepareCache(header);
}
class AsEntrySource implements EntrySource {
long pos;
final long endOffset;
public SearchIndex getIndex() {
return SearchIndex.this;
};
public AsEntrySource() {
if (dataOffset <= 0) {
pos = -1;
endOffset = -1;
return;
}
if (header == null) {
header = bTreeReader.getHeader(dataOffset);
}
pos = header.dataOffsetLongs();
endOffset = header.dataOffsetLongs() + header.numEntries();
}
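// Copies up to n entries from the urls file into the buffer, advancing the read position;
// returns the number of entries read, or 0 when the range is exhausted.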
@Override
public int read(long[] buffer, int n) {
if (pos >= endOffset) {
return 0;
}
int rb = Math.min(n, (int)(endOffset - pos));
urls.read(buffer, rb, pos);
pos += rb;
return rb;
}
}
public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
if (header != null) {
return cachingBTreeReader.findEntry(header, cache, url) >= 0;
class AsExcludeQueryFilterStep implements QueryFilterStepIf {
private final CachingBTreeReader.BTreeCachedIndex cache;
public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
}
else if (dataOffset < 0) return false;
else {
header = bTreeReader.getHeader(dataOffset);
return cachingBTreeReader.findEntry(header, cache, url) >= 0;
public SearchIndex getIndex() {
return SearchIndex.this;
};
public double cost() {
return cache.getIndexedDataSize();
}
@Override
public boolean test(long value) {
return !hasUrl(cache, value);
}
public String describe() {
return "Exclude["+name+"]";
}
}
public CachingBTreeReader.Cache createIndexCache() {
return cachingBTreeReader.prepareCache();
}
}
@Override
public void close() throws Exception {
urls.close();

View File

@ -3,9 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader;
import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.reader.query.Query;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -19,18 +18,20 @@ import java.util.stream.Stream;
public class SearchIndexReader implements AutoCloseable {
private final EnumMap<IndexBlock, SearchIndex> indices;
private final EnumMap<IndexBlock, IndexQueryBuilder> queryBuilders;
private final EnumMap<IndexBlock, IndexQueryBuilder> underspecifiedQueryBuilders;
private final EnumMap<IndexBlock, IndexQueryFactory> queryBuilders;
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
IndexBlock.Top,
IndexBlock.Middle,
IndexBlock.Low,
IndexBlock.Words,
IndexBlock.NamesWords,
IndexBlock.Title,
IndexBlock.Tfidf_Top,
IndexBlock.Tfidf_Middle,
IndexBlock.Tfidf_Lower,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
IndexBlock.Words_16Plus,
};
@Inject
@ -38,30 +39,33 @@ public class SearchIndexReader implements AutoCloseable {
EnumMap<IndexBlock, SearchIndex> indices) {
this.indices = indices;
var lowIndex = indices.get(IndexBlock.Low);
var midIndex = indices.get(IndexBlock.Middle);
var topIndex = indices.get(IndexBlock.Top);
var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
var midIndex = indices.get(IndexBlock.Tfidf_Middle);
var topIndex = indices.get(IndexBlock.Tfidf_Top);
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var namesIndex = indices.get(IndexBlock.NamesWords);
var positionIndex = indices.get(IndexBlock.PositionWords);
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
var wordsIndex = indices.get(IndexBlock.Words);
var siteIndex = indices.get(IndexBlock.Site);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Topic);
var topicIndex = indices.get(IndexBlock.Subjects);
var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
var words4 = indices.get(IndexBlock.Words_4);
var words8 = indices.get(IndexBlock.Words_8);
var words16 = indices.get(IndexBlock.Words_16Plus);
var artifacts = indices.get(IndexBlock.Artifacts);
queryBuilders = new EnumMap<>(IndexBlock.class);
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
List<SearchIndex> priorityIndices = listOfNonNulls(titleIndex, linkIndex, siteIndex, topIndex, topicIndex);
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices, priorityIndices));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices, priorityIndices));
queryBuilders.put(IndexBlock.Words_2, new IndexQueryFactory(listOfNonNulls(metaIndex, words2), excludeIndices, priorityIndices));
queryBuilders.put(IndexBlock.Words_4, new IndexQueryFactory(listOfNonNulls(metaIndex, words4), excludeIndices, priorityIndices));
queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices, priorityIndices));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices, priorityIndices));
}
@SafeVarargs
@ -99,27 +103,13 @@ public class SearchIndexReader implements AutoCloseable {
.limit(maxResults);
}
public Query findUnderspecified(
IndexBlock block,
IndexSearchBudget budget,
LongPredicate filter,
int wordId) {
var builder = underspecifiedQueryBuilders.get(block);
if (null != builder) {
return builder.buildUnderspecified(budget, filter, wordId);
}
return findWord(block, budget, filter, wordId);
}
public Query findWord(IndexBlock block, IndexSearchBudget budget, LongPredicate filter, int wordId) {
public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
var builder = queryBuilders.get(block);
if (builder == null)
return Query.EMPTY;
return null;
return builder.build(budget, filter, wordId);
return builder.buildQuery(cachePool, wordId);
}
@Override
@ -130,20 +120,20 @@ public class SearchIndexReader implements AutoCloseable {
}
@SneakyThrows
public long numHits(IndexBlock block, int word) {
IndexQueryBuilder builder = queryBuilders.get(block);
public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
IndexQueryFactory builder = queryBuilders.get(block);
if (builder == null)
return 0L;
long hits = 0;
for (var index : builder.getIndicies()) {
hits += index.numUrls(word);
hits += index.numUrls(pool, word);
}
return hits;
}
public IndexBlock getBlockForResult(int searchTerm, long urlId) {
public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
for (var block : indicesBySearchOrder) {
var index = indices.get(block);
@ -151,21 +141,18 @@ public class SearchIndexReader implements AutoCloseable {
continue;
}
var range = index.rangeForWord(searchTerm);
if (range.hasUrl(urlId)) {
if (cachePool.isUrlPresent(index, searchTerm, urlId))
return block;
}
}
return IndexBlock.Words;
return IndexBlock.Words_16Plus;
}
public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {
public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
final var index = indices.get(block);
if (null == index) return false;
return index
.rangeForWord(searchTerm)
.hasUrl(urlId);
return cachePool.isUrlPresent(index, searchTerm, urlId);
}
}

View File

@ -105,7 +105,7 @@ public class SearchIndexes {
}
@Nullable
public KeywordLexiconReadOnlyView getDictionaryReader() {
public KeywordLexiconReadOnlyView getLexiconReader() {
return keywordLexiconReadOnlyView;
}
@ -146,6 +146,7 @@ public class SearchIndexes {
public EdgeIndexBucket getBucket(int bucketId) {
return buckets[bucketId];
}
public boolean isValidBucket(int bucketId) {
return bucketId >= 0 && bucketId < buckets.length;
}

View File

@ -1,151 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query;
import com.google.common.collect.Streams;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import java.util.Collection;
import java.util.List;
import java.util.Objects;
import java.util.function.LongPredicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.LongStream;
public class IndexQueryBuilder {
private final List<SearchIndex> requiredIndices;
private final SearchIndex excludeIndex;
public Collection<SearchIndex> getIndicies() {
return requiredIndices;
}
public IndexQueryBuilder(List<SearchIndex> requiredIndices, SearchIndex excludeIndex) {
this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList());
this.excludeIndex = excludeIndex;
}
public Query build(IndexSearchBudget budget,
LongPredicate filter,
int wordId) {
return new QueryForIndices(budget, filter, wordId);
}
// Special treatment for queries with few terms, prefer hits that appear in multiple buckets
public Query buildUnderspecified(IndexSearchBudget budget, LongPredicate filter, int wordId) {
if (requiredIndices.size() == 1) {
return build(budget, filter, wordId);
}
var ranges = requiredIndices.stream().map(idx -> idx.rangeForWord(wordId)).toArray(SearchIndex.UrlIndexTree[]::new);
var relevantIndices = IntStream.range(0, requiredIndices.size()).filter(i -> ranges[i].isPresent()).toArray();
if (relevantIndices.length == 0) {
return new QueryForIndices(budget, LongStream::empty);
}
else if (relevantIndices.length == 1 || relevantIndices[0] != 0) {
return build(budget, filter, wordId);
}
var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);
LongStream priorityStream = underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[0], wordId);
for (int i = 1; i < relevantIndices.length; i++) {
priorityStream = Streams.concat(priorityStream, underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[i], wordId));
}
LongStream stream = LongStream.concat(priorityStream, fstRange.stream().takeWhile(budget::take)).filter(filter);
return new QueryForIndices(budget, () -> stream);
}
private LongStream underspecifiedPairStream(IndexSearchBudget budget, int limit, int firstIdx, int otherIdx, int wordId) {
SearchIndex firstTmp = requiredIndices.get(firstIdx),
secondTmp = requiredIndices.get(otherIdx);
final SearchIndex fst;
final SearchIndex snd;
if (firstTmp.numUrls(wordId) > secondTmp.numUrls(wordId)) {
fst = secondTmp;
snd = firstTmp;
}
else {
fst = firstTmp;
snd = secondTmp;
}
var sndRange = snd.rangeForWord(wordId);
var cache = sndRange.createIndexCache();
return fst.rangeForWord(wordId).stream().takeWhile(budget::take).limit(limit).filter(data -> sndRange.hasUrl(cache, data));
}
private class QueryForIndices implements Query {
private final Supplier<LongStream> supp;
private final IndexSearchBudget budget;
private QueryForIndices(IndexSearchBudget budget, LongPredicate filter, int wordId) {
this.budget = budget;
supp = () ->
requiredIndices.stream().flatMapToLong(idx -> {
var range = idx.rangeForWord(wordId);
return range.stream().takeWhile(budget::take);
})
.filter(filter);
}
private QueryForIndices(IndexSearchBudget budget, Supplier<LongStream> supp) {
this.budget = budget;
this.supp = supp;
}
@Override
public Query also(int wordId) {
return new QueryForIndices(budget,
() -> requiredIndices.stream().flatMapToLong(idx -> alsoStream(idx, wordId)));
}
@Override
public Query alsoCached(int wordId) {
return new QueryForIndices(budget,
() -> requiredIndices.stream().flatMapToLong(idx -> alsoStreamCached(idx, wordId)));
}
@Override
public Query not(int wordId) {
// Happens when an index simply isn't present, won't find data anyway
// so it's safe to no-op the query
if (excludeIndex == null)
return new QueryForIndices(budget, LongStream::empty);
return new QueryForIndices(budget, () -> notStream(wordId));
}
private LongStream alsoStream(SearchIndex idx, int wordId) {
var range = idx.rangeForWord(wordId);
return stream().filter(range::hasUrl).takeWhile(budget::take);
}
private LongStream alsoStreamCached(SearchIndex idx, int wordId) {
var range = idx.rangeForWord(wordId);
var cache = range.createIndexCache();
return stream().filter(data -> range.hasUrl(cache, data)).takeWhile(budget::take);
}
private LongStream notStream(int wordId) {
var bodyRange = excludeIndex.rangeForWord(wordId);
var cache = bodyRange.createIndexCache();
return stream().filter(url -> !bodyRange.hasUrl(cache, url)).takeWhile(budget::take);
}
public LongStream stream() {
return supp.get();
}
}
}

View File

@ -1,16 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query;
public class IndexSearchBudget {
private long timeout;
public IndexSearchBudget(long limitTime) {
this.timeout = System.currentTimeMillis() + limitTime;
}
// Used for short-circuiting Stream objects via takeWhile; the argument value is ignored
public boolean take(long unused) {
return System.currentTimeMillis() < timeout;
}
}

View File

@ -1,26 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query;
import java.util.stream.LongStream;
public interface Query {
Query EMPTY = new Query() {
@Override
public Query also(int wordId) { return this; }
@Override
public Query alsoCached(int wordId) { return this; }
@Override
public Query not(int wordId) { return this; }
@Override
public LongStream stream() { return LongStream.empty(); }
};
Query also(int wordId);
Query alsoCached(int wordId);
Query not(int wordId);
LongStream stream();
}

View File

@ -0,0 +1,107 @@
package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.protobuf.InvalidProtocolBufferException;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.apache.http.HttpStatus;
import spark.Request;
import spark.Response;
import java.util.Arrays;
import java.util.List;
@Singleton
public class EdgeIndexLexiconService {
private final SearchIndexes indexes;
private final KeywordLexicon keywordLexicon;
@Inject
public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) {
this.indexes = indexes;
this.keywordLexicon = servicesFactory.getKeywordLexicon();
}
public Object getWordId(Request request, Response response) {
final String word = request.splat()[0];
var lr = indexes.getLexiconReader();
if (null == lr) {
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
return "";
}
final int wordId = lr.get(word);
if (DictionaryHashMap.NO_VALUE == wordId) {
response.status(404);
return "";
}
return wordId;
}
public Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
int idx = req.getIndex();
for (int ws = 0; ws < req.getWordSetCount(); ws++) {
putWords(domainId, urlId, req.getWordSet(ws), idx);
}
response.status(HttpStatus.SC_ACCEPTED);
return "";
}
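// Writes the keyword set to the index journal, one entry per chunk of at most SearchIndexJournalEntry.MAX_LENGTH words.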
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
IndexBlock block = IndexBlock.values()[words.getIndex()];
for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
indexWriter.put(header, entry);
}
}
private long[] getOrInsertWordIds(List<String> words) {
long[] ids = new long[words.size()];
int putIdx = 0;
for (String word : words) {
long id = keywordLexicon.getOrInsert(word);
if (id != DictionaryHashMap.NO_VALUE) {
ids[putIdx++] = id;
}
}
if (putIdx != words.size()) {
ids = Arrays.copyOf(ids, putIdx);
}
return ids;
}
}

View File

@ -0,0 +1,44 @@
package nu.marginalia.wmsa.edge.index.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import spark.Request;
import spark.Response;
import spark.Spark;
@Singleton
public class EdgeIndexOpsService {
private final SearchIndexes indexes;
@Inject
public EdgeIndexOpsService(SearchIndexes indexes) {
this.indexes = indexes;
}
public Object repartitionEndpoint(Request request, Response response) {
if (!indexes.repartition()) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
public Object preconvertEndpoint(Request request, Response response) {
if (!indexes.preconvert()) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
public Object reindexEndpoint(Request request, Response response) {
int id = Integer.parseInt(request.params("id"));
if (!indexes.reindex(id)) {
Spark.halt(503, "Operations busy");
}
return "OK";
}
}

View File

@ -0,0 +1,325 @@
package nu.marginalia.wmsa.edge.index.svc;
import com.google.gson.Gson;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import org.apache.http.HttpStatus;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.HaltException;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.util.*;
import java.util.function.LongPredicate;
import static java.util.Comparator.comparing;
import static spark.Spark.halt;
@Singleton
public class EdgeIndexQueryService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
private static final int QUERY_FETCH_SIZE = 8192;
private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register();
private final Gson gson = GsonFactory.get();
private final SearchIndexes indexes;
@Inject
public EdgeIndexQueryService(SearchIndexes indexes) {
this.indexes = indexes;
}
public Object searchDomain(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}
String json = request.body();
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
try {
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
}
public Object search(Request request, Response response) {
if (indexes.getLexiconReader() == null) {
logger.warn("Dictionary reader not yet initialized");
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
}
String json = request.body();
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
try {
return wmsa_edge_index_query_time.time(() -> query(specsSet));
}
catch (HaltException ex) {
logger.warn("Halt", ex);
throw ex;
}
catch (Exception ex) {
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
logger.info("Error", ex);
Spark.halt(500, "Error");
return null;
}
}
public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) {
List<EdgeSearchResultItem> results = new SearchQuery(specsSet).execute();
return new EdgeSearchResultSet(results);
}
public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
final OptionalInt wordId = lookUpWord(specsSet.keyword);
EdgeIdList<EdgeUrl> urlIds;
if (wordId.isEmpty()) {
urlIds = new EdgeIdList<>();
} else {
urlIds = indexes
.getBucket(specsSet.bucket)
.findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
.mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL))
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
}
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
}
private class SearchQuery {
private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
private final EdgeSearchSpecification specsSet;
private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
public SearchQuery(EdgeSearchSpecification specsSet) {
this.specsSet = specsSet;
}
private List<EdgeSearchResultItem> execute() {
final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
for (var sq : specsSet.subqueries) {
results.addAll(performSearch(sq));
}
for (var result : results) {
addResultScores(result);
}
if (!budget.hasTimeLeft()) {
wmsa_edge_index_query_timeouts.inc();
}
var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
if (WmsaHome.isDebug()) {
cachePool.printSummary(logger);
}
cachePool.clear();
return results.stream()
.sorted(
comparing(EdgeSearchResultItem::getScore)
.thenComparing(EdgeSearchResultItem::getRanking)
.thenComparing(EdgeSearchResultItem::getUrlIdInt)
)
.filter(domainCountFilter::test)
.limit(specsSet.getLimitTotal()).toList();
}
private List<EdgeSearchResultItem> performSearch(EdgeSearchSubquery sq)
{
final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
if (searchTerms.isEmpty())
return Collections.emptyList();
for (int indexBucket : specsSet.buckets) {
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
if (!budget.hasTimeLeft()) {
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
continue;
}
if (QUERY_FETCH_SIZE <= results.size())
break;
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
long[] buf = new long[8192];
while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) {
int cnt = query.getMoreResults(buf, budget);
for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) {
final long id = buf[i];
if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
continue;
}
results.add(new EdgeSearchResultItem(indexBucket, id));
}
}
}
return results;
}
private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return new IndexQuery(Collections.emptyList());
}
return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
}
private void addResultScores(EdgeSearchResultItem searchResult) {
final var reader = Objects.requireNonNull(indexes.getLexiconReader());
List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
// Memoize calls to getTermData, as they're somewhat expensive and highly redundant
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
double bestScore = 0;
for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
double setScore = 0;
int setSize = 0;
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
final int termId = reader.get(searchTerm);
ResultTermData data = termMetadata.computeIfAbsent(
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
var score = data.asScore(searchTermListIdx, searchTerm);
searchResult.scores.add(score);
setScore += score.value();
setSize++;
}
bestScore = Math.min(bestScore, setScore/setSize);
}
searchResult.setScore(bestScore);
}
private ResultTermData getTermData(ResultTerm resultTerm) {
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
final int termId = resultTerm.termId;
final long combinedUrlId = resultTerm.combinedUrlId;
return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
);
}
record ResultTerm (int bucket, int termId, long combinedUrlId) {}
record ResultTermData (IndexBlock index,
boolean title,
boolean link,
boolean site,
boolean subject,
boolean name,
boolean high,
boolean mid,
boolean low
) {
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
}
}
}
private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
final List<Integer> excludes = new ArrayList<>();
final List<Integer> includes = new ArrayList<>();
for (var include : request.searchTermsInclude) {
var word = lookUpWord(include);
if (word.isEmpty()) {
logger.debug("Unknown search term: " + include);
return new EdgeIndexSearchTerms(includes, excludes);
}
includes.add(word.getAsInt());
}
for (var exclude : request.searchTermsExclude) {
lookUpWord(exclude).ifPresent(excludes::add);
}
return new EdgeIndexSearchTerms(includes, excludes);
}
private OptionalInt lookUpWord(String s) {
int ret = indexes.getLexiconReader().get(s);
if (ret == DictionaryHashMap.NO_VALUE) {
return OptionalInt.empty();
}
return OptionalInt.of(ret);
}
}

View File

@ -0,0 +1,97 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import java.util.ArrayList;
import java.util.List;
import static java.lang.Math.min;
public class IndexQuery {
private final List<EntrySource> sources;
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
private final List<QueryFilterStepIf> priorityFilter = new ArrayList<>(10);
public IndexQuery(List<EntrySource> sources) {
this.sources = sources;
}
public void addInclusionFilter(QueryFilterStepIf filter) {
inclusionFilter.add(filter);
}
public void addPriorityFilter(QueryFilterStepIf filter) {
priorityFilter.add(filter);
}
private int si = 0;
public boolean hasMore() {
return si < sources.size();
}
public int getMoreResults(long[] dest, IndexSearchBudget budget) {
final EntrySource source = sources.get(si);
int bufferUtilizedLength = source.read(dest, dest.length);
if (bufferUtilizedLength <= 0) {
si++;
return 0;
}
for (var filter : inclusionFilter) {
bufferUtilizedLength = filter.retainDestructive(dest, bufferUtilizedLength);
if (bufferUtilizedLength <= 0) {
si++;
return 0;
}
}
if (budget.hasTimeLeft()) {
prioritizeBuffer(dest, source, bufferUtilizedLength, budget);
}
int count = min(bufferUtilizedLength, dest.length);
System.arraycopy(dest, 0, dest, 0, count);
return count;
}
private void prioritizeBuffer(long[] dest, EntrySource source, int remainingBufferSize, IndexSearchBudget budget) {
int prioStart = 0;
for (var filter : priorityFilter) {
if (!budget.hasTimeLeft())
break;
if (filter.getIndex() == source.getIndex())
continue;
prioStart += filter.retainReorder(dest, prioStart, remainingBufferSize);
if (prioStart >= remainingBufferSize) {
break;
}
}
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("Sources:\n");
for (var source: sources) {
sb.append("\t").append(source.getIndex().name).append("\n");
}
sb.append("Includes:\n");
for (var include : inclusionFilter) {
sb.append("\t").append(include.describe()).append("\n");
}
return sb.toString();
}
}
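
Reading from an IndexQuery is a simple drain loop: keep calling getMoreResults() while hasMore() is true and the budget allows, then consume whatever landed in the buffer. A minimal sketch, mirroring the loop in EdgeIndexQueryService.performSearch; the buffer size, class name and helper name are illustrative.

import java.util.ArrayList;
import java.util.List;

class IndexQueryDrainSketch {
    // Illustrative helper: drains a query into a list of raw (ranking, url) ids under a time budget.
    static List<Long> drain(IndexQuery query, IndexSearchBudget budget) {
        List<Long> out = new ArrayList<>();
        long[] buffer = new long[8192];   // working buffer; 8192 matches the query service's choice
        while (query.hasMore() && budget.hasTimeLeft()) {
            int n = query.getMoreResults(buffer, budget);
            for (int i = 0; i < n; i++) {
                out.add(buffer[i]);
            }
        }
        return out;
    }
}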

View File

@ -0,0 +1,60 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import org.slf4j.Logger;
import java.util.HashMap;
import java.util.Map;
public class IndexQueryCachePool {
private final Map<PoolKey, CachingBTreeReader.BTreeCachedIndex> indexCaches = new HashMap<>();
private final Map<RangeKey, SearchIndex.IndexBTreeRange> rangeCache = new HashMap<>();
private final Map<PoolKey, Integer> savedCounts = new HashMap<>();
public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) {
var key = new PoolKey(index, range.dataOffset);
var entry = indexCaches.get(key);
if (entry == null) {
entry = range.createIndexCache();
indexCaches.put(key, entry);
}
else {
savedCounts.merge(key, 1, Integer::sum);
}
return entry;
}
public boolean isUrlPresent(SearchIndex index, int term, long url) {
var range = index.rangeForWord(this, term);
return range.isPresent() && range.hasUrl(this, url);
}
public void printSummary(Logger logger) {
long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum();
long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();
long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count();
logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
}
public void clear() {
indexCaches.clear();
}
public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) {
return rangeCache.get(new RangeKey(words, wordId));
}
public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) {
rangeCache.put(new RangeKey(words, wordId), range);
}
private record RangeKey(IndexWordsTable table, int wordId) {}
private record PoolKey(SearchIndex index, long dataOffset) {}
}

View File

@ -0,0 +1,103 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
import java.util.*;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;
public class IndexQueryFactory {
private final List<SearchIndex> requiredIndices;
private final List<SearchIndex> excludeIndex;
private final List<SearchIndex> priortyIndices;
public Collection<SearchIndex> getIndicies() {
return requiredIndices;
}
public IndexQueryFactory(List<SearchIndex> requiredIndices, List<SearchIndex> excludeIndex, List<SearchIndex> priortyIndices) {
this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList());
this.excludeIndex = excludeIndex;
this.priortyIndices = priortyIndices;
}
public IndexQueryBuilder buildQuery(IndexQueryCachePool cachePool, int firstWordId) {
List<EntrySource> sources = new ArrayList<>(requiredIndices.size());
for (var ri : requiredIndices) {
var range = ri.rangeForWord(cachePool, firstWordId);
if (range.isPresent()) {
sources.add(range.asEntrySource());
}
}
return new IndexQueryBuilder(new IndexQuery(sources), cachePool);
}
public class IndexQueryBuilder {
private final IndexQuery query;
private final IndexQueryCachePool cachePool;
IndexQueryBuilder(IndexQuery query,
IndexQueryCachePool cachePool) {
this.query = query;
this.cachePool = cachePool;
}
public void filter(LongPredicate predicate) {
query.addInclusionFilter(new QueryFilterStepFromPredicate(predicate));
}
public IndexQueryBuilder also(int termId) {
List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());
for (var ri : requiredIndices) {
var range = ri.rangeForWord(cachePool, termId);
if (range.isPresent()) {
filters.add(new QueryFilterBTreeRange(ri, range, cachePool));
}
else {
filters.add(QueryFilterStepIf.noPass());
}
}
filters.sort(Comparator.naturalOrder());
query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));
return this;
}
public IndexQueryBuilder not(int termId) {
for (var ri : excludeIndex) {
var range = ri.rangeForWord(cachePool, termId);
if (range.isPresent()) {
query.addInclusionFilter(range.asExcludeFilterStep(cachePool));
}
}
return this;
}
public void prioritize(int termId) {
for (var idx : priortyIndices) {
var range = idx.rangeForWord(cachePool, termId);
if (range.isPresent()) {
query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool));
}
}
}
public IndexQuery build() {
return query;
}
}
}
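
Seen from the caller's side, the builder composes required, excluded and prioritized terms into a single IndexQuery. A sketch under assumed inputs (wordA..wordD and the predicate are hypothetical):

class IndexQueryFactorySketch {
    // Hypothetical composition: wordA is the anchor term, wordB required, wordC excluded, wordD prioritized.
    static IndexQuery compose(IndexQueryFactory factory, IndexQueryCachePool pool,
                              int wordA, int wordB, int wordC, int wordD) {
        var builder = factory.buildQuery(pool, wordA);   // entry sources come from the first term
        builder.filter(id -> (id & 1L) == 0);            // arbitrary LongPredicate stand-in
        builder.also(wordB).not(wordC);                  // require wordB, exclude wordC
        builder.prioritize(wordD);                       // reorder hits that also match wordD
        return builder.build();
    }
}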

View File

@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import java.util.stream.LongStream;
public interface IndexQueryIf {
IndexQueryIf also(int wordId);
IndexQueryIf alsoCached(int wordId);
IndexQueryIf not(int wordId);
LongStream stream();
}

View File

@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.query;
public class IndexSearchBudget {
private final long timeout;
public IndexSearchBudget(long limitTime) {
this.timeout = System.currentTimeMillis() + limitTime;
}
public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
}

View File

@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.index.svc.query;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
public class ResultDomainDeduplicator {
final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
final int limitByDomain;
public ResultDomainDeduplicator(int limitByDomain) {
this.limitByDomain = limitByDomain;
}
public boolean filterRawValue(long value) {
int rankingId = (int) (value >>> 32);
if (rankingId == Integer.MAX_VALUE) {
return true;
}
return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain;
}
long getKey(int rankingId) {
return rankingId;
}
public boolean test(long value) {
int ranking = (int) (value >>> 32);
if (ranking == Integer.MAX_VALUE) {
return true;
}
return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
}
public boolean test(EdgeSearchResultItem item) {
final int ranking = item.getRanking();
if (ranking == Integer.MAX_VALUE) {
return true;
}
        // For ResultItems, also take bucketId into account, since different buckets
        // may use different ranking algorithms
final long key = ranking*32L + item.bucketId;
return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
}
}

View File

@ -0,0 +1,9 @@
package nu.marginalia.wmsa.edge.index.svc.query.types;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
public interface EntrySource {
SearchIndex getIndex();
int read(long[] buffer, int n);
}

Some files were not shown because too many files have changed in this diff.