Merge pull request 'Merge changes from experimental-22-08 into master' (#109) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/109
Commit: afb0c78e4d
@@ -58,7 +58,7 @@ jmhJar {
}
dependencies {
implementation project(':third_party')

implementation project(':protocol')

implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
@@ -157,6 +157,7 @@ dependencies {

jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'

}

configurations {
@@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());

var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));

Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
}
@@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());

var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertNotEquals(List.of("Bird"), getTitlesFromSearchResults(html));
assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));

Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js"));
}
@@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
System.out.println(driver.getTitle());

var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));

Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js"));
}
@@ -240,7 +240,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {

Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));

assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
assertEquals(List.of("Frog", "Amphibian"), getTitlesFromSearchResults(html));
}

@Test
@@ -1,6 +1,10 @@
package nu.marginalia.util;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class DenseBitMap {
public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
@@ -15,6 +19,31 @@ public class DenseBitMap {
this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
}

public static DenseBitMap loadFromFile(Path file) throws IOException {
long size = Files.size(file);
var dbm = new DenseBitMap(size/8);

try (var bc = Files.newByteChannel(file)) {
while (dbm.buffer.position() < dbm.buffer.capacity()) {
bc.read(dbm.buffer);
}
}
dbm.buffer.clear();

return dbm;
}

public void writeToFile(Path file) throws IOException {

try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
while (buffer.position() < buffer.capacity()) {
bc.write(buffer);
}
}

buffer.clear();
}

public boolean get(long pos) {
return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
}
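For orientation, a minimal round-trip sketch of the new persistence methods. It assumes the DenseBitMap(long cardinality) constructor and the boolean set(long) method that NGramBloomFilter (added later in this commit) relies on; only get, loadFromFile and writeToFile appear in the hunk above.

    import nu.marginalia.util.DenseBitMap;
    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;

    class DenseBitMapRoundTrip {
        public static void main(String[] args) throws IOException {
            // Build a small bitmap, flip a couple of bits, persist it, read it back.
            var bits = new DenseBitMap(1024);
            bits.set(17);
            bits.set(513);

            Path tmp = Files.createTempFile("dense-bitmap", ".dat");
            bits.writeToFile(tmp);

            var reloaded = DenseBitMap.loadFromFile(tmp);
            System.out.println(reloaded.get(17) && reloaded.get(513)); // expected: true
        }
    }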
@@ -25,14 +25,16 @@ public class CachingBTreeReader {
return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
}

public Cache prepareCache() {
return new Cache();
public BTreeCachedIndex prepareCache(BTreeHeader header) {
return new BTreeCachedIndex(header);
}
/**
*
* @return file offset of entry matching keyRaw, negative if absent
*/
public long findEntry(BTreeHeader header, Cache cache, final long keyRaw) {
public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
BTreeHeader header = cache.header;

final int blockSize = ctx.BLOCK_SIZE_WORDS();

final long key = keyRaw & ctx.equalityMask();
@@ -46,7 +48,7 @@ public class CachingBTreeReader {
numEntries = header.numEntries();
}
else {
cache.load(header);
cache.load();

long dataLayerOffset = searchIndex(header, cache, key);
if (dataLayerOffset < 0) {
@@ -60,7 +62,7 @@ public class CachingBTreeReader {
return dataSearcher.binarySearch(key, searchStart, numEntries);
}

private long searchIndex(BTreeHeader header, Cache cache, long key) {
private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
final int blockSize = ctx.BLOCK_SIZE_WORDS();
long layerOffset = 0;

@@ -77,11 +79,22 @@ public class CachingBTreeReader {
return layerOffset;
}

public class Cache {
/** A cache for the BTree index data that will drastically reduce the number of disk reads
* for repeated queries against the same tree. The memory consumption is typically very low
* and the disk access pattern for reading the entire index relatively cheap.
*/
public class BTreeCachedIndex {
long[] indexData;
final BTreeHeader header;

public void load(BTreeHeader header) {
final int indexedDataSize;

public BTreeCachedIndex(BTreeHeader header) {
this.header = header;
indexedDataSize = header.numEntries();
}

public void load() {
if (indexData != null)
return;

@@ -107,5 +120,17 @@ public class CachingBTreeReader {
}
return low;
}

public long sizeBytes() {
return isLoaded() ? 8L*indexData.length : 0;
}

public int getIndexedDataSize() {
return indexedDataSize;
}

public boolean isLoaded() {
return indexData != null;
}
}
}
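A rough sketch of what the call-site change above implies: the header is bound to the cache once via prepareCache(header), and repeated lookups then pass only the cache. The reader, header and keys names below are illustrative, not taken from the commit.

    // Illustrative only: assumes a CachingBTreeReader 'reader' and a BTreeHeader 'header'
    // obtained the same way the old two-argument findEntry() callers did.
    long[] lookupAll(CachingBTreeReader reader, BTreeHeader header, long[] keys) {
        var cache = reader.prepareCache(header);   // the header now travels with the cache

        long[] offsets = new long[keys.length];
        for (int i = 0; i < keys.length; i++) {
            offsets[i] = reader.findEntry(cache, keys[i]); // negative if the key is absent
        }
        return offsets;
    }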
@@ -1,18 +1,18 @@
package nu.marginalia.util.dict;

import nu.marginalia.util.SeekDictionary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;

public class DictionaryData {

private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);

private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
private final ArrayList<DictionaryDataBank> banks = new ArrayList(100);

public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
@@ -20,12 +20,8 @@ public class DictionaryData {
banks.add(new DictionaryDataBank(0, bankSize));
}

public int size() {
return banks.end();
}

public int add(long key) {
var activeBank = banks.last();
var activeBank = banks.get(banks.size()-1);
int rb = activeBank.add(key);

if (rb == -1) {
@@ -42,10 +38,10 @@ public class DictionaryData {

public long getKey(int offset) {
return banks.bankForOffset(offset).getKey(offset);
return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
}
public boolean keyEquals(int offset, long otherKey) {
return banks.bankForOffset(offset).keyEquals(offset, otherKey);
return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
}

private static class DictionaryDataBank {
@@ -8,7 +8,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import org.jsoup.nodes.Document;

import java.io.FileNotFoundException;
@@ -30,7 +30,7 @@ public class DocumentDebugger {
Path tempDir;
public DocumentDebugger(LanguageModels lm) throws IOException {
se = new SentenceExtractor(lm);
var dict = new NGramDict(lm);
var dict = new TermFrequencyDict(lm);
ke = new KeywordExtractor();

kc = new KeywordCounter(dict, ke);
@@ -69,7 +69,7 @@ public class DocumentDebugger {
Set<String> reps = new HashSet<>();

// kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
// kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));

try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
@@ -19,7 +19,12 @@ public class WordPatterns {
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");

public static final Pattern singleWordAdditionalPattern =
Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");

public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();

public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
@@ -58,7 +63,7 @@ public class WordPatterns {
if (word.isBlank()) {
return false;
}
if (hasMoreThanTwo(word, '-', 2)) {
if (hasMoreThanTwo(word, '-', 4)) {
return false;
}
if (hasMoreThanTwo(word, '+', 2)) {
@@ -75,7 +80,7 @@ public class WordPatterns {
if (Character.isDigit(word.charAt(i))) {
numDigits++;
}
if (numDigits > 6)
if (numDigits > 16)
return false;
}
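To make the new single-word predicate concrete, a small self-contained demo of what singleWordAdditionalPattern accepts as a full match; the example strings are illustrative.

    import java.util.function.Predicate;
    import java.util.regex.Pattern;

    class SingleWordPatternDemo {
        public static void main(String[] args) {
            // Same expression as WordPatterns.singleWordAdditionalPattern above.
            Predicate<String> p = Pattern
                    .compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}")
                    .asMatchPredicate();

            System.out.println(p.test("x86_64"));        // true: short alphanumeric chunks joined by a separator
            System.out.println(p.test("1.18.24"));       // true: up to four separator groups are allowed
            System.out.println(p.test("a.b.c.d.e.f"));   // false: five separator groups
            System.out.println(p.test("averyverylongsingletoken")); // false: leading chunk longer than 15 chars
        }
    }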
@@ -6,8 +6,9 @@ import java.nio.file.Path;

@AllArgsConstructor
public class LanguageModels {
public final Path ngramDictionary;
public final Path ngramFrequency;
public final Path ngramBloomFilter;
public final Path termFrequencies;

public final Path openNLPSentenceDetectionData;
public final Path posRules;
public final Path posDict;
@@ -5,8 +5,8 @@ import java.util.regex.Pattern;

public class AsciiFlattener {

private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$");
private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();

public static String flattenUnicode(String s) {
@@ -1,99 +1,164 @@
package nu.marginalia.util.language.processing;

import com.google.common.collect.Sets;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import org.jetbrains.annotations.NotNull;

import javax.inject.Inject;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class DocumentKeywordExtractor {

private final KeywordExtractor keywordExtractor;
private final KeywordCounter tfIdfCounter;
private final NameCounter nameCounter;
private final LongNameCounter longNameCounter;
private final SubjectCounter subjectCounter;

private final NGramDict dict;
private final TermFrequencyDict dict;
private final double docCount;

@Inject
public DocumentKeywordExtractor(NGramDict dict) {
public DocumentKeywordExtractor(TermFrequencyDict dict) {
this.dict = dict;
docCount = dict.docCount();

keywordExtractor = new KeywordExtractor();

tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
nameCounter = new NameCounter(keywordExtractor);
longNameCounter = new LongNameCounter(dict, keywordExtractor);
subjectCounter = new SubjectCounter(keywordExtractor);
}

public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {

List<WordRep> titleWords = extractTitleWords(documentLanguageData);

KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);

List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());

Collection<String> artifacts = getArtifacts(documentLanguageData);

return new EdgePageWordSet(
createWords(IndexBlock.Subjects, subjects),
createWords(IndexBlock.Title, titleWords),
createWords(IndexBlock.NamesWords, wordsNamesAll),
createWords(IndexBlock.Tfidf_Top, topKeywords),
createWords(IndexBlock.Tfidf_Middle, midKeywords),
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);
}

public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {

List<WordRep> titleWords = extractTitleWords(documentLanguageData);

List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);

int totalSize = wordsTfIdf.size();

List<WordRep> lowKeywords = new ArrayList<>(totalSize / 2);
List<WordRep> midKeywords = new ArrayList<>(totalSize / 2);
List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);

for(var v : wordsTfIdf) {
if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
else lowKeywords.add(v);
}

var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);

var words = getSimpleWords(documentLanguageData);

for (var w : wordsLongName)
words.add(w.word);
for (var w : lowKeywords)
words.remove(w.word);
for (var w : midKeywords)
words.remove(w.word);
for (var w : topKeywords)
words.remove(w.word);
List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());

Collection<String> artifacts = getArtifacts(documentLanguageData);

var wordSet = new EdgePageWordSet(
createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
createWords(IndexBlock.Topic, subjects),
createWords(IndexBlock.Subjects, subjects),
createWords(IndexBlock.Title, titleWords),
createWords(IndexBlock.NamesWords, wordsNamesAll),
createWords(IndexBlock.Top, topKeywords),
createWords(IndexBlock.Middle, midKeywords),
createWords(IndexBlock.Low, lowKeywords),
createWords(IndexBlock.Tfidf_Top, topKeywords),
createWords(IndexBlock.Tfidf_Middle, midKeywords),
createWords(IndexBlock.Tfidf_Lower, lowKeywords),
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);

wordSet.append(IndexBlock.Words, words);
getSimpleWords(wordSet, documentLanguageData,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);

return wordSet;
}

private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {

int start = 0;
int lengthGoal = 32;

for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
IndexBlock block = blocks[blockIdx];
Set<String> words = new HashSet<>(lengthGoal+100);

int pos;
int length = 0;
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
length += sent.length();

for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
words.add(w);
}
}
}
}
wordSet.append(block, words);
start = pos;
lengthGoal+=32;
}

if (start < documentLanguageData.sentences.length) {

Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}

Set<String> lastSet;
if (counts.size() < 1024) {
lastSet = counts.keySet();
}
else {
lastSet = counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = docCount; // Number of documents in term freq dictionary

// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
}))
.map(Map.Entry::getKey)
.limit(1024)
.collect(Collectors.toCollection(LinkedHashSet::new));
}

wordSet.append(blocks[blocks.length - 1], lastSet);
}
}

private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();

for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
@@ -123,57 +188,7 @@ public class DocumentKeywordExtractor {
.collect(Collectors.toList());
}

private Collection<WordRep> joinWordLists(List<WordRep>... words) {
int size = 0;
for (var lst : words) {
size += lst.size();
}
if (size == 0)
return Collections.emptyList();

final LinkedHashSet<WordRep> ret = new LinkedHashSet<>(size);
for (var lst : words) {
ret.addAll(lst);
}
return ret;
}

@NotNull
private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());

for (var sent : documentLanguageData.sentences) {
for (int i = 0; i < sent.length(); i++) {
if (!sent.isStopWord(i)) {
String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}

return counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary

// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
}))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
}

public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
}

private Set<WordRep> overlappingStems(Collection<WordRep> wordsA, Collection<WordRep> wordsB) {
Set<String> stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
Set<String> stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
Set<String> stemmedIntersect = Sets.intersection(stemmedA, stemmedB);
return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet());
}
}
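The "negated term score" caveat in the comments above is easiest to see with numbers plugged in; the figures below are hypothetical.

    class NegatedTfIdfDemo {
        public static void main(String[] args) {
            double tf = 4;             // hypothetical: term seen 4 times in the document
            double termFreq = 1_000;   // hypothetical: term occurs in 1 000 corpus documents
            double N = 11_820_118.;    // document count used above

            // Same shape as the comparator above: (1 + log tf) * log((1 + df) / N).
            // Because df << N the second factor is negative, so better terms get
            // smaller (more negative) values and sort first in ascending order.
            double score = (1 + Math.log(tf)) * Math.log((1. + termFreq) / N);
            System.out.println(score); // ~ -22.4
        }
    }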
@@ -1,65 +1,92 @@
package nu.marginalia.util.language.processing;

import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;

import java.util.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class KeywordCounter {
private final KeywordExtractor keywordExtractor;
private final NGramDict dict;
private final TermFrequencyDict dict;
private final double docCount;

public KeywordCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
this.dict = dict;
this.keywordExtractor = keywordExtractor;
this.docCount = (double) dict.docCount();
}

public List<WordRep> count(DocumentLanguageData dld) {
HashMap<String, Double> counts = new HashMap<>(1000);
public WordHistogram countHisto(DocumentLanguageData dld) {
HashMap<String, Integer> counts = new HashMap<>(1000);
HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);

for (var sent : dld.sentences) {
var keywords = keywordExtractor.getKeywordsFromSentence(sent);
for (var span : keywords) {
if (span.size() == 1 &&
WordPatterns.isStopWord(sent.words[span.start]))
continue;

String stemmed = sent.constructStemmedWordFromSpan(span);

counts.merge(stemmed, 1., Double::sum);
counts.merge(stemmed, 1, Integer::sum);
instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
}
}

return counts.entrySet().stream()
.filter(e -> e.getValue() > 1)
.sorted(Comparator.comparing(this::getTermValue))
.map(Map.Entry::getKey)
.flatMap(w -> instances.get(w).stream())
.filter(w -> w.word.length() > 1)
.limit(150)
.collect(Collectors.toList());
double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);

Set<WordRep> h5 = new HashSet<>();
Set<WordRep> h10 = new HashSet<>();
Set<WordRep> h15 = new HashSet<>();

int doubleWordCount = 0;

for (var entry : counts.entrySet()) {
double value = getTermValue(entry, maxC);

double avgCnt = entry.getValue();
String wordStemmed = entry.getKey();

Set<WordRep> histogram;
if (value < -3 && avgCnt>1) histogram = h15;
else if (value < -1.75 && avgCnt>1) histogram = h10;
else if (value < -1 &&
(!wordStemmed.contains("_") || doubleWordCount++ < 50))
histogram = h5;
else continue;

histogram.addAll(instances.get(wordStemmed));
}

return new WordHistogram(h5, h10, h15);
}

private static final Pattern separator = Pattern.compile("_");

public double getTermValue(Map.Entry<String, Double> e) {
public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
String[] parts = separator.split(e.getKey());
double totalValue = 0.;
for (String part : parts) {
totalValue += value(part, e.getValue());
totalValue += value(part, e.getValue(), maxValue);
}
return totalValue / Math.sqrt(parts.length);
return totalValue / parts.length;
}

double value(String key, double value) {
double value(String key, double value, double maxValue) {
double freq = dict.getTermFreqStemmed(key);
if (freq < 1) {
freq = 10;
freq = 1;
}
return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
}

public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
}
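A worked example of the reworked scoring in value() and the bucketing in countHisto(), using hypothetical counts; the thresholds in the comments are the ones in the code (the top and mid tiers additionally require a count above 1).

    class KeywordBucketDemo {
        public static void main(String[] args) {
            double count = 3, maxCount = 10;              // hypothetical within-document counts
            double freq = 1_000, docCount = 11_820_118;   // hypothetical corpus statistics

            // Same shape as value() above; log(freq/docCount) is negative for any
            // term that appears in fewer documents than the corpus contains.
            double value = (0.1 + 0.9 * count / maxCount) * Math.log(freq / docCount);
            System.out.println(value); // ~ -3.5

            // countHisto() buckets on this value:
            //   value < -3     -> top tier   (h15)
            //   value < -1.75  -> mid tier   (h10)
            //   value < -1     -> lower tier (h5)
        }
    }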
@@ -1,93 +1,18 @@
package nu.marginalia.util.language.processing;

import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.language.WordPatterns;

import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.IntStream;
import java.util.stream.Stream;

public class KeywordExtractor {

public boolean isLegacy() {
return legacy;
}

public void setLegacy(boolean legacy) {
this.legacy = legacy;
}

private boolean legacy;

public WordSpan[] getNameLikes(DocumentSentence sentence) {
var direct = IntStream.range(0, sentence.length())
.filter(i -> sentence.posTags[i].startsWith("N"))
.mapToObj(i -> new WordSpan(i, i+1))
;
var two = IntStream.range(1, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i -1, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-1, i+1))
;

var a_in_b = IntStream.range(2, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1))
.filter(i -> isProperNoun(i-2, sentence))
.mapToObj(i -> new WordSpan(i-2, i+1))
;

var a_in_det_b = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1))
.filter(i -> sentence.posTags[i-2].equals("DT"))
.filter(i -> isProperNoun(i-3, sentence))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
var a_in_in_b = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isProperNoun(i, sentence))
.filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
.filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence))
.filter(i -> isProperNoun(i-3, sentence))
.mapToObj(i -> new WordSpan(i-3, i+1))
;
var three = IntStream.range(2, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i-1, sentence, Collections.emptySet()))
.filter(i -> isName(i-2, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-2, i+1))
;
var four = IntStream.range(3, sentence.length())
.filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
&& sentence.separators[i-2] == WordSeparator.SPACE
&& sentence.separators[i-3] == WordSeparator.SPACE)
.filter(i -> isName(i, sentence, Collections.emptySet()))
.filter(i -> isName(i - 1, sentence, Collections.emptySet()))
.filter(i -> isName(i - 2, sentence, Collections.emptySet()))
.filter(i -> isName(i - 3, sentence, Collections.emptySet()))
.mapToObj(i -> new WordSpan(i-3, i+1))
;

return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity())
.toArray(WordSpan[]::new);
}

public WordSpan[] getNames(DocumentSentence sentence) {
List<WordSpan> spans = new ArrayList<>(sentence.length());

@@ -214,7 +139,7 @@ public class KeywordExtractor {
}
String word = sentence.constructWordFromSpan(w);

if (word.isBlank() || WordPatterns.isStopWord(word)) return false;
if (word.isBlank() || !WordPatterns.filter(word)) return false;
if (sentence.posTags[w.start].equals("CC")) return false;
if (sentence.posTags[w.end-1].equals("IN")) return false;
if (sentence.posTags[w.end-1].equals("DT")) return false;
@@ -377,4 +302,6 @@ public class KeywordExtractor {

return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
}

}
@@ -3,7 +3,7 @@ package nu.marginalia.util.language.processing;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.util.language.processing.model.DocumentSentence;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;

import java.util.*;
import java.util.regex.Pattern;
@@ -11,10 +11,11 @@ import java.util.stream.Collectors;

public class LongNameCounter {
private final KeywordExtractor keywordExtractor;

private final NGramDict dict;
public LongNameCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
private final TermFrequencyDict dict;
private final double docCount;
public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
this.dict = dict;
docCount = (double) dict.docCount();
this.keywordExtractor = keywordExtractor;
}

@@ -22,6 +22,9 @@ public class NameCounter {
DocumentSentence sent = dld.sentences[i];
var keywords = keywordExtractor.getNames(sent);
for (var span : keywords) {
if (span.size() <= 1)
continue;

var stemmed = sent.constructStemmedWordFromSpan(span);

counts.merge(stemmed, 1., Double::sum);
@@ -2,11 +2,11 @@ package nu.marginalia.wmsa.api;

import com.google.common.base.Strings;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.wmsa.api.model.ApiLicense;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.server.*;
import nu.marginalia.wmsa.edge.search.client.EdgeSearchClient;
import org.slf4j.Logger;
@@ -20,7 +20,7 @@ import java.util.concurrent.ConcurrentHashMap;
public class ApiService extends Service {

private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private final Gson gson = GsonFactory.get();
private final EdgeSearchClient searchClient;
private final HikariDataSource dataSource;
private final ConcurrentHashMap<String, ApiLicense> licenseCache = new ConcurrentHashMap<>();
@@ -1,12 +1,11 @@
package nu.marginalia.wmsa.client;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.protobuf.GeneratedMessageV3;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
import lombok.SneakyThrows;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.wmsa.client.exception.LocalException;
import nu.marginalia.wmsa.client.exception.NetworkException;
import nu.marginalia.wmsa.client.exception.RemoteException;
@@ -17,8 +16,6 @@ import org.apache.http.HttpHost;
import org.apache.logging.log4j.ThreadContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -32,9 +29,7 @@ import java.util.zip.GZIPOutputStream;
public abstract class AbstractClient implements AutoCloseable {
public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request";

private final Gson gson = new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.create();
private final Gson gson = GsonFactory.get();

private final Logger logger = LoggerFactory.getLogger(getClass());

@@ -186,6 +181,31 @@ public abstract class AbstractClient implements AutoCloseable {
.doFinally(() -> ThreadContext.remove("outbound-request"));
}

@SneakyThrows
protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {

ensureAlive();

RequestBody body = RequestBody.create(
MediaType.parse("application/protobuf"),
data.toByteArray());

var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
var call = client.newCall(req);

logInbound(call);
ThreadContext.put("outbound-request", url + endpoint);
try (var rsp = call.execute()) {
logOutbound(rsp);
int code = rsp.code();

return validateStatus(code, req).map(HttpStatusCode::new);
}
finally {
ThreadContext.remove("outbound-request");
}
}

@SneakyThrows
protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {
@@ -0,0 +1,29 @@
package nu.marginalia.wmsa.client;

import com.google.gson.*;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;

import java.net.URISyntaxException;

public class GsonFactory {
public static Gson get() {
return new GsonBuilder()
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
.registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
.registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
try {
return new EdgeUrl(json.getAsString());
} catch (URISyntaxException e) {
throw new JsonParseException("URL Parse Exception", e);
}
})
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
.create();
}
}
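For context, the point of the factory is that ApiService, EdgeAssistantService and AbstractClient (elsewhere in this commit) now share one Gson configuration. A hedged usage sketch; the exact serialized string depends on EdgeDomain.toString(), which is not shown here.

    import com.google.gson.Gson;
    import nu.marginalia.wmsa.client.GsonFactory;
    import nu.marginalia.wmsa.edge.model.EdgeDomain;

    class GsonFactoryDemo {
        public static void main(String[] args) {
            Gson gson = GsonFactory.get();

            // EdgeDomain/EdgeUrl/EdgeId serialize as plain JSON primitives
            // via the adapters registered above, and round-trip back.
            String json = gson.toJson(new EdgeDomain("marginalia.nu"));
            EdgeDomain back = gson.fromJson(json, EdgeDomain.class);

            System.out.println(json + " -> " + back);
        }
    }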
@@ -2,10 +2,7 @@ package nu.marginalia.wmsa.configuration;

import nu.marginalia.wmsa.api.ApiMain;
import nu.marginalia.wmsa.auth.AuthMain;
import nu.marginalia.wmsa.configuration.command.Command;
import nu.marginalia.wmsa.configuration.command.ListCommand;
import nu.marginalia.wmsa.configuration.command.StartCommand;
import nu.marginalia.wmsa.configuration.command.VersionCommand;
import nu.marginalia.wmsa.configuration.command.*;
import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
import nu.marginalia.wmsa.edge.dating.DatingMain;
import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
@@ -82,6 +79,9 @@ public enum ServiceDescriptor {
MainMapLookup.setMainArguments(args);
Map<String, Command> functions = Stream.of(new ListCommand(),
new StartCommand(),
new ConvertCommand(),
new LoadCommand(),
new ReindexCommand(),
new VersionCommand()
).collect(Collectors.toMap(c -> c.name, c -> c));
@@ -87,7 +87,7 @@ public class WmsaHome {
final Path home = getHomePath();

return new LanguageModels(
home.resolve("model/ngrams-generous-emstr.bin"),
home.resolve("model/ngrams.bin"),
home.resolve("model/tfreq-new-algo3.bin"),
home.resolve("model/opennlp-sentence.bin"),
home.resolve("model/English.RDR"),
@@ -95,4 +95,8 @@ public class WmsaHome {
home.resolve("model/opennlp-tok.bin"));
}

private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
public static boolean isDebug() {
return debugMode;
}
}
@@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;

import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.ConverterMain;

import java.util.Arrays;

public class ConvertCommand extends Command {
public ConvertCommand() {
super("convert");
}

@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: convert plan.yaml");
System.exit(255);
}

String[] args2 = Arrays.copyOfRange(args, 1, args.length);
ConverterMain.main(args2);
}
}
@@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;

import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.LoaderMain;

import java.util.Arrays;

public class LoadCommand extends Command {
public LoadCommand() {
super("load");
}

@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: load plan.yaml");
System.exit(255);
}

String[] args2 = Arrays.copyOfRange(args, 1, args.length);
LoaderMain.main(args2);
}
}
@@ -0,0 +1,24 @@
package nu.marginalia.wmsa.configuration.command;

import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.converting.ReindexTriggerMain;

import java.util.Arrays;

public class ReindexCommand extends Command {
public ReindexCommand() {
super("reindex");
}

@Override
@SneakyThrows
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: reindex host");
System.exit(255);
}

String[] args2 = Arrays.copyOfRange(args, 1, args.length);
ReindexTriggerMain.main(args2);
}
}
@@ -1,6 +1,7 @@
package nu.marginalia.wmsa.configuration.command;

import lombok.SneakyThrows;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;

import java.util.Arrays;

@@ -14,6 +15,12 @@ public class StartCommand extends Command {
public void execute(String... args) {
if (args.length < 2) {
System.err.println("Usage: start service-descriptor");
System.err.println();
System.err.println("Available services:");
System.err.println();
for (var d : ServiceDescriptor.values()) {
System.err.println("\t"+d.name);
}
System.exit(255);
}
var mainMethod = getKind(args[1]).mainClass.getMethod("main", String[].class);
@@ -84,6 +84,7 @@ public class DatabaseModule extends AbstractModule {
config.addDataSourceProperty("cachePrepStmts", "true");
config.addDataSourceProperty("prepStmtCacheSize", "250");
config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");

config.setMaximumPoolSize(100);
config.setMinimumIdle(10);
return new HikariDataSource(config);
@@ -1,10 +1,10 @@
package nu.marginalia.wmsa.edge.assistant;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.server.Initialization;
import nu.marginalia.wmsa.configuration.server.MetricsServer;
import nu.marginalia.wmsa.configuration.server.Service;
@@ -22,7 +22,7 @@ import spark.Spark;
public class EdgeAssistantService extends Service {

private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = new GsonBuilder().create();
private final Gson gson = GsonFactory.get();
private final Units units;
private final MathParser mathParser;
private final Suggestions suggestions;
@@ -0,0 +1,93 @@
package nu.marginalia.wmsa.edge.assistant.dict;

import ca.rmen.porterstemmer.PorterStemmer;
import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.google.inject.Inject;
import nu.marginalia.util.DenseBitMap;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

public class NGramBloomFilter {
private final DenseBitMap bitMap;
private static final PorterStemmer ps = new PorterStemmer();
private static final HashFunction hasher = Hashing.murmur3_128(0);

private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);

@Inject
public NGramBloomFilter() throws IOException {
this(WmsaHome.getLanguageModels());
}

public NGramBloomFilter(LanguageModels lm) throws IOException {
this(loadSafely(lm.ngramBloomFilter));
}

private static DenseBitMap loadSafely(Path path) throws IOException {
if (Files.isRegularFile(path)) {
return DenseBitMap.loadFromFile(path);
}
else {
logger.warn("NGrams file missing " + path);
return new DenseBitMap(1);
}
}

public NGramBloomFilter(DenseBitMap bitMap) {
this.bitMap = bitMap;
}

public boolean isKnownNGram(String word) {
long bit = bitForWord(word, bitMap.cardinality);

return bitMap.get(bit);
}

public static void main(String... args) throws IOException {
var filter = convertFromDictionaryFile(new File(args[0]));
filter.bitMap.writeToFile(Path.of(args[1]));
}

public static NGramBloomFilter load(Path file) throws IOException {
return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
}

public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
AtomicInteger popCount = new AtomicInteger();
try (var f = new KeywordLexiconJournalFile(file)) {
f.loadFile(data -> {
long bit = bitForWord(new String(data), bitMap.cardinality);
if (!bitMap.set(bit))
popCount.incrementAndGet();
});
}

System.out.println("popcount = " + popCount.get());
return new NGramBloomFilter(bitMap);
}

private static final Pattern underscore = Pattern.compile("_");

private static long bitForWord(String s, long n) {
String[] parts = underscore.split(s);
long hc = 0;
for (String part : parts) {
hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
}
return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
}

}
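A short usage sketch for the new filter, assuming language models are installed so that the ngramBloomFilter path resolved by WmsaHome.getLanguageModels() exists; otherwise the loadSafely() fallback above yields an effectively empty one-bit map.

    import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

    class NGramBloomFilterDemo {
        public static void main(String[] args) throws Exception {
            var filter = new NGramBloomFilter(); // loads via WmsaHome.getLanguageModels()

            // Multi-word n-grams are underscore-joined; bitForWord() stems each part,
            // hashes it with murmur3_128 and folds the result into the bitmap size.
            System.out.println(filter.isKnownNGram("bloom_filter"));
            System.out.println(filter.isKnownNGram("marginalia"));
        }
    }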
@@ -1,137 +0,0 @@
package nu.marginalia.wmsa.edge.assistant.dict;

import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.util.language.conf.LanguageModels;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

@Singleton
public class NGramDict {

private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);

private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Pattern separator = Pattern.compile("[_ ]+");
private static final PorterStemmer ps = new PorterStemmer();

private static long fileSize(Path p) throws IOException {
return Files.size(p);
}

@Inject
public NGramDict(@Nullable LanguageModels models) {
if (models == null) {
return;
}

if (models.ngramFrequency != null) {

try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.ngramFrequency.toFile())))) {

wordRates.ensureCapacity((int)(fileSize(models.ngramFrequency)/16));

for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + models.ngramFrequency, e);
}
}

logger.info("Read {} N-grams frequencies", wordRates.size());
}

public static void main(String... args) {
if (args.length != 2) {
System.err.println("Expected arguments: in-file out-file");
}
String inFile = args[0];
String outFile = args[1];

var wordPattern = Pattern.compile("\\w+(_\\w+)*").asMatchPredicate();
try (var linesStr = Files.lines(Path.of(inFile));
var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile)))
) {
linesStr
.filter(wordPattern)
.mapToLong(NGramDict::getStringHash).forEach(l ->
{
try {
dos.writeLong(l);
} catch (IOException e) {
e.printStackTrace();
}
});
} catch (IOException e) {
e.printStackTrace();
}
}

public static long getStringHash(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
byte[][] parts = new byte[strings.length][];
for (int i = 0; i < parts.length; i++) {
parts[i] = ps.stemWord(strings[i]).getBytes();
}
return longHash(parts);
}
else {
return longHash(s.getBytes());
}
}
public long getTermFreqHash(long hash) {
return wordRates.get(hash);
}
public long getTermFreq(String s) {
return wordRates.get(getStringHash(s));
}
public long getTermFreqStemmed(String s) {
return wordRates.get(longHash(s.getBytes()));
}

public static String getStemmedString(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
}
else {
return s;
}

}

public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;

// https://cp-algorithms.com/string/string-hashing.html
int p = 127;
long m = (1L<<61)-1;
long p_power = 1;
long hash_val = 0;

for (byte[] bytes: bytesSets) {
for (byte element : bytes) {
hash_val = (hash_val + (element + 1) * p_power) % m;
p_power = (p_power * p) % m;
}
}
return hash_val;
}

}
@@ -0,0 +1,221 @@
package nu.marginalia.wmsa.edge.assistant.dict;

import ca.rmen.porterstemmer.PorterStemmer;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.util.language.LanguageFilter;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import javax.inject.Inject;
import javax.inject.Singleton;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

@Singleton
public class TermFrequencyDict {

private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);

private final Logger logger = LoggerFactory.getLogger(getClass());
private static final Pattern separator = Pattern.compile("[_ ]+");
private static final PorterStemmer ps = new PorterStemmer();

private static final long DOC_COUNT_KEY = ~0L;
private static long fileSize(Path p) throws IOException {
return Files.size(p);
}

@Inject
public TermFrequencyDict(@Nullable LanguageModels models) {
if (models == null) {
return;
}

if (models.termFrequencies != null) {

try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {

wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));

for (;;) {
wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
}
} catch (EOFException eof) {
// ok
} catch (IOException e) {
logger.error("IO Exception reading " + models.termFrequencies, e);
}
}

logger.info("Read {} N-grams frequencies", wordRates.size());
}

public int docCount() {
int cnt = wordRates.get(DOC_COUNT_KEY);

if (cnt == 0) {
cnt = 11820118; // legacy
}
return cnt;
}

public static void main(String... args) throws IOException, InterruptedException {
if (args.length != 2) {
System.err.println("Expected arguments: plan.yaml out-file");
}
String outFile = args[1];

var plan = new CrawlPlanLoader().load(Path.of(args[0]));

ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
LanguageFilter lf = new LanguageFilter();

TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);

ForkJoinPool fjp = new ForkJoinPool(24);
AtomicInteger docCount = new AtomicInteger();

for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine

if (domain.doc == null)
continue;

fjp.execute(() -> {

for (var doc : domain.doc) {
if (doc.documentBody == null)
continue;
docCount.incrementAndGet();

Document parsed = Jsoup.parse(doc.documentBody);
parsed.body().filter(new DomPruningFilter(0.5));

DocumentLanguageData dld = se.get().extractSentences(parsed);

if (lf.dictionaryAgreement(dld) < 0.1) {
return;
}

Set<String> words = new HashSet<>(10_000);

for (var sent : dld.sentences) {
for (var word : sent) {
words.add(word.stemmed());
}
}

fjp.execute(() -> {
synchronized (counts) {
for (var word : words) {
counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
}
}
});

}
});
}

fjp.shutdown();
fjp.awaitTermination(10, TimeUnit.SECONDS);

try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
synchronized (counts) {
counts.put(DOC_COUNT_KEY, docCount.get());

counts.forEachEntry((hash, cnt) -> {
try {
dos.writeLong(hash);
dos.writeLong(cnt);
} catch (IOException e) {
throw new RuntimeException(e);
}
return true;
});
}
}

System.out.println(docCount.get());
//
// counts.forEachEntry((w,c) -> {
// if (c > 3L) {
// System.out.println(w + ":" + c);
// }
// return true;
// });

}

public static long getStringHash(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
byte[][] parts = new byte[strings.length][];
for (int i = 0; i < parts.length; i++) {
parts[i] = ps.stemWord(strings[i]).getBytes();
}
return longHash(parts);
}
else {
return longHash(s.getBytes());
}
}
public long getTermFreqHash(long hash) {
return wordRates.get(hash);
}
public long getTermFreq(String s) {
return wordRates.get(getStringHash(s));
}
public long getTermFreqStemmed(String s) {
return wordRates.get(longHash(s.getBytes()));
}

public static String getStemmedString(String s) {
String[] strings = separator.split(s);
if (s.length() > 1) {
return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
}
else {
return s;
}

}

public static long longHash(byte[]... bytesSets) {
if (bytesSets == null || bytesSets.length == 0)
return 0;

// https://cp-algorithms.com/string/string-hashing.html
int p = 127;
long m = (1L<<61)-1;
long p_power = 1;
long hash_val = 0;

for (byte[] bytes: bytesSets) {
for (byte element : bytes) {
hash_val = (hash_val + (element + 1) * p_power) % m;
p_power = (p_power * p) % m;
}
}
return hash_val;
}

}
@ -6,7 +6,7 @@ import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.assistant.suggest;

import com.google.inject.Inject;
import com.google.inject.name.Named;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
import org.apache.commons.collections4.trie.PatriciaTrie;
import org.slf4j.Logger;
@ -21,7 +21,7 @@ import java.util.stream.Stream;

public class Suggestions {
    private final PatriciaTrie<String> suggestionsTrie;
    private final NGramDict nGramDict;
    private final TermFrequencyDict termFrequencyDict;
    private final SpellChecker spellChecker;

    private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@ -31,12 +31,12 @@ public class Suggestions {
    @Inject
    public Suggestions(@Named("suggestions-file") Path suggestionsFile,
                       SpellChecker spellChecker,
                       NGramDict dict
                       TermFrequencyDict dict
                       ) {
        this.spellChecker = spellChecker;

        suggestionsTrie = loadSuggestions(suggestionsFile);
        nGramDict = dict;
        termFrequencyDict = dict;

        logger.info("Loaded {} suggestions", suggestionsTrie.size());
    }
@ -138,7 +138,7 @@ public class Suggestions {
    }

    Map<String, Long> scach = new HashMap<>(512);
    Function<String, Long> valr = s -> -nGramDict.getTermFreqHash(scach.computeIfAbsent(s, NGramDict::getStringHash));
    Function<String, Long> valr = s -> -termFrequencyDict.getTermFreqHash(scach.computeIfAbsent(s, TermFrequencyDict::getStringHash));

    return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
            .takeWhile(s -> s.startsWith(prefix))
@ -22,7 +22,7 @@ import java.util.List;
public class ConverterMain {

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final CrawledInstructionWriter instructionWriter;
    private final LoadInstructionWriter instructionWriter;

    public static void main(String... args) throws IOException {

@ -47,12 +47,12 @@ public class ConverterMain {
            Gson gson
    ) throws Exception {

        instructionWriter = new CrawledInstructionWriter(plan.process.getDir(), gson);
        instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);

        logger.info("Starting pipe");

        try (WorkLog processLog = plan.createProcessWorkLog()) {
            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {
            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {

                @Override
                protected ProcessingInstructions onProcess(CrawledDomain domainData) {
@ -73,12 +73,7 @@ public class ConverterMain {

            };

            plan.forEachCrawledDomain(domain -> {
                if (!processLog.isJobFinished(domain.id)) {
                    logger.info("{} - {}", domain.domain, domain.id);
                    pipe.accept(domain);
                }
            });
            plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);

            pipe.join();
        }
@ -1,16 +1,17 @@
package nu.marginalia.wmsa.edge.converting;

import com.google.gson.*;
import com.google.gson.Gson;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.configuration.WmsaHome;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexLocalService;
import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

import java.net.URISyntaxException;
import java.nio.file.Path;

public class ConverterModule extends AbstractModule {

@ -31,24 +32,20 @@ public class ConverterModule extends AbstractModule {
        bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
        bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

        if (null != System.getProperty("local-index-path")) {
            bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path")));
            bind(EdgeIndexWriterClient.class).to(EdgeIndexLocalService.class);
        }
        else {
            bind(EdgeIndexWriterClient.class).to(EdgeIndexClient.class);
        }


        bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
    }

    private Gson createGson() {

        return new GsonBuilder()
                .registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                .registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
                .registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
                    try {
                        return new EdgeUrl(json.getAsString());
                    } catch (URISyntaxException e) {
                        throw new JsonParseException("URL Parse Exception", e);
                    }
                })
                .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
                .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
                .create();
        return GsonFactory.get();
    }

}
@ -1,62 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class CrawledInstructionWriter {
|
||||
private final Path outputDir;
|
||||
private final Gson gson;
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawledInstructionWriter.class);
|
||||
|
||||
public CrawledInstructionWriter(Path outputDir, Gson gson) {
|
||||
this.outputDir = outputDir;
|
||||
this.gson = gson;
|
||||
|
||||
if (!Files.isDirectory(outputDir)) {
|
||||
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
||||
}
|
||||
}
|
||||
|
||||
public String accept(String id, List<Instruction> instructionList) throws IOException {
|
||||
Path outputFile = getOutputFile(id);
|
||||
|
||||
if (Files.exists(outputFile)) {
|
||||
Files.delete(outputFile);
|
||||
}
|
||||
|
||||
try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
|
||||
logger.info("Writing {} - {}", id, instructionList.size());
|
||||
|
||||
for (var instr : instructionList) {
|
||||
outputStream.append(instr.tag().name());
|
||||
outputStream.append(' ');
|
||||
gson.toJson(instr, outputStream);
|
||||
outputStream.append('\n');
|
||||
}
|
||||
}
|
||||
|
||||
return outputFile.getFileName().toString();
|
||||
}
|
||||
|
||||
private Path getOutputFile(String id) throws IOException {
|
||||
String first = id.substring(0, 2);
|
||||
String second = id.substring(2, 4);
|
||||
|
||||
Path destDir = outputDir.resolve(first).resolve(second);
|
||||
if (!Files.exists(destDir)) {
|
||||
Files.createDirectories(destDir);
|
||||
}
|
||||
return destDir.resolve(id + ".pzstd");
|
||||
}
|
||||
}
|
@ -2,11 +2,10 @@ package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -76,9 +75,8 @@ public class LinkKeywordLoaderMain {
|
||||
|
||||
// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);
|
||||
|
||||
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
|
||||
new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
|
||||
).blockingSubscribe();
|
||||
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId),
|
||||
new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0);
|
||||
}
|
||||
|
||||
lastLine = urlKeyword.url;
|
||||
|
@ -0,0 +1,121 @@
|
||||
package nu.marginalia.wmsa.edge.converting;
|
||||
|
||||
import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
|
||||
public class LoadInstructionWriter {
|
||||
|
||||
private final Path outputDir;
|
||||
private final Gson gson;
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
|
||||
public LoadInstructionWriter(Path outputDir, Gson gson) {
|
||||
this.outputDir = outputDir;
|
||||
this.gson = gson;
|
||||
|
||||
if (!Files.isDirectory(outputDir)) {
|
||||
throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
|
||||
}
|
||||
}
|
||||
public String accept(String id, List<Instruction> instructionList) throws IOException {
|
||||
Path outputFile = getOutputFile(id);
|
||||
|
||||
if (Files.exists(outputFile)) {
|
||||
Files.delete(outputFile);
|
||||
}
|
||||
|
||||
try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
|
||||
|
||||
SummarizingInterpreter summary = new SummarizingInterpreter(instructionList);
|
||||
logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
|
||||
|
||||
for (var instr : instructionList) {
|
||||
outputStream.append(instr.tag().name());
|
||||
outputStream.append(' ');
|
||||
gson.toJson(instr, outputStream);
|
||||
outputStream.append('\n');
|
||||
}
|
||||
}
|
||||
|
||||
return outputFile.getFileName().toString();
|
||||
}
|
||||
|
||||
private Path getOutputFile(String id) throws IOException {
|
||||
String first = id.substring(0, 2);
|
||||
String second = id.substring(2, 4);
|
||||
|
||||
Path destDir = outputDir.resolve(first).resolve(second);
|
||||
if (!Files.exists(destDir)) {
|
||||
Files.createDirectories(destDir);
|
||||
}
|
||||
return destDir.resolve(id + ".pzstd");
|
||||
}
|
||||
|
||||
private static class SummarizingInterpreter implements Interpreter {
|
||||
|
||||
private SummarizingInterpreter(List<Instruction> instructions) {
|
||||
for (var i : instructions) {
|
||||
i.apply(this);
|
||||
}
|
||||
}
|
||||
|
||||
private String domainName;
|
||||
private int ok = 0;
|
||||
private int error = 0;
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s - %d %d", domainName, ok, error);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadUrl(EdgeUrl[] url) {}
|
||||
|
||||
@Override
|
||||
public void loadDomain(EdgeDomain[] domain) {}
|
||||
|
||||
@Override
|
||||
public void loadRssFeed(EdgeUrl[] rssFeed) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainLink(DomainLink[] links) {}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||
this.domainName = domain.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
|
||||
ok++;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
|
||||
error++;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
|
||||
|
||||
@Override
|
||||
public void loadDomainRedirect(DomainLink link) {}
|
||||
}
|
||||
}
|
@ -27,7 +27,6 @@ public class LoaderMain {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);
|
||||
|
||||
private final Path processDir;
|
||||
private final EdgeCrawlPlan plan;
|
||||
private final ConvertedDomainReader instructionsReader;
|
||||
private final LoaderFactory loaderFactory;
|
||||
@ -59,7 +58,6 @@ public class LoaderMain {
|
||||
LoaderFactory loaderFactory,
|
||||
EdgeIndexClient indexClient) {
|
||||
|
||||
this.processDir = plan.process.getDir();
|
||||
this.plan = plan;
|
||||
this.instructionsReader = instructionsReader;
|
||||
this.loaderFactory = loaderFactory;
|
||||
@ -106,7 +104,12 @@ public class LoaderMain {
|
||||
public void run() {
|
||||
long startTime = System.currentTimeMillis();
|
||||
for (var i : instructionList) {
|
||||
i.apply(loader);
|
||||
try {
|
||||
i.apply(loader);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to load instruction {}", i);
|
||||
}
|
||||
}
|
||||
|
||||
loader.finish();
|
||||
|
@ -6,7 +6,7 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.util.DenseBitMap;
|
||||
import nu.marginalia.util.language.WordPatterns;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
|
||||
import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
@ -36,7 +36,7 @@ public class AnchorTextExtractor {
|
||||
// de-duplicating billions of shuffled (url, word) tuples on limited hardware
|
||||
private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);
|
||||
|
||||
private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
|
||||
private final TermFrequencyDict ngramDict = new TermFrequencyDict(WmsaHome.getLanguageModels());
|
||||
|
||||
public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
|
||||
Predicate<EdgeUrl> includeUrlPredicate,
|
||||
|
@ -4,23 +4,22 @@ import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class IndexLoadKeywords implements Runnable {
|
||||
private final EdgeIndexClient client;
|
||||
private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class);
|
||||
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
|
||||
|
||||
private record InsertTask(int urlId, int domainId, EdgePageWordSet wordSet) {}
|
||||
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
|
||||
private final EdgeIndexWriterClient client;
|
||||
|
||||
private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {}
|
||||
|
||||
private final Thread runThread;
|
||||
private volatile boolean canceled = false;
|
||||
@ -28,7 +27,7 @@ public class IndexLoadKeywords implements Runnable {
|
||||
private static final int index = Integer.getInteger("keyword-index", 1);
|
||||
|
||||
@Inject
|
||||
public IndexLoadKeywords(EdgeIndexClient client) {
|
||||
public IndexLoadKeywords(EdgeIndexWriterClient client) {
|
||||
this.client = client;
|
||||
runThread = new Thread(this, getClass().getSimpleName());
|
||||
runThread.start();
|
||||
@ -39,7 +38,7 @@ public class IndexLoadKeywords implements Runnable {
|
||||
while (!canceled) {
|
||||
var data = insertQueue.poll(1, TimeUnit.SECONDS);
|
||||
if (data != null) {
|
||||
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
|
||||
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -53,15 +52,13 @@ public class IndexLoadKeywords implements Runnable {
|
||||
int domainId = loaderData.getDomainId(url.domain);
|
||||
int urlId = loaderData.getUrlId(url);
|
||||
|
||||
if (urlId < 0 || domainId < 0) {
|
||||
if (urlId <= 0 || domainId <= 0) {
|
||||
logger.warn("Failed to get IDs for {} -- d={},u={}", url, domainId, urlId);
|
||||
return;
|
||||
}
|
||||
|
||||
var ws = new EdgePageWordSet();
|
||||
for (var doc : words) {
|
||||
ws.append(doc.block(), Arrays.asList(doc.keywords()));
|
||||
for (var ws : words) {
|
||||
insertQueue.put(new InsertTask(urlId, domainId, ws));
|
||||
}
|
||||
|
||||
insertQueue.put(new InsertTask(urlId, domainId, ws));
|
||||
}
|
||||
}
|
||||
|
@ -27,6 +27,9 @@ public class Loader implements Interpreter {
|
||||
private final List<LoadProcessedDocument> processedDocumentList;
|
||||
private final List<LoadProcessedDocumentWithError> processedDocumentWithErrorList;
|
||||
|
||||
private final List<EdgeDomain> deferredDomains = new ArrayList<>();
|
||||
private final List<EdgeUrl> deferredUrls = new ArrayList<>();
|
||||
|
||||
public final LoaderData data;
|
||||
|
||||
public Loader(int sizeHint,
|
||||
@ -72,28 +75,54 @@ public class Loader implements Interpreter {
|
||||
@Override
|
||||
public void loadDomainLink(DomainLink[] links) {
|
||||
logger.debug("loadDomainLink({})", links, null);
|
||||
sqlLoadDomainLinks.load(links);
|
||||
sqlLoadDomainLinks.load(data, links);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
|
||||
logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);
|
||||
|
||||
sqlLoadProcessedDomain.load(data, domain, state, ip);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDocument(LoadProcessedDocument document) {
|
||||
deferralCheck(document.url());
|
||||
|
||||
processedDocumentList.add(document);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) {
|
||||
deferralCheck(document.url());
|
||||
|
||||
processedDocumentWithErrorList.add(document);
|
||||
}
|
||||
|
||||
private void deferralCheck(EdgeUrl url) {
|
||||
if (data.getDomainId(url.domain) <= 0)
|
||||
deferredDomains.add(url.domain);
|
||||
|
||||
if (data.getUrlId(url) <= 0)
|
||||
deferredUrls.add(url);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {
|
||||
logger.debug("loadKeywords(#{})", words.length);
|
||||
|
||||
// This is a bit of a bandaid safeguard against a bug
// in the converter, shouldn't be necessary in the future
|
||||
if (!deferredDomains.isEmpty()) {
|
||||
loadDomain(deferredDomains.toArray(EdgeDomain[]::new));
|
||||
deferredDomains.clear();
|
||||
}
|
||||
|
||||
if (!deferredUrls.isEmpty()) {
|
||||
loadUrl(deferredUrls.toArray(EdgeUrl[]::new));
|
||||
deferredUrls.clear();
|
||||
}
|
||||
|
||||
try {
|
||||
indexLoadKeywords.load(data, url, words);
|
||||
} catch (InterruptedException e) {
|
||||
|
@ -40,13 +40,21 @@ public class SqlLoadDomainLinks {
|
||||
}
|
||||
}
|
||||
|
||||
public void load(DomainLink[] links) {
|
||||
public void load(LoaderData data, DomainLink[] links) {
|
||||
|
||||
try (var connection = dataSource.getConnection();
|
||||
var nukeExistingLinksForDomain =
|
||||
connection.prepareStatement("""
|
||||
DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
|
||||
""");
|
||||
var stmt =
|
||||
connection.prepareCall("CALL INSERT_LINK(?,?)"))
|
||||
{
|
||||
|
||||
connection.setAutoCommit(false);
|
||||
nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from()));
|
||||
nukeExistingLinksForDomain.executeUpdate();
|
||||
|
||||
for (DomainLink link : links) {
|
||||
stmt.setString(1, link.from().toString());
|
||||
stmt.setString(2, link.to().toString());
|
||||
@ -60,6 +68,10 @@ public class SqlLoadDomainLinks {
|
||||
logger.warn("load({}) -- bad row count {}", links[rv], ret[rv]);
|
||||
}
|
||||
}
|
||||
|
||||
connection.commit();
|
||||
connection.setAutoCommit(true);
|
||||
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting domain links", ex);
|
||||
|
@ -41,16 +41,18 @@ public class SqlLoadDomains {
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||
connection.setAutoCommit(false);
|
||||
insertCall.setString(1, domain.toString());
|
||||
insertCall.setString(2, domain.domain);
|
||||
insertCall.addBatch();
|
||||
|
||||
var ret = insertCall.executeUpdate();
|
||||
connection.commit();
|
||||
if (ret < 0) {
|
||||
logger.warn("load({}) -- bad row count {}", domain, ret);
|
||||
logger.warn("load({}) -- bad return status {}", domain, ret);
|
||||
}
|
||||
|
||||
findIdForTargetDomain(connection, data);
|
||||
findIdForDomain(connection, data, domain);
|
||||
connection.setAutoCommit(true);
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
@ -67,30 +69,48 @@ public class SqlLoadDomains {
|
||||
|
||||
try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
|
||||
|
||||
|
||||
int cnt = 0; int batchOffset = 0;
|
||||
for (var domain : domains) {
|
||||
insertCall.setString(1, domain.toString());
|
||||
insertCall.setString(2, domain.domain);
|
||||
insertCall.addBatch();
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
|
||||
for (int rv = 0; rv < domains.length; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", domains[rv], ret[rv]);
|
||||
if (++cnt == 1000) {
|
||||
var ret = insertCall.executeBatch();
|
||||
connection.commit();
|
||||
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]);
|
||||
}
|
||||
}
|
||||
|
||||
cnt = 0;
|
||||
batchOffset += 1000;
|
||||
}
|
||||
}
|
||||
if (cnt > 0) {
|
||||
var ret = insertCall.executeBatch();
|
||||
connection.commit();
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
connection.commit();
|
||||
connection.setAutoCommit(true);
|
||||
findIdForTargetDomain(connection, data);
|
||||
findIdForDomain(connection, data, domains);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting domains", ex);
|
||||
}
|
||||
}
|
||||
|
||||
void findIdForTargetDomain(Connection connection, LoaderData data) {
|
||||
void findIdForDomain(Connection connection, LoaderData data, EdgeDomain... domains) {
|
||||
if (data.getTargetDomain() == null || data.getDomainId(data.getTargetDomain()) > 0) {
|
||||
return;
|
||||
}
|
||||
@ -98,14 +118,39 @@ public class SqlLoadDomains {
|
||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
|
||||
var targetDomain = data.getTargetDomain();
|
||||
query.setString(1, targetDomain.toString());
|
||||
var rsp = query.executeQuery();
|
||||
if (rsp.next()) {
|
||||
data.addDomain(targetDomain, rsp.getInt(1));
|
||||
for (var domain : domains) {
|
||||
if (data.getDomainId(domain) > 0)
|
||||
continue;
|
||||
|
||||
query.setString(1, domain.toString());
|
||||
var rsp = query.executeQuery();
|
||||
if (rsp.next()) {
|
||||
data.addDomain(domain, rsp.getInt(1));
|
||||
} else {
|
||||
logger.warn("load() -- could not find ID for target domain {}", domain);
|
||||
}
|
||||
}
|
||||
else {
|
||||
logger.warn("load() -- could not find ID for target domain {}", targetDomain);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error finding id for domain", ex);
|
||||
}
|
||||
}
|
||||
|
||||
void loadAdditionalDomains(Connection connection, LoaderData data, EdgeDomain[] domains) {
|
||||
|
||||
try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
|
||||
{
|
||||
for (var domain : domains) {
|
||||
|
||||
if (data.getDomainId(domain) == 0) continue;
|
||||
|
||||
query.setString(1, domain.toString());
|
||||
var rsp = query.executeQuery();
|
||||
if (rsp.next()) {
|
||||
data.addDomain(domain, rsp.getInt(1));
|
||||
} else {
|
||||
logger.warn("load() -- could not find ID for target domain {}", domain);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
|
@ -60,13 +60,15 @@ public class SqlLoadProcessedDocument {
|
||||
}
|
||||
|
||||
public void load(LoaderData data, List<LoadProcessedDocument> documents) {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
int cnt = 0; int batchOffset = 0;
|
||||
for (var doc : documents) {
|
||||
int urlId = data.getUrlId(doc.url());
|
||||
if (urlId < 0) {
|
||||
if (urlId <= 0) {
|
||||
logger.warn("Failed to resolve ID for URL {}", doc.url());
|
||||
return;
|
||||
}
|
||||
@ -81,25 +83,46 @@ public class SqlLoadProcessedDocument {
|
||||
stmt.setDouble(8, doc.quality());
|
||||
stmt.setInt(9, (int) doc.hash());
|
||||
stmt.addBatch();
|
||||
}
|
||||
var ret = stmt.executeBatch();
|
||||
|
||||
for (int rv = 0; rv < documents.size(); rv++) {
|
||||
if (ret[rv] < 1 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
|
||||
if (++cnt == 100) {
|
||||
var ret = stmt.executeBatch();
|
||||
conn.commit();
|
||||
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
|
||||
}
|
||||
}
|
||||
|
||||
cnt = 0;
|
||||
batchOffset += 100;
|
||||
}
|
||||
}
|
||||
if (cnt > 0) {
|
||||
var ret = stmt.executeBatch();
|
||||
conn.commit();
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
conn.commit();
|
||||
conn.setAutoCommit(true);
|
||||
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting document", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) {
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT_BAD(?, ?)")) {
|
||||
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
int cnt = 0; int batchOffset = 0;
|
||||
for (var doc : documents) {
|
||||
int urlId = data.getUrlId(doc.url());
|
||||
if (urlId < 0) {
|
||||
@ -110,13 +133,32 @@ public class SqlLoadProcessedDocument {
|
||||
stmt.setInt(1, urlId);
|
||||
stmt.setString(2, doc.state().name());
|
||||
stmt.addBatch();
|
||||
}
|
||||
var ret = stmt.executeBatch();
|
||||
for (int rv = 0; rv < documents.size(); rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
|
||||
|
||||
if (++cnt == 100) {
|
||||
var ret = stmt.executeBatch();
|
||||
conn.commit();
|
||||
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
|
||||
}
|
||||
}
|
||||
|
||||
cnt = 0;
|
||||
batchOffset += 100;
|
||||
}
|
||||
}
|
||||
if (cnt > 0) {
|
||||
var ret = stmt.executeBatch();
|
||||
conn.commit();
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
conn.setAutoCommit(true);
|
||||
} catch (SQLException ex) {
|
||||
logger.warn("SQL error inserting failed document", ex);
|
||||
}
|
||||
|
@ -14,6 +14,7 @@ public class SqlLoadProcessedDomain {
|
||||
private final HikariDataSource dataSource;
|
||||
private final SqlLoadDomains loadDomains;
|
||||
private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class);
|
||||
|
||||
@Inject
|
||||
public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) {
|
||||
this.dataSource = dataSource;
|
||||
@ -54,6 +55,7 @@ public class SqlLoadProcessedDomain {
|
||||
initCall.setInt(3, data.getDomainId(domain));
|
||||
initCall.setString(4, ip);
|
||||
int rc = initCall.executeUpdate();
|
||||
conn.commit();
|
||||
if (rc < 1) {
|
||||
logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
|
||||
}
|
||||
@ -75,6 +77,7 @@ public class SqlLoadProcessedDomain {
|
||||
stmt.setString(1, link.to().toString());
|
||||
stmt.setString(2, link.from().toString());
|
||||
int rc = stmt.executeUpdate();
|
||||
conn.commit();
|
||||
if (rc != 1) {
|
||||
logger.warn("loadAlias({}) - unexpected row count {}", link, rc);
|
||||
}
|
||||
|
@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -11,6 +12,8 @@ import org.slf4j.LoggerFactory;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Types;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import static java.sql.Statement.SUCCESS_NO_INFO;
|
||||
|
||||
@ -46,17 +49,22 @@ public class SqlLoadUrls {
|
||||
}
|
||||
|
||||
public void load(LoaderData data, EdgeUrl[] urls) {
|
||||
Set<EdgeDomain> affectedDomains = new HashSet<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
|
||||
var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
|
||||
)
|
||||
{
|
||||
conn.setAutoCommit(false);
|
||||
|
||||
int cnt = 0; int batchOffset = 0;
|
||||
for (var url : urls) {
|
||||
if (url.path.length() >= 255) {
|
||||
logger.warn("Skipping bad URL {}", url);
|
||||
continue;
|
||||
}
|
||||
affectedDomains.add(url.domain);
|
||||
|
||||
insertCall.setString(1, url.proto);
|
||||
insertCall.setString(2, url.domain.toString());
|
||||
@ -70,30 +78,48 @@ public class SqlLoadUrls {
|
||||
insertCall.setString(5, url.param);
|
||||
insertCall.setLong(6, hashPath(url.path, url.param));
|
||||
insertCall.addBatch();
|
||||
|
||||
if (cnt++ == 250) {
|
||||
var ret = insertCall.executeBatch();
|
||||
conn.commit();
|
||||
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]);
|
||||
}
|
||||
}
|
||||
|
||||
batchOffset += cnt;
|
||||
cnt = 0;
|
||||
}
|
||||
}
|
||||
var ret = insertCall.executeBatch();
|
||||
for (int rv = 0; rv < ret.length; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]);
|
||||
if (cnt > 0) {
|
||||
var ret = insertCall.executeBatch();
|
||||
conn.commit();
|
||||
|
||||
for (int rv = 0; rv < cnt; rv++) {
|
||||
if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
|
||||
logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
conn.commit();
|
||||
conn.setAutoCommit(true);
|
||||
|
||||
|
||||
var targetDomain = data.getTargetDomain();
|
||||
queryCall.setInt(1, data.getDomainId(targetDomain));
|
||||
for (var domain : affectedDomains) {
|
||||
queryCall.setInt(1, data.getDomainId(domain));
|
||||
var rsp = queryCall.executeQuery();
|
||||
rsp.setFetchSize(1000);
|
||||
|
||||
var rsp = queryCall.executeQuery();
|
||||
while (rsp.next()) {
|
||||
int urlId = rsp.getInt(1);
|
||||
String proto = rsp.getString(2);
|
||||
String path = rsp.getString(3);
|
||||
String param = rsp.getString(4);
|
||||
|
||||
while (rsp.next()) {
|
||||
int urlId = rsp.getInt(1);
|
||||
String proto = rsp.getString(2);
|
||||
String path = rsp.getString(3);
|
||||
String param = rsp.getString(4);
|
||||
|
||||
data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
|
||||
data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -17,6 +17,7 @@ public class DisqualifiedException extends Exception {
|
||||
LANGUAGE,
|
||||
STATUS,
|
||||
QUALITY,
|
||||
ACCEPTABLE_ADS
|
||||
ACCEPTABLE_ADS,
|
||||
FORBIDDEN
|
||||
}
|
||||
}
|
||||
|
@ -15,6 +15,7 @@ public class ProcessedDocument {
|
||||
public EdgePageWordSet words;
|
||||
|
||||
public EdgeUrlState state;
|
||||
public String stateReason;
|
||||
|
||||
public OptionalDouble quality() {
|
||||
if (details != null) {
|
||||
|
@ -70,11 +70,22 @@ public class DocumentProcessor {
|
||||
this.summaryExtractor = summaryExtractor;
|
||||
}
|
||||
|
||||
public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
|
||||
ProcessedDocument ret = new ProcessedDocument();
|
||||
|
||||
try {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
}
|
||||
catch (Exception ex) {}
|
||||
|
||||
return ret;
|
||||
}
|
||||
public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
|
||||
ProcessedDocument ret = new ProcessedDocument();
|
||||
|
||||
try {
|
||||
ret.url = new EdgeUrl(crawledDocument.url);
|
||||
ret.url = getDocumentUrl(crawledDocument);
|
||||
ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
|
||||
|
||||
if (ret.state == EdgeUrlState.OK) {
|
||||
@ -86,10 +97,6 @@ public class DocumentProcessor {
|
||||
if (isAcceptedContentType(crawledDocument)) {
|
||||
var detailsWords = createDetails(crawledDomain, crawledDocument);
|
||||
|
||||
if (detailsWords.details().quality < minDocumentQuality) {
|
||||
throw new DisqualifiedException(DisqualificationReason.QUALITY);
|
||||
}
|
||||
|
||||
ret.details = detailsWords.details();
|
||||
ret.words = detailsWords.words();
|
||||
}
|
||||
@ -103,17 +110,31 @@ public class DocumentProcessor {
|
||||
}
|
||||
catch (DisqualifiedException ex) {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
ret.stateReason = ex.reason.toString();
|
||||
logger.debug("Disqualified {}: {}", ret.url, ex.reason);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
ret.state = EdgeUrlState.DISQUALIFIED;
|
||||
logger.info("Failed to convert " + ret.url, ex);
|
||||
logger.info("Failed to convert " + crawledDocument.url, ex);
|
||||
ex.printStackTrace();
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
|
||||
throws URISyntaxException
|
||||
{
|
||||
if (crawledDocument.canonicalUrl != null) {
|
||||
try {
|
||||
return new EdgeUrl(crawledDocument.canonicalUrl);
|
||||
}
|
||||
catch (URISyntaxException ex) { /* fallthrough */ }
|
||||
}
|
||||
|
||||
return new EdgeUrl(crawledDocument.url);
|
||||
}
|
||||
|
||||
public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
|
||||
if (crawledDocument.contentType == null) {
|
||||
return false;
|
||||
@ -141,27 +162,44 @@ public class DocumentProcessor {
|
||||
private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
|
||||
throws DisqualifiedException, URISyntaxException {
|
||||
|
||||
var doc = Jsoup.parse(crawledDocument.documentBody);
|
||||
Document doc = Jsoup.parse(crawledDocument.documentBody);
|
||||
|
||||
if (AcceptableAds.hasAcceptableAdsTag(doc)) {
|
||||
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
|
||||
}
|
||||
if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
|
||||
throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
|
||||
}
|
||||
|
||||
var dld = sentenceExtractor.extractSentences(doc.clone());
|
||||
Document prunedDoc = doc.clone();
|
||||
prunedDoc.body().filter(new DomPruningFilter(0.5));
|
||||
|
||||
var dld = sentenceExtractor.extractSentences(prunedDoc);
|
||||
|
||||
checkDocumentLanguage(dld);
|
||||
|
||||
var ret = new ProcessedDocumentDetails();
|
||||
|
||||
ret.description = getDescription(doc);
|
||||
ret.length = getLength(doc);
|
||||
ret.standard = getHtmlStandard(doc);
|
||||
ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
|
||||
ret.features = featureExtractor.getFeatures(crawledDomain, doc);
|
||||
|
||||
ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
|
||||
ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();
|
||||
|
||||
var words = getWords(dld);
|
||||
final boolean doSimpleProcessing = ret.quality < minDocumentQuality;
|
||||
|
||||
EdgePageWordSet words;
|
||||
if (doSimpleProcessing) {
|
||||
ret.features = Set.of(HtmlFeature.UNKNOWN);
|
||||
words = keywordExtractor.extractKeywordsMinimal(dld);
|
||||
ret.description = "";
|
||||
}
|
||||
else {
|
||||
ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
|
||||
words = keywordExtractor.extractKeywords(dld);
|
||||
ret.description = getDescription(doc);
|
||||
}
|
||||
|
||||
var url = new EdgeUrl(crawledDocument.url);
|
||||
addMetaWords(ret, url, crawledDomain, words);
|
||||
@ -192,7 +230,6 @@ public class DocumentProcessor {
|
||||
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
|
||||
|
||||
words.append(IndexBlock.Meta, tagWords);
|
||||
words.append(IndexBlock.Words, tagWords);
|
||||
}
|
||||
|
||||
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
|
||||
@ -208,12 +245,11 @@ public class DocumentProcessor {
|
||||
if (linkParser.shouldIndexLink(atag)) {
|
||||
linkOpt.ifPresent(lp::accept);
|
||||
}
|
||||
else if (linkOpt.isPresent()) {
|
||||
if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
|
||||
linkOpt.ifPresent(lp::acceptNonIndexable);
|
||||
}
|
||||
else {
|
||||
linkOpt
|
||||
.filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase()))
|
||||
.ifPresent(lp::acceptNonIndexable);
|
||||
}
|
||||
|
||||
}
|
||||
for (var frame : doc.getElementsByTag("frame")) {
|
||||
linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
|
||||
@ -233,26 +269,24 @@ public class DocumentProcessor {
|
||||
linkTerms.add("links:"+fd.toString().toLowerCase());
|
||||
linkTerms.add("links:"+fd.getDomain().toLowerCase());
|
||||
}
|
||||
|
||||
words.append(IndexBlock.Meta, linkTerms);
|
||||
|
||||
Set<String> fileKeywords = new HashSet<>(100);
|
||||
for (var link : lp.getNonIndexableUrls()) {
|
||||
|
||||
if (!Objects.equals(domain, link.domain)) {
|
||||
if (!domain.hasSameTopDomain(link.domain)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
synthesizeFilenameKeyword(fileKeywords, link);
|
||||
|
||||
}
|
||||
|
||||
words.append(IndexBlock.Artifacts, fileKeywords);
|
||||
|
||||
}
|
||||
|
||||
private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {
|
||||
|
||||
|
||||
Path pFilename = Path.of(link.path.toLowerCase()).getFileName();
|
||||
|
||||
if (pFilename == null) return;
|
||||
@ -289,10 +323,6 @@ public class DocumentProcessor {
|
||||
return htmlStandard;
|
||||
}
|
||||
|
||||
private EdgePageWordSet getWords(DocumentLanguageData dld) {
|
||||
return keywordExtractor.extractKeywords(dld);
|
||||
}
|
||||
|
||||
private String getDescription(Document doc) {
|
||||
return summaryExtractor.extractSummary(doc);
|
||||
}
|
||||
|
@ -1,23 +1,29 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.*;
|
||||
|
||||
import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;
|
||||
|
||||
public class DomainProcessor {
|
||||
private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor();
|
||||
|
||||
private final DocumentProcessor documentProcessor;
|
||||
private final Double minAvgDocumentQuality;
|
||||
|
||||
|
||||
@Inject
|
||||
public DomainProcessor(DocumentProcessor documentProcessor,
|
||||
@Named("min-avg-document-quality") Double minAvgDocumentQuality
|
||||
@ -39,10 +45,39 @@ public class DomainProcessor {
|
||||
if (crawledDomain.doc != null) {
|
||||
ret.documents = new ArrayList<>(crawledDomain.doc.size());
|
||||
|
||||
fixBadCanonicalTags(crawledDomain.doc);
|
||||
|
||||
DocumentDisqualifier disqualifier = new DocumentDisqualifier();
|
||||
for (var doc : crawledDomain.doc) {
|
||||
var processedDoc = documentProcessor.process(doc, crawledDomain);
|
||||
if (processedDoc.url != null) {
|
||||
ret.documents.add(processedDoc);
|
||||
if (disqualifier.isQualified()) {
|
||||
var processedDoc = documentProcessor.process(doc, crawledDomain);
|
||||
|
||||
if (processedDoc.url != null) {
|
||||
ret.documents.add(processedDoc);
|
||||
processedDoc.quality().ifPresent(disqualifier::offer);
|
||||
}
|
||||
else if ("LANGUAGE".equals(processedDoc.stateReason)) {
|
||||
disqualifier.offer(-100);
|
||||
}
|
||||
}
|
||||
else { // Short-circuit processing if quality is too low
|
||||
var stub = documentProcessor.makeDisqualifiedStub(doc);
|
||||
if (stub.url != null) {
|
||||
ret.documents.add(stub);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Set<String> commonSiteWords = new HashSet<>(10);
|
||||
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
|
||||
commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));
|
||||
|
||||
if (!commonSiteWords.isEmpty()) {
|
||||
for (var doc : ret.documents) {
|
||||
if (doc.words != null) {
|
||||
doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -50,30 +85,60 @@ public class DomainProcessor {
|
||||
ret.documents = Collections.emptyList();
|
||||
}
|
||||
|
||||
double averageQuality = getAverageQuality(ret.documents);
|
||||
if (averageQuality < minAvgDocumentQuality) {
|
||||
ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
|
||||
}
|
||||
|
||||
ret.state = getState(crawledDomain.crawlerStatus);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private double getAverageQuality(List<ProcessedDocument> documents) {
|
||||
int n = 0;
|
||||
double q = 0.;
|
||||
for (var doc : documents) {
|
||||
if (doc.quality().isPresent()) {
|
||||
n++;
|
||||
q += doc.quality().getAsDouble();
|
||||
private void fixBadCanonicalTags(List<CrawledDocument> docs) {
|
||||
Map<String, Set<String>> seenCanonicals = new HashMap<>();
|
||||
Set<String> seenUrls = new HashSet<>();
|
||||
|
||||
// Sometimes sites set a blanket canonical link to their root page
|
||||
// this removes such links from consideration
|
||||
|
||||
for (var document : docs) {
|
||||
if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) {
|
||||
seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
|
||||
}
|
||||
seenUrls.add(document.url);
|
||||
}
|
||||
|
||||
for (var document : docs) {
|
||||
if (!Strings.isNullOrEmpty(document.canonicalUrl)
|
||||
&& !Objects.equals(document.canonicalUrl, document.url)
|
||||
&& seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
|
||||
|
||||
if (seenUrls.add(document.canonicalUrl)) {
|
||||
document.canonicalUrl = document.url;
|
||||
}
|
||||
else {
|
||||
document.crawlerStatus = BAD_CANONICAL.name();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n > 0) {
|
||||
return q / n;
|
||||
for (var document : docs) {
|
||||
if (!Strings.isNullOrEmpty(document.canonicalUrl)
|
||||
&& !Objects.equals(document.canonicalUrl, document.url)
|
||||
&& seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
|
||||
document.canonicalUrl = document.url;
|
||||
}
|
||||
}
|
||||
|
||||
// Ignore canonical URL if it points to a different domain
|
||||
// ... this confuses the hell out of the loader
|
||||
for (var document : docs) {
|
||||
if (Strings.isNullOrEmpty(document.canonicalUrl))
|
||||
continue;
|
||||
|
||||
Optional<EdgeUrl> cUrl = EdgeUrl.parse(document.canonicalUrl);
|
||||
Optional<EdgeUrl> dUrl = EdgeUrl.parse(document.url);
|
||||
|
||||
if (cUrl.isPresent() && dUrl.isPresent() && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) {
|
||||
document.canonicalUrl = document.url;
|
||||
}
|
||||
}
|
||||
return -5.;
|
||||
}
|
||||
|
||||
private EdgeDomainIndexingState getState(String crawlerStatus) {
|
||||
@ -84,4 +149,20 @@ public class DomainProcessor {
|
||||
default -> EdgeDomainIndexingState.ERROR;
|
||||
};
|
||||
}
|
||||
|
||||
class DocumentDisqualifier {
|
||||
int count;
|
||||
int goodCount;
|
||||
|
||||
void offer(double quality) {
|
||||
count++;
|
||||
if (quality > minAvgDocumentQuality) {
|
||||
goodCount++;
|
||||
}
|
||||
}
|
||||
|
||||
boolean isQualified() {
|
||||
return count < 25 || goodCount*10 >= count;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -26,7 +26,6 @@ public class InstructionsCompiler {
|
||||
}
|
||||
if (domain.redirect != null) {
|
||||
compileRedirect(ret, domain.domain, domain.redirect);
|
||||
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
@ -0,0 +1,71 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class CommonKeywordExtractor {
|
||||
private final PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
private static final int MIN_REQUIRED_DOCUMENTS = 25;
|
||||
|
||||
private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
|
||||
private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
|
||||
|
||||
private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
|
||||
|
||||
public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
|
||||
|
||||
if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
|
||||
return Collections.emptyList();
|
||||
|
||||
final Map<String, String> wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10);
|
||||
|
||||
final Map<String, Integer> topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10);
|
||||
final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
|
||||
|
||||
int qualifiedDocCount = 0;
|
||||
for (var doc : ret.documents) {
|
||||
if (doc.words == null)
|
||||
continue;
|
||||
|
||||
qualifiedDocCount++;
|
||||
|
||||
for (var block : sourceBlocks) {
|
||||
for (var word : doc.words.get(block).words) {
|
||||
String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord);
|
||||
|
||||
// Count by negative values to sort by Map.Entry.comparingByValue() in reverse
|
||||
topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum);
|
||||
|
||||
stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int totalValue = 0;
|
||||
for (int value : topStemmedKeywordCount.values()) {
|
||||
totalValue += value;
|
||||
}
|
||||
|
||||
if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION)
|
||||
return Collections.emptyList();
|
||||
|
||||
List<String> topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT);
|
||||
|
||||
double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD;
|
||||
|
||||
topStemmedKeywordCount.entrySet().stream()
|
||||
.filter(e -> e.getValue() < qualifyingValue)
|
||||
.sorted(Map.Entry.comparingByValue())
|
||||
.limit(MAX_SITE_KEYWORDS_TO_EXTRACT)
|
||||
.forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey())));
|
||||
|
||||
|
||||
return topWords;
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,105 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeFilter;

import java.util.HashMap;
import java.util.Map;

public class DomPruningFilter implements NodeFilter {

    private final double pruneThreshold;

    private final Map<Node, NodeData> data = new HashMap<>();
    private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);

    public DomPruningFilter(double pruneThreshold) {
        this.pruneThreshold = pruneThreshold;
    }

    @Override
    public FilterResult head(Node node, int depth) {
        return FilterResult.CONTINUE;
    }

    @Override
    public FilterResult tail(Node node, int depth) {
        final NodeData dataForNode;

        if (node instanceof TextNode tn) {
            dataForNode = new NodeData(depth, tn.text().length(), 0);
        }
        else if (isSignal(node)) {
            dataForNode = new NodeData(depth, 0,0);
            for (var childNode : node.childNodes()) {
                dataForNode.add(data.getOrDefault(childNode, dummy));
            }
        }
        else {
            dataForNode = new NodeData(depth, 0,0);
            for (var childNode : node.childNodes()) {
                dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
            }
        }

        data.put(node, dataForNode);

        if (dataForNode.depth <= 1)
            return FilterResult.CONTINUE;

        if (dataForNode.signalNodeSize == 0)
            return FilterResult.REMOVE;
        if (dataForNode.noiseNodeSize > 0
                && dataForNode.signalRate() < pruneThreshold
                && dataForNode.treeSize > 3)
            return FilterResult.REMOVE;

        return FilterResult.CONTINUE;
    }

    public boolean isSignal(Node node) {

        if (node instanceof Element e) {
            if ("a".equalsIgnoreCase(e.tagName()))
                return false;
            if ("nav".equalsIgnoreCase(e.tagName()))
                return false;
            if ("footer".equalsIgnoreCase(e.tagName()))
                return false;
            if ("header".equalsIgnoreCase(e.tagName()))
                return false;
        }

        return true;
    }
}

class NodeData {
    int signalNodeSize;
    int noiseNodeSize;
    int treeSize = 1;
    int depth;

    NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
        this.depth = depth;
        this.signalNodeSize = signalNodeSize;
        this.noiseNodeSize = noiseNodeSize;
    }

    public void add(NodeData other) {
        signalNodeSize += other.signalNodeSize;
        noiseNodeSize += other.noiseNodeSize;
        treeSize += other.treeSize;
    }

    public void addAsNoise(NodeData other) {
        noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
        treeSize += other.treeSize;
    }

    public double signalRate() {
        return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
    }
}
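Aside (not part of the change set): a minimal sketch of how a jsoup NodeFilter like DomPruningFilter above is typically applied; the sample HTML and the 0.5 threshold are assumptions, not values taken from the project.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

class DomPruningExample {
    public static void main(String[] args) {
        Document doc = Jsoup.parse(
                "<html><body><nav><a href='/'>home</a></nav><p>Actual article text worth keeping.</p></body></html>");
        doc.filter(new DomPruningFilter(0.5));   // removes low-signal subtrees in place
        System.out.println(doc.body().html());   // the <nav> boilerplate is pruned, the <p> survives
    }
}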
@ -2,7 +2,11 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
|
||||
import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@ -35,14 +39,20 @@ public class FeatureExtractor {
|
||||
"d31qbv1cthcecs.cloudfront.net",
|
||||
"linkedin.com");
|
||||
|
||||
private AdblockSimulator adblockSimulator;
|
||||
private final AdblockSimulator adblockSimulator;
|
||||
private final RecipeDetector recipeDetector;
|
||||
private final TextileCraftDetector textileCraftDetector;
|
||||
private final WoodworkingDetector woodworkingDetector;
|
||||
|
||||
@Inject
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator) {
|
||||
public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
|
||||
this.adblockSimulator = adblockSimulator;
|
||||
this.recipeDetector = recipeDetector;
|
||||
this.textileCraftDetector = textileCraftDetector;
|
||||
this.woodworkingDetector = woodworkingDetector;
|
||||
}
|
||||
|
||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
|
||||
public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
|
||||
final Set<HtmlFeature> features = new HashSet<>();
|
||||
|
||||
final Elements scriptTags = doc.getElementsByTag("script");
|
||||
@ -81,9 +91,14 @@ public class FeatureExtractor {
|
||||
}
|
||||
}
|
||||
|
||||
if (!domain.cookies.isEmpty()) {
|
||||
if (!domain.cookies.isEmpty())
|
||||
features.add(HtmlFeature.COOKIES);
|
||||
}
|
||||
|
||||
if (recipeDetector.testP(dld) > 0.5)
|
||||
features.add(HtmlFeature.CATEGORY_FOOD);
|
||||
// these should be mutually exclusive
|
||||
else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
|
||||
features.add(HtmlFeature.CATEGORY_CRAFTS);
|
||||
|
||||
return features;
|
||||
}
|
||||
|
@ -12,6 +12,10 @@ public enum HtmlFeature {
|
||||
CATEGORY_FOOD("category:food"),
|
||||
|
||||
ADVERTISEMENT("special:ads"),
|
||||
|
||||
CATEGORY_CRAFTS("category:crafts"),
|
||||
|
||||
UNKNOWN("special:uncategorized")
|
||||
;
|
||||
|
||||
private final String keyword;
|
||||
|
@ -19,10 +19,14 @@ import java.util.regex.Pattern;
|
||||
|
||||
public class LinkParser {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final List<String> blockPrefixList = List.of(
|
||||
"mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
|
||||
private final List<String> blockSuffixList = List.of(
|
||||
|
||||
private final List<String> binarySuffixList = List.of(
|
||||
".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
|
||||
".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
|
||||
".com", ".bat", ".sh",
|
||||
".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
|
||||
".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
|
||||
".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
|
||||
@ -33,7 +37,7 @@ public class LinkParser {
|
||||
return Optional.of(l)
|
||||
.filter(this::shouldIndexLink)
|
||||
.map(this::getUrl)
|
||||
.map(link -> resolveUrl(relativeBaseUrl, link))
|
||||
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
|
||||
.flatMap(this::createURI)
|
||||
.map(URI::normalize)
|
||||
.map(this::renormalize)
|
||||
@ -44,7 +48,7 @@ public class LinkParser {
|
||||
public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
|
||||
return Optional.of(l)
|
||||
.map(this::getUrl)
|
||||
.map(link -> resolveUrl(relativeBaseUrl, link))
|
||||
.map(link -> resolveRelativeUrl(relativeBaseUrl, link))
|
||||
.flatMap(this::createURI)
|
||||
.map(URI::normalize)
|
||||
.map(this::renormalize)
|
||||
@ -74,7 +78,7 @@ public class LinkParser {
|
||||
@Contract(pure=true)
|
||||
public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
|
||||
return Optional.of(str)
|
||||
.map(link -> resolveUrl(baseUrl, link))
|
||||
.map(link -> resolveRelativeUrl(baseUrl, link))
|
||||
.flatMap(this::createURI)
|
||||
.map(URI::normalize)
|
||||
.map(this::renormalize)
|
||||
@ -85,7 +89,7 @@ public class LinkParser {
|
||||
public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
|
||||
return Optional.of(frame)
|
||||
.map(l -> l.attr("src"))
|
||||
.map(link -> resolveUrl(baseUrl, link))
|
||||
.map(link -> resolveRelativeUrl(baseUrl, link))
|
||||
.flatMap(this::createURI)
|
||||
.map(URI::normalize)
|
||||
.map(this::renormalize)
|
||||
@ -95,10 +99,10 @@ public class LinkParser {
|
||||
@SneakyThrows
|
||||
private URI renormalize(URI uri) {
|
||||
if (uri.getPath() == null) {
|
||||
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment()));
|
||||
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
|
||||
}
|
||||
if (uri.getPath().startsWith("/../")) {
|
||||
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment()));
|
||||
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
|
||||
}
|
||||
return uri;
|
||||
}
|
||||
@ -117,10 +121,10 @@ public class LinkParser {
|
||||
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
|
||||
|
||||
@SneakyThrows
|
||||
private String resolveUrl(EdgeUrl baseUrl, String s) {
|
||||
private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {
|
||||
|
||||
// url looks like http://www.marginalia.nu/
|
||||
if (isAbsoluteDomain(s)) {
|
||||
if (doesUrlStringHaveProtocol(s)) {
|
||||
return s;
|
||||
}
|
||||
|
||||
@ -154,8 +158,15 @@ public class LinkParser {
|
||||
return url.path.substring(0, lastSlash+1);
|
||||
}
|
||||
|
||||
    private boolean isAbsoluteDomain(String s) {
        return s.matches("^[a-zA-Z]+:.*$");
    private boolean doesUrlStringHaveProtocol(String s) {
        int i = 0;
        for (; i < s.length(); i++) {
            if (!Character.isAlphabetic(s.charAt(i)))
                break;
        }
        if (i == 0 || i == s.length())
            return false;
        return ':' == s.charAt(i);
    }
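Aside (not part of the change set): the rewritten check accepts any leading run of letters followed by ':' as a protocol, which avoids the regex on every link. The expected behaviour, with made-up inputs, is roughly:

// doesUrlStringHaveProtocol("https://www.marginalia.nu/")    -> true   ("https" then ':')
// doesUrlStringHaveProtocol("mailto:someone@example.com")    -> true   (letters "mailto" then ':')
// doesUrlStringHaveProtocol("/log/some-page.gmi")            -> false  (no leading letters)
// doesUrlStringHaveProtocol("www.example.com/index.html")    -> false  (letters end at '.', not ':')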
|
||||
|
||||
public boolean shouldIndexLink(Element link) {
|
||||
@ -168,26 +179,29 @@ public class LinkParser {
|
||||
return !"noindex".equalsIgnoreCase(rel);
|
||||
}
|
||||
|
||||
public boolean hasBinarySuffix(String href) {
|
||||
return blockSuffixList.stream().anyMatch(href::endsWith);
|
||||
}
|
||||
|
||||
private boolean isUrlRelevant(String href) {
|
||||
if (null == href || "".equals(href)) {
|
||||
return false;
|
||||
}
|
||||
if (href.length() > 128) {
|
||||
return false;
|
||||
}
|
||||
href = href.toLowerCase();
|
||||
|
||||
if (blockPrefixList.stream().anyMatch(href::startsWith)) {
|
||||
return false;
|
||||
}
|
||||
if (hasBinarySuffix(href)) {
|
||||
return false;
|
||||
}
|
||||
if (href.length() > 128) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public boolean hasBinarySuffix(String str) {
|
||||
return binarySuffixList.stream().anyMatch(str::endsWith);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
|
||||
var baseTags = parsed.getElementsByTag("base");
|
||||
@ -196,7 +210,7 @@ public class LinkParser {
|
||||
for (var tag : baseTags) {
|
||||
String href = tag.attr("href");
|
||||
if (!Strings.isNullOrEmpty(href)) {
|
||||
return new EdgeUrl(resolveUrl(documentUrl, href));
|
||||
return new EdgeUrl(resolveRelativeUrl(documentUrl, href));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
|
||||
import java.util.HashMap;
|
||||
@ -14,6 +15,7 @@ public class RecipeDetector {
|
||||
|
||||
private final Map<String, Double> termValues = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public RecipeDetector() {
|
||||
PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
|
||||
import java.util.HashMap;
|
||||
@ -14,6 +15,7 @@ public class TextileCraftDetector {
|
||||
|
||||
private final Map<String, Double> termValues = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public TextileCraftDetector() {
|
||||
PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||
|
||||
import java.util.HashMap;
|
||||
@ -14,6 +15,7 @@ public class WoodworkingDetector {
|
||||
|
||||
private final Map<String, Double> termValues = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public WoodworkingDetector() {
|
||||
PorterStemmer ps = new PorterStemmer();
|
||||
|
||||
|
@ -4,8 +4,8 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
@ -84,7 +84,7 @@ public class CrawlJobExtractorMain {
|
||||
Driver driver = new Driver();
|
||||
var outFile = Path.of(args[0]);
|
||||
|
||||
Gson gson = new GsonBuilder().create();
|
||||
Gson gson = GsonFactory.get();
|
||||
String[] targetDomains = Arrays.stream(args).skip(1).toArray(String[]::new);
|
||||
|
||||
|
||||
@ -103,7 +103,7 @@ public class CrawlJobExtractorMain {
|
||||
}
|
||||
|
||||
public static void writeSpec(Path outFile, String domain, List<String> urls) throws IOException {
|
||||
Gson gson = new GsonBuilder().create();
|
||||
Gson gson = GsonFactory.get();
|
||||
|
||||
try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
|
||||
var job = new CrawlingSpecification();
|
||||
|
@ -4,15 +4,15 @@ import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.common.hash.HashFunction;
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.util.ranking.RankingDomainFetcher;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.configuration.module.DatabaseModule;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
|
||||
import nu.marginalia.util.ranking.BetterReversePageRank;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
@ -23,7 +23,7 @@ import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class CrawlJobExtractorPageRankMain {
|
||||
|
||||
@ -72,7 +72,7 @@ public class CrawlJobExtractorPageRankMain {
|
||||
Driver driver = new Driver();
|
||||
var outFile = Path.of(args[0]);
|
||||
|
||||
Gson gson = new GsonBuilder().create();
|
||||
Gson gson = GsonFactory.get();
|
||||
|
||||
var ds = new DatabaseModule().provideConnection();
|
||||
var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
|
||||
|
@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
|
||||
|
||||
@ -13,9 +13,13 @@ import java.io.InputStreamReader;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class CrawledDomainReader {
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
private final Gson gson = GsonFactory.get();
|
||||
|
||||
private final ForkJoinPool pool = new ForkJoinPool(4);
|
||||
|
||||
public CrawledDomainReader() {
|
||||
}
|
||||
@ -43,7 +47,12 @ public class CrawledDomainReader {
                if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                    domain = gson.fromJson(nextLine, CrawledDomain.class);
                } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
                    docs.add(gson.fromJson(nextLine, CrawledDocument.class));
                    pool.execute(() -> {
                        var doc = gson.fromJson(nextLine, CrawledDocument.class);
                        synchronized (docs) {
                            docs.add(doc);
                        }
                    });
                }
            } else if (line.charAt(0) == '{') {
                domain = gson.fromJson(line, CrawledDomain.class);
@ -52,6 +61,8 @@
            }
        }

        pool.awaitQuiescence(10, TimeUnit.SECONDS);

        if (domain == null) {
            return null;
        }
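Aside (not part of the change set): the reader now parses each document on a small ForkJoinPool and collects results under a lock, waiting for quiescence before returning. A standalone sketch of that pattern; the payloads and class name are illustrative and the trim() stands in for the JSON deserialization.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

class ParallelParseSketch {
    public static void main(String[] args) {
        ForkJoinPool pool = new ForkJoinPool(4);
        List<String> docs = new ArrayList<>();

        for (String line : List.of("{\"a\":1}", "{\"b\":2}", "{\"c\":3}")) {
            pool.execute(() -> {
                String parsed = line.trim();       // stands in for gson.fromJson(line, CrawledDocument.class)
                synchronized (docs) {              // ArrayList is not thread-safe
                    docs.add(parsed);
                }
            });
        }

        pool.awaitQuiescence(10, TimeUnit.SECONDS); // wait for the queued parse tasks to finish
        System.out.println(docs.size());            // 3
    }
}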
@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
|
||||
|
||||
import com.github.luben.zstd.ZstdOutputStream;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -16,7 +16,7 @@ import java.nio.file.Path;
|
||||
|
||||
public class CrawledDomainWriter implements AutoCloseable {
|
||||
private final Path outputDir;
|
||||
private final Gson gson = new GsonBuilder().create();
|
||||
private final Gson gson = GsonFactory.get();
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class);
|
||||
private final Writer writer;
|
||||
private final Path outputFile;
|
||||
|
@ -2,16 +2,19 @@ package nu.marginalia.wmsa.edge.crawling;
|
||||
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.file.Path;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class CrawlerSpecificationLoader {
|
||||
private final static Gson gson = new GsonBuilder().create();
|
||||
private final static Gson gson = GsonFactory.get();
|
||||
|
||||
public static void readInputSpec(Path inputSpec, Consumer<CrawlingSpecification> consumer) {
|
||||
try (var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))))) {
|
||||
|
@ -1,7 +1,5 @@
|
||||
package nu.marginalia.wmsa.edge.crawling.blocklist;
|
||||
|
||||
import com.google.common.cache.Cache;
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import com.google.inject.Singleton;
|
||||
import com.opencsv.CSVReader;
|
||||
import com.opencsv.exceptions.CsvValidationException;
|
||||
@ -13,10 +11,7 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.InetAddress;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.TreeMap;
|
||||
|
||||
|
@ -6,5 +6,7 @ public enum CrawlerDocumentStatus {
    BAD_CHARSET,
    REDIRECT,
    ROBOTS_TXT,
    ERROR
    ERROR,
    BAD_CANONICAL,
    Timeout
}
@ -3,8 +3,9 @@ package nu.marginalia.wmsa.edge.data.dao;
|
||||
import com.google.inject.ImplementedBy;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
|
||||
@ -18,9 +19,9 @@ public interface EdgeDataStoreDao {
|
||||
|
||||
List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist, int set);
|
||||
|
||||
List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId);
|
||||
List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlId);
|
||||
|
||||
List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
|
||||
List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);
|
||||
|
||||
EdgeDomain getDomain(EdgeId<EdgeDomain> id);
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
package nu.marginalia.wmsa.edge.data.dao;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.common.cache.Cache;
|
||||
import com.google.common.cache.CacheBuilder;
|
||||
import com.google.common.util.concurrent.UncheckedExecutionException;
|
||||
@ -8,9 +9,10 @@ import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgePageScoreAdjustment;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
@ -63,17 +65,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
}
|
||||
|
||||
private <T> String idList(List<EdgeId<T>> ids) {
|
||||
private <T> String idList(EdgeIdCollection<EdgeUrl> ids) {
|
||||
StringJoiner j = new StringJoiner(",", "(", ")");
|
||||
for (var id : ids) {
|
||||
j.add(Integer.toString(id.id()));
|
||||
for (var id : ids.values()) {
|
||||
j.add(Integer.toString(id));
|
||||
}
|
||||
return j.toString();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
|
||||
public List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids) {
|
||||
if (ids.isEmpty()) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
@ -110,12 +112,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
rsp.getInt(11), // dataHash
|
||||
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
|
||||
Integer.MAX_VALUE, // rankingId
|
||||
Double.MAX_VALUE, // termScore
|
||||
0 // queryLength
|
||||
Double.MAX_VALUE // termScore
|
||||
);
|
||||
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
|
||||
result.add(val);
|
||||
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
|
||||
&& Strings.isNullOrEmpty(val.description)
|
||||
&& val.url.path.length() > 1) {
|
||||
continue;
|
||||
}
|
||||
result.add(val);
|
||||
|
||||
}
|
||||
}
|
||||
@ -267,7 +271,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlIds) {
|
||||
public List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlIds) {
|
||||
if (urlIds.isEmpty())
|
||||
return Collections.emptyList();
|
||||
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.data.dao.task;
|
||||
import com.google.inject.ImplementedBy;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
|
||||
@ImplementedBy(EdgeDomainBlacklistImpl.class)
|
||||
public interface EdgeDomainBlacklist {
|
||||
|
@ -9,7 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.dating;
|
||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
@ -4,12 +4,14 @@ import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.Query;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import java.util.concurrent.locks.ReadWriteLock;
|
||||
@ -104,47 +106,49 @@ public class EdgeIndexBucket {
|
||||
return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
||||
}
|
||||
|
||||
public LongStream getQuery(IndexBlock block, LongPredicate filter, IndexSearchBudget budget, EdgeIndexSearchTerms searchTerms) {
|
||||
public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
||||
if (null == indexReader) {
|
||||
logger.warn("Index reader not ready {}", block);
|
||||
return LongStream.empty();
|
||||
return new IndexQuery(Collections.emptyList());
|
||||
}
|
||||
|
||||
var orderedIncludes = searchTerms.includes
|
||||
final int[] orderedIncludes = searchTerms.includes
|
||||
.stream()
|
||||
.sorted(Comparator.comparingLong(i -> indexReader.numHits(block, i)))
|
||||
.sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
|
||||
.distinct()
|
||||
.mapToInt(Integer::intValue)
|
||||
.toArray();
|
||||
|
||||
Query query;
|
||||
IndexQueryFactory.IndexQueryBuilder query;
|
||||
|
||||
if (orderedIncludes.length == 1) {
|
||||
query = indexReader.findUnderspecified(block, budget, filter, orderedIncludes[0]);
|
||||
query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
|
||||
if (query == null) {
|
||||
return new IndexQuery(Collections.emptyList());
|
||||
}
|
||||
else {
|
||||
query = indexReader.findWord(block, budget, filter, orderedIncludes[0]);
|
||||
}
|
||||
int i;
|
||||
for (i = 1; (i < 3 && i < orderedIncludes.length) || i < orderedIncludes.length-1; i++) {
|
||||
query = query.alsoCached(orderedIncludes[i]);
|
||||
}
|
||||
for (; i < orderedIncludes.length; i++) {
|
||||
|
||||
query.filter(filter);
|
||||
|
||||
for (int i = 1; i < orderedIncludes.length; i++) {
|
||||
query = query.also(orderedIncludes[i]);
|
||||
}
|
||||
|
||||
for (int term : searchTerms.excludes) {
|
||||
query = query.not(term);
|
||||
}
|
||||
|
||||
return query.stream();
|
||||
for (int term : orderedIncludes) {
|
||||
query.prioritize(term);
|
||||
}
|
||||
|
||||
return query.build();
|
||||
}
|
||||
|
||||
|
||||
public IndexBlock getTermScore(int termId, long urlId) {
|
||||
return indexReader.getBlockForResult(termId, urlId);
|
||||
public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
|
||||
return indexReader.getBlockForResult(cachePool, termId, urlId);
|
||||
}
|
||||
|
||||
public boolean isTermInBucket(IndexBlock block, int termId, long urlId) {
|
||||
return indexReader.isTermInBucket(block, termId, urlId);
|
||||
public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
|
||||
return indexReader.isTermInBucket(cachePool, block, termId, urlId);
|
||||
}
|
||||
}
|
||||
|
@ -1,76 +1,31 @@
|
||||
package nu.marginalia.wmsa.edge.index;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.name.Named;
|
||||
import gnu.trove.map.TLongIntMap;
|
||||
import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.map.hash.TLongIntHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
||||
import nu.marginalia.util.ListChunker;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||
import nu.marginalia.wmsa.configuration.server.Service;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
||||
import nu.marginalia.wmsa.edge.model.search.*;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||
import org.apache.http.HttpStatus;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
|
||||
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.HaltException;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.LongPredicate;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
import static spark.Spark.get;
|
||||
import static spark.Spark.halt;
|
||||
|
||||
public class EdgeIndexService extends Service {
|
||||
private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@NotNull
|
||||
private final Initialization init;
|
||||
private final SearchIndexes indexes;
|
||||
private final KeywordLexicon keywordLexicon;
|
||||
|
||||
private final Gson gson = new GsonBuilder()
|
||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
||||
.create();
|
||||
|
||||
private static final Histogram wmsa_edge_index_query_time
|
||||
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
|
||||
private static final Counter wmsa_edge_index_query_count
|
||||
= Counter.build().name("wmsa_edge_index_query_count").help("-").register();
|
||||
private static final Histogram wmsa_edge_index_put_words_time
|
||||
= Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
|
||||
|
||||
public static final int DYNAMIC_BUCKET_LENGTH = 7;
|
||||
|
||||
@ -81,71 +36,34 @@ public class EdgeIndexService extends Service {
|
||||
Initialization init,
|
||||
MetricsServer metricsServer,
|
||||
SearchIndexes indexes,
|
||||
IndexServicesFactory servicesFactory) {
|
||||
|
||||
EdgeIndexOpsService opsService,
|
||||
EdgeIndexLexiconService lexiconService,
|
||||
EdgeIndexQueryService indexQueryService)
|
||||
{
|
||||
super(ip, port, init, metricsServer);
|
||||
|
||||
final Gson gson = GsonFactory.get();
|
||||
|
||||
this.init = init;
|
||||
this.indexes = indexes;
|
||||
this.keywordLexicon = servicesFactory.getKeywordLexicon();
|
||||
|
||||
Spark.post("/words/", this::putWords);
|
||||
Spark.post("/search/", this::search, gson::toJson);
|
||||
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
|
||||
Spark.post("/words/", lexiconService::putWords);
|
||||
|
||||
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
|
||||
Spark.post("/search/", indexQueryService::search, gson::toJson);
|
||||
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
|
||||
|
||||
Spark.post("/ops/repartition", this::repartitionEndpoint);
|
||||
Spark.post("/ops/preconvert", this::preconvertEndpoint);
|
||||
Spark.post("/ops/reindex/:id", this::reindexEndpoint);
|
||||
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
|
||||
|
||||
Spark.post("/ops/repartition", opsService::repartitionEndpoint);
|
||||
Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
|
||||
Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);
|
||||
|
||||
get("/is-blocked", this::isBlocked, gson::toJson);
|
||||
|
||||
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
|
||||
}
|
||||
|
||||
private Object getWordId(Request request, Response response) {
|
||||
final String word = request.splat()[0];
|
||||
|
||||
var dr = indexes.getDictionaryReader();
|
||||
if (null == dr) {
|
||||
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
|
||||
return "";
|
||||
}
|
||||
|
||||
final int wordId = dr.get(word);
|
||||
|
||||
if (DictionaryHashMap.NO_VALUE == wordId) {
|
||||
response.status(404);
|
||||
return "";
|
||||
}
|
||||
|
||||
return wordId;
|
||||
}
|
||||
|
||||
private Object repartitionEndpoint(Request request, Response response) {
|
||||
|
||||
if (!indexes.repartition()) {
|
||||
Spark.halt(503, "Operations busy");
|
||||
}
|
||||
return "OK";
|
||||
}
|
||||
|
||||
private Object preconvertEndpoint(Request request, Response response) {
|
||||
if (!indexes.preconvert()) {
|
||||
Spark.halt(503, "Operations busy");
|
||||
}
|
||||
return "OK";
|
||||
}
|
||||
|
||||
private Object reindexEndpoint(Request request, Response response) {
|
||||
int id = Integer.parseInt(request.params("id"));
|
||||
|
||||
if (!indexes.reindex(id)) {
|
||||
Spark.halt(503, "Operations busy");
|
||||
}
|
||||
return "OK";
|
||||
}
|
||||
|
||||
private Object isBlocked(Request request, Response response) {
|
||||
return indexes.isBusy() || !initialized;
|
||||
}
|
||||
@ -162,352 +80,7 @@ public class EdgeIndexService extends Service {
|
||||
indexes.initialize(init);
|
||||
}
|
||||
|
||||
private Object putWords(Request request, Response response) {
|
||||
var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
|
||||
|
||||
synchronized (this) {
|
||||
putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
|
||||
putWordsRequest.wordSet, putWordsRequest.getIndex());
|
||||
}
|
||||
|
||||
response.status(HttpStatus.SC_ACCEPTED);
|
||||
return "";
|
||||
}
|
||||
|
||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
||||
EdgePageWordSet wordSet, int idx
|
||||
) {
|
||||
|
||||
wmsa_edge_index_put_words_time.time(() -> {
|
||||
for (EdgePageWords words : wordSet.values()) {
|
||||
putWords(domainId, urlId, words, idx);
|
||||
}
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
||||
EdgePageWords words, int idx
|
||||
) {
|
||||
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
||||
|
||||
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
|
||||
|
||||
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
||||
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
|
||||
|
||||
indexWriter.put(header, entry);
|
||||
};
|
||||
}
|
||||
|
||||
private long[] getOrInsertWordIds(List<String> words) {
|
||||
return words.stream()
|
||||
.filter(w -> w.getBytes().length < Byte.MAX_VALUE)
|
||||
.mapToLong(keywordLexicon::getOrInsert)
|
||||
.toArray();
|
||||
}
|
||||
|
||||
private Object searchDomain(Request request, Response response) {
|
||||
if (indexes.getDictionaryReader() == null) {
|
||||
logger.warn("Dictionary reader not yet initialized");
|
||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||
}
|
||||
|
||||
String json = request.body();
|
||||
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
|
||||
|
||||
final int wordId = keywordLexicon.getReadOnly(specsSet.keyword);
|
||||
|
||||
List<EdgeId<EdgeUrl>> urlIds = indexes
|
||||
.getBucket(specsSet.bucket)
|
||||
.findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
|
||||
.mapToObj(lv -> new EdgeId<EdgeUrl>((int)(lv & 0xFFFF_FFFFL)))
|
||||
.toList();
|
||||
|
||||
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
|
||||
}
|
||||
|
||||
private Object search(Request request, Response response) {
|
||||
if (indexes.getDictionaryReader() == null) {
|
||||
logger.warn("Dictionary reader not yet initialized");
|
||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||
}
|
||||
|
||||
String json = request.body();
|
||||
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
||||
|
||||
long start = System.currentTimeMillis();
|
||||
try {
|
||||
if (specsSet.isStagger()) {
|
||||
return new EdgeSearchResultSet(searchStaggered(specsSet));
|
||||
}
|
||||
else {
|
||||
return new EdgeSearchResultSet(searchStraight(specsSet));
|
||||
}
|
||||
}
|
||||
catch (HaltException ex) {
|
||||
logger.warn("Halt", ex);
|
||||
throw ex;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
||||
logger.info("Error", ex);
|
||||
Spark.halt(500, "Error");
|
||||
return null;
|
||||
}
|
||||
finally {
|
||||
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
|
||||
wmsa_edge_index_query_count.inc();
|
||||
}
|
||||
}
|
||||
|
||||
private Map<IndexBlock, List<EdgeSearchResults>> searchStaggered(EdgeSearchSpecification specsSet) {
|
||||
int count = 0;
|
||||
|
||||
final Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
|
||||
final TIntHashSet seenResults = new TIntHashSet();
|
||||
|
||||
final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] {
|
||||
new DomainResultCountFilter(specsSet.limitByDomain),
|
||||
new DomainResultCountFilter(specsSet.limitByDomain)
|
||||
};
|
||||
|
||||
final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
||||
final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket);
|
||||
|
||||
for (int i = 0; i < specsSet.buckets.size(); i+=2) {
|
||||
for (var sq : specsSet.subqueries) {
|
||||
for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) {
|
||||
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
|
||||
|
||||
if (searchTerms.isEmpty())
|
||||
continue;
|
||||
|
||||
var result = performSearch(searchTerms.get(),
|
||||
budget,
|
||||
seenResults,
|
||||
domainCountFilter[j],
|
||||
sq,
|
||||
List.of(specsSet.buckets.get(i+j)),
|
||||
specsSet,
|
||||
Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count)
|
||||
);
|
||||
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum());
|
||||
}
|
||||
|
||||
int sz = result.size();
|
||||
count += sz;
|
||||
limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz);
|
||||
|
||||
if (sz > 0) {
|
||||
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
@NotNull
|
||||
private Map<IndexBlock, List<EdgeSearchResults>> searchStraight(EdgeSearchSpecification specsSet) {
|
||||
Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
|
||||
int count = 0;
|
||||
TIntHashSet seenResults = new TIntHashSet();
|
||||
|
||||
final DomainResultCountFilter domainCountFilter = new DomainResultCountFilter(specsSet.limitByDomain);
|
||||
|
||||
IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
||||
for (var sq : specsSet.subqueries) {
|
||||
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
|
||||
|
||||
if (searchTerms.isEmpty())
|
||||
continue;
|
||||
|
||||
var result = performSearch(searchTerms.get(),
|
||||
budget, seenResults, domainCountFilter,
|
||||
sq, specsSet.buckets, specsSet,
|
||||
specsSet.limitTotal - count);
|
||||
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size());
|
||||
}
|
||||
|
||||
count += result.size();
|
||||
if (result.size() > 0) {
|
||||
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms,
|
||||
IndexSearchBudget budget,
|
||||
TIntHashSet seenResults,
|
||||
DomainResultCountFilter domainCountFilter,
|
||||
EdgeSearchSubquery sq,
|
||||
List<Integer> specBuckets,
|
||||
EdgeSearchSpecification specs,
|
||||
int limit)
|
||||
{
|
||||
if (limit <= 0) {
|
||||
return new EdgeSearchResults();
|
||||
}
|
||||
|
||||
final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
|
||||
final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);
|
||||
|
||||
for (int i : specBuckets) {
|
||||
int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
|
||||
|
||||
if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit)
|
||||
break;
|
||||
|
||||
List<EdgeSearchResultItem> resultsForBucket = new ArrayList<>(specs.limitByBucket);
|
||||
|
||||
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
|
||||
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
|
||||
.filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri))
|
||||
.limit(specs.limitTotal * 3L)
|
||||
.distinct()
|
||||
.limit(Math.min(specs.limitByBucket
|
||||
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
|
||||
.forEach(resultsForBucket::add);
|
||||
|
||||
|
||||
for (var result : resultsForBucket) {
|
||||
seenResults.add(result.url.id());
|
||||
}
|
||||
for (var result : resultsForBucket) {
|
||||
for (var searchTerm : sq.searchTermsInclude) {
|
||||
result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId()));
|
||||
}
|
||||
}
|
||||
|
||||
domainCountFilter.addAll(i, resultsForBucket);
|
||||
|
||||
if (!resultsForBucket.isEmpty()) {
|
||||
results.put(i, resultsForBucket);
|
||||
}
|
||||
}
|
||||
|
||||
return new EdgeSearchResults(results);
|
||||
}
|
||||
|
||||
private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) {
|
||||
final int termId = indexes.getDictionaryReader().get(term);
|
||||
|
||||
var bucket = indexes.getBucket(bucketId);
|
||||
|
||||
return new EdgeSearchResultKeywordScore(term,
|
||||
bucket.getTermScore(termId, urlId),
|
||||
bucket.isTermInBucket(IndexBlock.Title, termId, urlId),
|
||||
bucket.isTermInBucket(IndexBlock.Link, termId, urlId)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
|
||||
int queryDepth, int minHitCount, int maxResults) {
|
||||
if (!indexes.isValidBucket(bucket)) {
|
||||
logger.warn("Invalid bucket {}", bucket);
|
||||
return LongStream.empty();
|
||||
}
|
||||
|
||||
return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
||||
}
|
||||
|
||||
private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
|
||||
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
||||
if (!indexes.isValidBucket(bucket)) {
|
||||
logger.warn("Invalid bucket {}", bucket);
|
||||
return LongStream.empty();
|
||||
}
|
||||
return indexes.getBucket(bucket).getQuery(block, filter, budget, searchTerms);
|
||||
}
|
||||
|
||||
static class DomainResultCountFilter {
|
||||
final TLongIntMap resultsByDomain = new TLongIntHashMap(200, 0.75f, -1, 0);
|
||||
final int limitByDomain;
|
||||
|
||||
DomainResultCountFilter(int limitByDomain) {
|
||||
this.limitByDomain = limitByDomain;
|
||||
}
|
||||
|
||||
public boolean filterRawValue(int bucket, long value) {
|
||||
var domain = new EdgeId<EdgeDomain>((int)(value >>> 32));
|
||||
|
||||
if (domain.id() == Integer.MAX_VALUE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return resultsByDomain.get(getKey(bucket, domain)) <= limitByDomain;
|
||||
}
|
||||
|
||||
long getKey(int bucket, EdgeId<EdgeDomain> id) {
|
||||
return ((long)bucket) << 32 | id.id();
|
||||
}
|
||||
|
||||
public boolean test(int bucket, EdgeSearchResultItem item) {
|
||||
if (item.domain.id() == Integer.MAX_VALUE) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
|
||||
}
|
||||
|
||||
int getCount(int bucket, EdgeSearchResultItem item) {
|
||||
return resultsByDomain.get(getKey(bucket, item.domain));
|
||||
}
|
||||
|
||||
public void addAll(int bucket, List<EdgeSearchResultItem> items) {
|
||||
items.forEach(item -> {
|
||||
resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1);
|
||||
});
|
||||
}
|
||||
|
||||
public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) {
|
||||
if (item.domain.id() == Integer.MAX_VALUE) {
|
||||
return true;
|
||||
}
|
||||
return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
|
||||
}
|
||||
}
|
||||
|
||||
private Optional<EdgeIndexSearchTerms> getSearchTerms(EdgeSearchSubquery request) {
|
||||
final List<Integer> excludes = new ArrayList<>();
|
||||
final List<Integer> includes = new ArrayList<>();
|
||||
|
||||
for (var include : request.searchTermsInclude) {
|
||||
var word = lookUpWord(include);
|
||||
if (word.isEmpty()) {
|
||||
logger.debug("Unknown search term: " + include);
|
||||
return Optional.empty();
|
||||
}
|
||||
includes.add(word.getAsInt());
|
||||
}
|
||||
|
||||
for (var exclude : request.searchTermsExclude) {
|
||||
lookUpWord(exclude).ifPresent(excludes::add);
|
||||
}
|
||||
|
||||
if (includes.isEmpty()) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
return Optional.of(new EdgeIndexSearchTerms(includes, excludes));
|
||||
}
|
||||
|
||||
private OptionalInt lookUpWord(String s) {
|
||||
int ret = indexes.getDictionaryReader().get(s);
|
||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
return OptionalInt.of(ret);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,67 +1,65 @@
|
||||
package nu.marginalia.wmsa.edge.index.client;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.inject.Singleton;
|
||||
import io.prometheus.client.Summary;
|
||||
import io.reactivex.rxjava3.core.Observable;
|
||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
||||
import nu.marginalia.wmsa.client.HttpStatusCode;
|
||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||
import nu.marginalia.wmsa.configuration.server.Context;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
|
||||
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
|
||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
|
||||
|
||||
import javax.annotation.CheckReturnValue;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
@Singleton
|
||||
public class EdgeIndexClient extends AbstractDynamicClient {
|
||||
private final Gson gson = new GsonBuilder()
|
||||
.create();
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
|
||||
|
||||
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
|
||||
|
||||
public EdgeIndexClient() {
|
||||
super(ServiceDescriptor.EDGE_INDEX);
|
||||
setTimeout(30);
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
|
||||
EdgePageWordSet wordSet, int writer
|
||||
)
|
||||
@Override
|
||||
public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
|
||||
DocumentKeywords wordSet, int writer
|
||||
)
|
||||
{
|
||||
EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);
|
||||
|
||||
return this.post(ctx, "/words/", request);
|
||||
var keywordBuilder =
|
||||
IndexPutKeywordsReq.newBuilder()
|
||||
.setDomain(domain.id())
|
||||
.setUrl(url.id())
|
||||
.setIndex(writer);
|
||||
|
||||
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
|
||||
wordSetBuilder.setIndex(wordSet.block().ordinal());
|
||||
wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
|
||||
keywordBuilder.addWordSet(wordSetBuilder.build());
|
||||
|
||||
var req = keywordBuilder.build();
|
||||
|
||||
this.post(ctx, "/words/", req).blockingSubscribe();
|
||||
}
|
||||
|
||||
|
||||
@CheckReturnValue
|
||||
public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) {
|
||||
return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst();
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
public List<EdgeSearchResultSet> multiQuery(Context ctx, EdgeSearchSpecification... specs) {
|
||||
|
||||
return Observable.fromArray(specs)
|
||||
.concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class)
|
||||
.subscribeOn(Schedulers.io())
|
||||
.timeout(1, TimeUnit.SECONDS)
|
||||
.onErrorComplete())
|
||||
.toList()
|
||||
.blockingGet();
|
||||
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
|
||||
return wmsa_search_index_api_time.time(
|
||||
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
|
||||
);
|
||||
}
|
||||
|
||||
@CheckReturnValue
|
||||
|
@ -0,0 +1,88 @@
package nu.marginalia.wmsa.edge.index.client;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import nu.marginalia.util.ListChunker;
import nu.marginalia.util.dict.DictionaryHashMap;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;

@Singleton
public class EdgeIndexLocalService implements EdgeIndexWriterClient {

    private final KeywordLexicon lexicon;
    private final SearchIndexJournalWriterImpl indexWriter;
    private static final Logger logger = LoggerFactory.getLogger(EdgeIndexLocalService.class);

    @Inject
    public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
        long hashMapSize = 1L << 31;

        if (Boolean.getBoolean("small-ram")) {
            hashMapSize = 1L << 27;
        }

        var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
        lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
        indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
    }

    public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
                         DocumentKeywords wordSet, int writer) {
        if (wordSet.keywords().length == 0)
            return;

        if (domain.id() <= 0 || url.id() <= 0) {
            logger.warn("Bad ID: {}:{}", domain, url);
            return;
        }

        for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {

            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
            var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());

            indexWriter.put(header, entry);
        }

    }

    private long[] getOrInsertWordIds(List<String> words) {
        long[] ids = new long[words.size()];
        int putId = 0;

        for (String word : words) {
            long id = lexicon.getOrInsert(word);
            if (id != DictionaryHashMap.NO_VALUE) {
                ids[putId++] = id;
            }
        }

        if (putId != words.size()) {
            ids = Arrays.copyOf(ids, putId);
        }
        return ids;
    }

    @Override
    public void close() throws Exception {
        indexWriter.close();
        lexicon.close();
    }
}
@ -0,0 +1,13 @@
package nu.marginalia.wmsa.edge.index.client;

import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;

public interface EdgeIndexWriterClient extends AutoCloseable {

    void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
                  DocumentKeywords wordSets, int writer);
}
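Aside (not part of the change set): both EdgeIndexClient and the new EdgeIndexLocalService implement this interface, so keyword writes can go either over the index service API or straight into a local journal. A hedged construction sketch; the temporary directory is an assumption, the real path is normally injected via @Named("local-index-path").

import java.nio.file.Files;
import java.nio.file.Path;

class LocalIndexWriterSketch {
    public static void main(String[] args) throws Exception {
        Path indexDir = Files.createTempDirectory("local-index");   // hypothetical location
        try (EdgeIndexWriterClient writer = new EdgeIndexLocalService(indexDir)) {
            // writer.putWords(ctx, domainId, urlId, keywords, 0);  // see EdgeIndexLocalService.putWords above
        }
    }
}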
@ -15,10 +15,11 @@ import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.channels.FileChannel;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
private final KeywordLexicon dictionaryWriter;
|
||||
private final KeywordLexicon lexicon;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Disposable writerTask;
|
||||
@ -30,12 +31,14 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
private long pos;
|
||||
|
||||
@SneakyThrows
|
||||
public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) {
|
||||
this.dictionaryWriter = dictionaryWriter;
|
||||
public SearchIndexJournalWriterImpl(KeywordLexicon lexicon, File indexFile) {
|
||||
this.lexicon = lexicon;
|
||||
initializeIndexFile(indexFile);
|
||||
|
||||
byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);
|
||||
|
||||
new Thread(this::journalWriterThread, "Journal Writer").start();
|
||||
|
||||
writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
|
||||
Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
|
||||
}
|
||||
@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
}
|
||||
}
|
||||
|
||||
private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
|
||||
private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
|
||||
public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
|
||||
writeQueue.put(new WriteJob(header, entryData));
|
||||
}
|
||||
|
||||
byteBuffer.clear();
|
||||
@SneakyThrows
|
||||
public void journalWriterThread() {
|
||||
|
||||
byteBuffer.putInt(entryData.size());
|
||||
byteBuffer.putInt(header.block().id);
|
||||
byteBuffer.putLong(header.documentId());
|
||||
while (true) {
|
||||
var job = writeQueue.take();
|
||||
|
||||
entryData.write(byteBuffer);
|
||||
writeEntry(job.header, job.entryData);
|
||||
}
|
||||
}
|
||||
private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
|
||||
|
||||
byteBuffer.limit(byteBuffer.position());
|
||||
byteBuffer.rewind();
|
||||
try {
|
||||
byteBuffer.clear();
|
||||
|
||||
while (byteBuffer.position() < byteBuffer.limit())
|
||||
channel.write(byteBuffer);
|
||||
byteBuffer.putInt(entryData.size());
|
||||
byteBuffer.putInt(header.block().id);
|
||||
byteBuffer.putLong(header.documentId());
|
||||
|
||||
writePositionMarker();
|
||||
entryData.write(byteBuffer);
|
||||
|
||||
byteBuffer.limit(byteBuffer.position());
|
||||
byteBuffer.rewind();
|
||||
|
||||
while (byteBuffer.position() < byteBuffer.limit())
|
||||
channel.write(byteBuffer);
|
||||
|
||||
writePositionMarker();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -90,17 +113,15 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
|
||||
|
||||
@Override
|
||||
public void flushWords() {
|
||||
dictionaryWriter.commitToDisk();
|
||||
lexicon.commitToDisk();
|
||||
}
|
||||
|
||||
private void writePositionMarker() throws IOException {
|
||||
var lock = channel.lock(0, 16, false);
|
||||
pos = channel.size();
|
||||
raf.seek(0);
|
||||
raf.writeLong(pos);
|
||||
raf.writeLong(dictionaryWriter.size());
|
||||
raf.writeLong(lexicon.size());
|
||||
raf.seek(pos);
|
||||
lock.release();
|
||||
}
|
||||
|
||||
public synchronized void close() throws IOException {
|
||||
|
@ -2,15 +2,22 @@ package nu.marginalia.wmsa.edge.index.journal.model;

import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId;

public record SearchIndexJournalEntryHeader(int entrySize, long documentId, IndexBlock block) {

    public static final int HEADER_SIZE_LONGS = 2;

    public SearchIndexJournalEntryHeader(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block) {
        this(-1, (long) domainId.id() << 32 | urlId.id(), block);
        this(-1, combineIds(domainId, urlId), block);
    }

    private static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
        long did = domainId.id();
        long uid = urlId.id();

        return (did << 32L) | uid;
    }

}
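The documentId above packs the domain id into the high 32 bits and the url id into the low 32 bits; a short sketch of reading the two halves back out (helper names are illustrative, and both ids are assumed non-negative, which the writer enforces before combining them):

static int domainIdOf(long documentId) {
    return (int) (documentId >>> 32);
}

static int urlIdOf(long documentId) {
    return (int) (documentId & 0xFFFF_FFFFL);
}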
@ -53,7 +53,7 @@ public class KeywordLexicon implements AutoCloseable {
    @SneakyThrows
    private int getOrInsert(byte[] bytes) {
        if (bytes.length >= Byte.MAX_VALUE) {
            logger.warn("getOrInsert({}), illegal length {}", bytes, bytes.length);
            logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
            return DictionaryHashMap.NO_VALUE;
        }

@ -15,7 +15,7 @@ import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Consumer;

public class KeywordLexiconJournalFile {
public class KeywordLexiconJournalFile implements AutoCloseable {
    private final RandomAccessFile journalFileRAF;
    private final File journalFile;
    private final Logger logger = LoggerFactory.getLogger(getClass());

@ -9,4 +9,8 @@ import java.util.List;
public class EdgeIndexSearchTerms {
    public List<Integer> includes = new ArrayList<>();
    public List<Integer> excludes = new ArrayList<>();

    public boolean isEmpty() {
        return includes.isEmpty();
    }
}

@ -4,17 +4,17 @@ import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.id.EdgeId;

@AllArgsConstructor @Getter
@ToString
public class EdgePutWordsRequest {
    public final EdgeId<EdgeDomain> domainId;
    public final EdgeId<EdgeUrl> urlId;
    public final double quality;
    public EdgeId<EdgeDomain> domainId;
    public EdgeId<EdgeUrl> urlId;
    public double quality;

    public final EdgePageWordSet wordSet;
    public EdgePageWordSet wordSet;
    private int index = 0;
}

@ -1,23 +1,36 @@
package nu.marginalia.wmsa.edge.index.model;

public enum IndexBlock {
    TitleKeywords(0, 0),
    Title(1, 1),
    Link(2, 1.25),
    Top(3, 2),
    Middle(4, 3),
    Low(5, 4),
    Words(6, 6),
    Meta(7, 7),
    PositionWords(8, 4.5),
    NamesWords(9, 5),
    Artifacts(10, 10),
    Topic(11, 0.5);
    TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
    Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),

    Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),

    Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
    NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),

    Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
    Meta(IndexBlockType.PAGE_DATA, 6, 7),

    Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
    Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
    Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),

    Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
    Words_2(IndexBlockType.PAGE_DATA, 11, 3.5),
    Words_4(IndexBlockType.PAGE_DATA, 12, 4.0),
    Words_8(IndexBlockType.PAGE_DATA, 13, 4.5),
    Words_16Plus(IndexBlockType.PAGE_DATA, 14, 7.0),

    Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
    ;

    public final IndexBlockType type;
    public final int id;
    public final double sortOrder;

    IndexBlock(int id, double sortOrder) {
    IndexBlock(IndexBlockType type, int id, double sortOrder) {
        this.type = type;
        this.sortOrder = sortOrder;
        this.id = id;
    }
@ -31,3 +44,5 @@ public enum IndexBlock {
            throw new IllegalArgumentException("Bad block id");
    }
}

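As a small illustration of how the new type and sortOrder fields can be consumed, a sketch (not code from this commit) that picks out the TF_IDF blocks in ascending ranking order:

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

class IndexBlockExample {
    // the TF_IDF blocks, cheapest sortOrder first
    static List<IndexBlock> tfidfBlocksBySortOrder() {
        return Arrays.stream(IndexBlock.values())
                .filter(block -> block.type == IndexBlockType.TF_IDF)
                .sorted(Comparator.comparingDouble(block -> block.sortOrder))
                .toList();
    }
}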
@ -0,0 +1,7 @@
package nu.marginalia.wmsa.edge.index.model;

public enum IndexBlockType {
    QUALITY_SIGNAL,
    TF_IDF,
    PAGE_DATA
}
@ -49,8 +49,8 @@ public class IndexWordsTable implements AutoCloseable {
    }

    public long positionForWord(int wordId) {

        long offset = reader.findEntry(header, wordId);

        if (offset < 0) {
            return -1L;
        }

@ -0,0 +1,43 @@
package nu.marginalia.wmsa.edge.index.reader;

import java.util.Arrays;

public class MicroCache {
    private final int[] keys;
    private final long[] data;
    private int pos = 0;

    public int hit;
    public int miss;
    public int full;

    public static final long BAD_VALUE = Long.MIN_VALUE;

    public MicroCache(int size) {
        keys = new int[size];
        data = new long[size];

        Arrays.fill(data, BAD_VALUE);
    }

    public long get(int key) {
        for (int i = 0; i < keys.length && data[i] != BAD_VALUE; i++) {
            if (keys[i] == key) {
                hit++;
                return data[i];
            }
        }
        miss++;
        return BAD_VALUE;
    }

    public void set(int key, long val) {
        keys[pos] = key;
        data[pos] = val;

        if (++pos >= keys.length) {
            full++;
            pos = 0;
        }
    }
}
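A short usage sketch of the ring-buffer cache above; the wordId and words variables stand in for whatever reader code ends up holding them:

MicroCache cache = new MicroCache(64);          // 64 slots, overwritten round-robin

long offset = cache.get(wordId);                // returns BAD_VALUE on a miss
if (offset == MicroCache.BAD_VALUE) {
    offset = words.positionForWord(wordId);     // fall back to the on-disk words table
    cache.set(wordId, offset);
}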
@ -9,6 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader;
|
||||
import nu.marginalia.util.btree.model.BTreeHeader;
|
||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -22,6 +25,7 @@ public class SearchIndex implements AutoCloseable {
|
||||
|
||||
private final MultimapFileLong urls;
|
||||
private final IndexWordsTable words;
|
||||
public final String name;
|
||||
private final RandomAccessFile wordsFile;
|
||||
private final BTreeReader bTreeReader;
|
||||
private final CachingBTreeReader cachingBTreeReader;
|
||||
@ -36,6 +40,7 @@ public class SearchIndex implements AutoCloseable {
|
||||
throws IOException {
|
||||
|
||||
logger = LoggerFactory.getLogger(name);
|
||||
this.name = name;
|
||||
wordsFile = new RandomAccessFile(inWords, "r");
|
||||
|
||||
logger.info("{} : Loading {}", name, inUrls);
|
||||
@ -65,26 +70,37 @@ public class SearchIndex implements AutoCloseable {
|
||||
}
|
||||
|
||||
|
||||
public long numUrls(int wordId) {
|
||||
public long numUrls(IndexQueryCachePool pool, int wordId) {
|
||||
int length = words.wordLength(wordId);
|
||||
if (length < 0) return 0;
|
||||
if (length > 0) return length;
|
||||
|
||||
return rangeForWord(wordId).numEntries();
|
||||
return rangeForWord(pool, wordId).numEntries();
|
||||
}
|
||||
|
||||
public UrlIndexTree rangeForWord(int wordId) {
|
||||
return new UrlIndexTree(words.positionForWord(wordId));
|
||||
public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
|
||||
IndexBTreeRange range = pool.getRange(words, wordId);
|
||||
|
||||
if (range == null) {
|
||||
range = new IndexBTreeRange(words.positionForWord(wordId));
|
||||
pool.cacheRange(words, wordId, range);
|
||||
}
|
||||
|
||||
return range;
|
||||
}
|
||||
|
||||
public class UrlIndexTree {
|
||||
final long dataOffset;
|
||||
public IndexBTreeRange rangeForWord(int wordId) {
|
||||
return new IndexBTreeRange(words.positionForWord(wordId));
|
||||
}
|
||||
|
||||
public class IndexBTreeRange {
|
||||
public final long dataOffset;
|
||||
private BTreeHeader header;
|
||||
public UrlIndexTree(long dataOffset) {
|
||||
public IndexBTreeRange(long dataOffset) {
|
||||
this.dataOffset = dataOffset;
|
||||
}
|
||||
|
||||
public LongStream stream() {
|
||||
public LongStream stream(int bufferSize) {
|
||||
if (dataOffset < 0) {
|
||||
return LongStream.empty();
|
||||
}
|
||||
@ -94,7 +110,7 @@ public class SearchIndex implements AutoCloseable {
|
||||
|
||||
long urlOffset = header.dataOffsetLongs();
|
||||
long endOffset = header.dataOffsetLongs() + header.numEntries();
|
||||
int stepSize = Math.min(1024, header.numEntries());
|
||||
int stepSize = Math.min(bufferSize, header.numEntries());
|
||||
|
||||
long[] buffer = new long[stepSize];
|
||||
|
||||
@ -107,6 +123,19 @@ public class SearchIndex implements AutoCloseable {
|
||||
});
|
||||
}
|
||||
|
||||
public EntrySource asEntrySource() {
|
||||
return new AsEntrySource();
|
||||
}
|
||||
|
||||
public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
|
||||
return new AsExcludeQueryFilterStep(pool);
|
||||
}
|
||||
|
||||
|
||||
public LongStream stream() {
|
||||
return stream(1024);
|
||||
}
|
||||
|
||||
public boolean isPresent() {
|
||||
return dataOffset >= 0;
|
||||
}
|
||||
@ -122,35 +151,95 @@ public class SearchIndex implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasUrl(long url) {
|
||||
if (header != null) {
|
||||
return bTreeReader.findEntry(header, url) >= 0;
|
||||
public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
|
||||
if (dataOffset < 0) return false;
|
||||
|
||||
return cachingBTreeReader.findEntry(cache, url) >= 0;
|
||||
}
|
||||
|
||||
public boolean hasUrl(IndexQueryCachePool pool, long url) {
|
||||
if (dataOffset < 0)
|
||||
return false;
|
||||
|
||||
CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
|
||||
|
||||
return cachingBTreeReader.findEntry(cache, url) >= 0;
|
||||
}
|
||||
|
||||
public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
|
||||
if (dataOffset < 0)
|
||||
return null;
|
||||
|
||||
if (header == null) {
|
||||
header = cachingBTreeReader.getHeader(dataOffset);
|
||||
}
|
||||
else if (dataOffset < 0) return false;
|
||||
else {
|
||||
header = bTreeReader.getHeader(dataOffset);
|
||||
return bTreeReader.findEntry(header, url) >= 0;
|
||||
|
||||
return cachingBTreeReader.prepareCache(header);
|
||||
}
|
||||
|
||||
class AsEntrySource implements EntrySource {
|
||||
long pos;
|
||||
final long endOffset;
|
||||
|
||||
public SearchIndex getIndex() {
|
||||
return SearchIndex.this;
|
||||
};
|
||||
|
||||
public AsEntrySource() {
|
||||
if (dataOffset <= 0) {
|
||||
pos = -1;
|
||||
endOffset = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
if (header == null) {
|
||||
header = bTreeReader.getHeader(dataOffset);
|
||||
}
|
||||
|
||||
pos = header.dataOffsetLongs();
|
||||
endOffset = header.dataOffsetLongs() + header.numEntries();
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public int read(long[] buffer, int n) {
|
||||
if (pos >= endOffset) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int rb = Math.min(n, (int)(endOffset - pos));
|
||||
urls.read(buffer, rb, pos);
|
||||
pos += rb;
|
||||
return rb;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
|
||||
if (header != null) {
|
||||
return cachingBTreeReader.findEntry(header, cache, url) >= 0;
|
||||
class AsExcludeQueryFilterStep implements QueryFilterStepIf {
|
||||
private final CachingBTreeReader.BTreeCachedIndex cache;
|
||||
|
||||
public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
|
||||
cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
|
||||
}
|
||||
else if (dataOffset < 0) return false;
|
||||
else {
|
||||
header = bTreeReader.getHeader(dataOffset);
|
||||
return cachingBTreeReader.findEntry(header, cache, url) >= 0;
|
||||
|
||||
public SearchIndex getIndex() {
|
||||
return SearchIndex.this;
|
||||
};
|
||||
public double cost() {
|
||||
return cache.getIndexedDataSize();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean test(long value) {
|
||||
return !hasUrl(cache, value);
|
||||
}
|
||||
|
||||
public String describe() {
|
||||
return "Exclude["+name+"]";
|
||||
}
|
||||
}
|
||||
|
||||
public CachingBTreeReader.Cache createIndexCache() {
|
||||
return cachingBTreeReader.prepareCache();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
urls.close();
|
||||
|
@ -3,9 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.index.reader.query.Query;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -19,18 +18,20 @@ import java.util.stream.Stream;
|
||||
public class SearchIndexReader implements AutoCloseable {
|
||||
|
||||
private final EnumMap<IndexBlock, SearchIndex> indices;
|
||||
|
||||
private final EnumMap<IndexBlock, IndexQueryBuilder> queryBuilders;
|
||||
private final EnumMap<IndexBlock, IndexQueryBuilder> underspecifiedQueryBuilders;
|
||||
private final EnumMap<IndexBlock, IndexQueryFactory> queryBuilders;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
|
||||
IndexBlock.Top,
|
||||
IndexBlock.Middle,
|
||||
IndexBlock.Low,
|
||||
IndexBlock.Words,
|
||||
IndexBlock.NamesWords,
|
||||
IndexBlock.Title,
|
||||
IndexBlock.Tfidf_Top,
|
||||
IndexBlock.Tfidf_Middle,
|
||||
IndexBlock.Tfidf_Lower,
|
||||
IndexBlock.Words_1,
|
||||
IndexBlock.Words_2,
|
||||
IndexBlock.Words_4,
|
||||
IndexBlock.Words_8,
|
||||
IndexBlock.Words_16Plus,
|
||||
};
|
||||
|
||||
@Inject
|
||||
@ -38,30 +39,33 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
EnumMap<IndexBlock, SearchIndex> indices) {
|
||||
this.indices = indices;
|
||||
|
||||
var lowIndex = indices.get(IndexBlock.Low);
|
||||
var midIndex = indices.get(IndexBlock.Middle);
|
||||
var topIndex = indices.get(IndexBlock.Top);
|
||||
var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
|
||||
var midIndex = indices.get(IndexBlock.Tfidf_Middle);
|
||||
var topIndex = indices.get(IndexBlock.Tfidf_Top);
|
||||
var linkIndex = indices.get(IndexBlock.Link);
|
||||
var titleIndex = indices.get(IndexBlock.Title);
|
||||
var namesIndex = indices.get(IndexBlock.NamesWords);
|
||||
var positionIndex = indices.get(IndexBlock.PositionWords);
|
||||
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
|
||||
var wordsIndex = indices.get(IndexBlock.Words);
|
||||
var siteIndex = indices.get(IndexBlock.Site);
|
||||
var metaIndex = indices.get(IndexBlock.Meta);
|
||||
var topicIndex = indices.get(IndexBlock.Topic);
|
||||
var topicIndex = indices.get(IndexBlock.Subjects);
|
||||
|
||||
var words1 = indices.get(IndexBlock.Words_1);
|
||||
var words2 = indices.get(IndexBlock.Words_2);
|
||||
var words4 = indices.get(IndexBlock.Words_4);
|
||||
var words8 = indices.get(IndexBlock.Words_8);
|
||||
var words16 = indices.get(IndexBlock.Words_16Plus);
|
||||
var artifacts = indices.get(IndexBlock.Artifacts);
|
||||
|
||||
queryBuilders = new EnumMap<>(IndexBlock.class);
|
||||
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
|
||||
|
||||
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
|
||||
List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
|
||||
List<SearchIndex> priorityIndices = listOfNonNulls(titleIndex, linkIndex, siteIndex, topIndex, topicIndex);
|
||||
|
||||
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
|
||||
queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices, priorityIndices));
|
||||
queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices, priorityIndices));
|
||||
queryBuilders.put(IndexBlock.Words_2, new IndexQueryFactory(listOfNonNulls(metaIndex, words2), excludeIndices, priorityIndices));
|
||||
queryBuilders.put(IndexBlock.Words_4, new IndexQueryFactory(listOfNonNulls(metaIndex, words4), excludeIndices, priorityIndices));
|
||||
queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices, priorityIndices));
|
||||
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices, priorityIndices));
|
||||
}
|
||||
|
||||
@SafeVarargs
|
||||
@ -99,27 +103,13 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
.limit(maxResults);
|
||||
}
|
||||
|
||||
public Query findUnderspecified(
|
||||
IndexBlock block,
|
||||
IndexSearchBudget budget,
|
||||
LongPredicate filter,
|
||||
int wordId) {
|
||||
|
||||
var builder = underspecifiedQueryBuilders.get(block);
|
||||
|
||||
if (null != builder) {
|
||||
return builder.buildUnderspecified(budget, filter, wordId);
|
||||
}
|
||||
return findWord(block, budget, filter, wordId);
|
||||
}
|
||||
|
||||
public Query findWord(IndexBlock block, IndexSearchBudget budget, LongPredicate filter, int wordId) {
|
||||
public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
|
||||
var builder = queryBuilders.get(block);
|
||||
|
||||
if (builder == null)
|
||||
return Query.EMPTY;
|
||||
return null;
|
||||
|
||||
return builder.build(budget, filter, wordId);
|
||||
return builder.buildQuery(cachePool, wordId);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -130,20 +120,20 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public long numHits(IndexBlock block, int word) {
|
||||
IndexQueryBuilder builder = queryBuilders.get(block);
|
||||
public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
|
||||
IndexQueryFactory builder = queryBuilders.get(block);
|
||||
|
||||
if (builder == null)
|
||||
return 0L;
|
||||
|
||||
long hits = 0;
|
||||
for (var index : builder.getIndicies()) {
|
||||
hits += index.numUrls(word);
|
||||
hits += index.numUrls(pool, word);
|
||||
}
|
||||
return hits;
|
||||
}
|
||||
|
||||
public IndexBlock getBlockForResult(int searchTerm, long urlId) {
|
||||
public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
|
||||
for (var block : indicesBySearchOrder) {
|
||||
var index = indices.get(block);
|
||||
|
||||
@ -151,21 +141,18 @@ public class SearchIndexReader implements AutoCloseable {
|
||||
continue;
|
||||
}
|
||||
|
||||
var range = index.rangeForWord(searchTerm);
|
||||
|
||||
if (range.hasUrl(urlId)) {
|
||||
if (cachePool.isUrlPresent(index, searchTerm, urlId))
|
||||
return block;
|
||||
}
|
||||
|
||||
}
|
||||
return IndexBlock.Words;
|
||||
|
||||
return IndexBlock.Words_16Plus;
|
||||
}
|
||||
|
||||
public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {
|
||||
public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
|
||||
final var index = indices.get(block);
|
||||
if (null == index) return false;
|
||||
|
||||
return index
|
||||
.rangeForWord(searchTerm)
|
||||
.hasUrl(urlId);
|
||||
return cachePool.isUrlPresent(index, searchTerm, urlId);
|
||||
}
|
||||
}
|
||||
|
@ -105,7 +105,7 @@ public class SearchIndexes {
    }

    @Nullable
    public KeywordLexiconReadOnlyView getDictionaryReader() {
    public KeywordLexiconReadOnlyView getLexiconReader() {
        return keywordLexiconReadOnlyView;
    }

@ -146,6 +146,7 @@ public class SearchIndexes {
    public EdgeIndexBucket getBucket(int bucketId) {
        return buckets[bucketId];
    }

    public boolean isValidBucket(int bucketId) {
        return bucketId >= 0 && bucketId < buckets.length;
    }

@ -1,151 +0,0 @@
|
||||
package nu.marginalia.wmsa.edge.index.reader.query;
|
||||
|
||||
import com.google.common.collect.Streams;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.function.LongPredicate;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public class IndexQueryBuilder {
|
||||
private final List<SearchIndex> requiredIndices;
|
||||
private final SearchIndex excludeIndex;
|
||||
|
||||
public Collection<SearchIndex> getIndicies() {
|
||||
return requiredIndices;
|
||||
}
|
||||
|
||||
public IndexQueryBuilder(List<SearchIndex> requiredIndices, SearchIndex excludeIndex) {
|
||||
this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList());
|
||||
this.excludeIndex = excludeIndex;
|
||||
}
|
||||
|
||||
public Query build(IndexSearchBudget budget,
|
||||
LongPredicate filter,
|
||||
int wordId) {
|
||||
return new QueryForIndices(budget, filter, wordId);
|
||||
}
|
||||
|
||||
// Special treatment for queries with few terms, prefer hits that appear in multiple buckets
|
||||
public Query buildUnderspecified(IndexSearchBudget budget, LongPredicate filter, int wordId) {
|
||||
|
||||
if (requiredIndices.size() == 1) {
|
||||
return build(budget, filter, wordId);
|
||||
}
|
||||
|
||||
var ranges = requiredIndices.stream().map(idx -> idx.rangeForWord(wordId)).toArray(SearchIndex.UrlIndexTree[]::new);
|
||||
var relevantIndices = IntStream.range(0, requiredIndices.size()).filter(i -> ranges[i].isPresent()).toArray();
|
||||
|
||||
if (relevantIndices.length == 0) {
|
||||
return new QueryForIndices(budget, LongStream::empty);
|
||||
}
|
||||
else if (relevantIndices.length == 1 || relevantIndices[0] != 0) {
|
||||
return build(budget, filter, wordId);
|
||||
}
|
||||
|
||||
var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);
|
||||
|
||||
LongStream priorityStream = underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[0], wordId);
|
||||
for (int i = 1; i < relevantIndices.length; i++) {
|
||||
priorityStream = Streams.concat(priorityStream, underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[i], wordId));
|
||||
}
|
||||
LongStream stream = LongStream.concat(priorityStream, fstRange.stream().takeWhile(budget::take)).filter(filter);
|
||||
|
||||
return new QueryForIndices(budget, () -> stream);
|
||||
}
|
||||
|
||||
private LongStream underspecifiedPairStream(IndexSearchBudget budget, int limit, int firstIdx, int otherIdx, int wordId) {
|
||||
SearchIndex firstTmp = requiredIndices.get(firstIdx),
|
||||
secondTmp = requiredIndices.get(otherIdx);
|
||||
|
||||
final SearchIndex fst;
|
||||
final SearchIndex snd;
|
||||
|
||||
if (firstTmp.numUrls(wordId) > secondTmp.numUrls(wordId)) {
|
||||
fst = secondTmp;
|
||||
snd = firstTmp;
|
||||
}
|
||||
else {
|
||||
fst = firstTmp;
|
||||
snd = secondTmp;
|
||||
}
|
||||
|
||||
var sndRange = snd.rangeForWord(wordId);
|
||||
var cache = sndRange.createIndexCache();
|
||||
|
||||
return fst.rangeForWord(wordId).stream().takeWhile(budget::take).limit(limit).filter(data -> sndRange.hasUrl(cache, data));
|
||||
}
|
||||
|
||||
|
||||
|
||||
private class QueryForIndices implements Query {
|
||||
private final Supplier<LongStream> supp;
|
||||
private final IndexSearchBudget budget;
|
||||
|
||||
private QueryForIndices(IndexSearchBudget budget, LongPredicate filter, int wordId) {
|
||||
this.budget = budget;
|
||||
supp = () ->
|
||||
requiredIndices.stream().flatMapToLong(idx -> {
|
||||
var range = idx.rangeForWord(wordId);
|
||||
return range.stream().takeWhile(budget::take);
|
||||
})
|
||||
.filter(filter);
|
||||
}
|
||||
|
||||
private QueryForIndices(IndexSearchBudget budget, Supplier<LongStream> supp) {
|
||||
this.budget = budget;
|
||||
this.supp = supp;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query also(int wordId) {
|
||||
return new QueryForIndices(budget,
|
||||
() -> requiredIndices.stream().flatMapToLong(idx -> alsoStream(idx, wordId)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query alsoCached(int wordId) {
|
||||
return new QueryForIndices(budget,
|
||||
() -> requiredIndices.stream().flatMapToLong(idx -> alsoStreamCached(idx, wordId)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query not(int wordId) {
|
||||
// Happens when an index simply isn't present, won't find data anyway
|
||||
// so it's safe to no-op the query
|
||||
if (excludeIndex == null)
|
||||
return new QueryForIndices(budget, LongStream::empty);
|
||||
|
||||
return new QueryForIndices(budget, () -> notStream(wordId));
|
||||
}
|
||||
|
||||
private LongStream alsoStream(SearchIndex idx, int wordId) {
|
||||
var range = idx.rangeForWord(wordId);
|
||||
|
||||
return stream().filter(range::hasUrl).takeWhile(budget::take);
|
||||
}
|
||||
|
||||
private LongStream alsoStreamCached(SearchIndex idx, int wordId) {
|
||||
var range = idx.rangeForWord(wordId);
|
||||
var cache = range.createIndexCache();
|
||||
|
||||
return stream().filter(data -> range.hasUrl(cache, data)).takeWhile(budget::take);
|
||||
}
|
||||
|
||||
private LongStream notStream(int wordId) {
|
||||
var bodyRange = excludeIndex.rangeForWord(wordId);
|
||||
var cache = bodyRange.createIndexCache();
|
||||
|
||||
return stream().filter(url -> !bodyRange.hasUrl(cache, url)).takeWhile(budget::take);
|
||||
}
|
||||
|
||||
public LongStream stream() {
|
||||
return supp.get();
|
||||
}
|
||||
}
|
||||
}
|
@ -1,16 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query;


public class IndexSearchBudget {
    private long timeout;

    public IndexSearchBudget(long limitTime) {
        this.timeout = System.currentTimeMillis() + limitTime;
    }

    // Used for short-circuiting Stream-objects using takeWhile, we don't care
    public boolean take(long unused) {
        return System.currentTimeMillis() < timeout;
    }

}
@ -1,26 +0,0 @@
package nu.marginalia.wmsa.edge.index.reader.query;

import java.util.stream.LongStream;

public interface Query {
    Query EMPTY = new Query() {
        @Override
        public Query also(int wordId) { return this; }

        @Override
        public Query alsoCached(int wordId) { return this; }

        @Override
        public Query not(int wordId) { return this; }

        @Override
        public LongStream stream() { return LongStream.empty(); }
    };

    Query also(int wordId);
    Query alsoCached(int wordId);

    Query not(int wordId);

    LongStream stream();
}
@ -0,0 +1,107 @@
|
||||
package nu.marginalia.wmsa.edge.index.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import nu.marginalia.util.ListChunker;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
|
||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
|
||||
import org.apache.http.HttpStatus;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
public class EdgeIndexLexiconService {
|
||||
|
||||
private final SearchIndexes indexes;
|
||||
private final KeywordLexicon keywordLexicon;
|
||||
|
||||
@Inject
|
||||
public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) {
|
||||
this.indexes = indexes;
|
||||
this.keywordLexicon = servicesFactory.getKeywordLexicon();
|
||||
}
|
||||
|
||||
public Object getWordId(Request request, Response response) {
|
||||
final String word = request.splat()[0];
|
||||
|
||||
var lr = indexes.getLexiconReader();
|
||||
if (null == lr) {
|
||||
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
|
||||
return "";
|
||||
}
|
||||
|
||||
final int wordId = lr.get(word);
|
||||
|
||||
if (DictionaryHashMap.NO_VALUE == wordId) {
|
||||
response.status(404);
|
||||
return "";
|
||||
}
|
||||
|
||||
return wordId;
|
||||
}
|
||||
|
||||
|
||||
public Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
|
||||
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
|
||||
|
||||
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
|
||||
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
|
||||
int idx = req.getIndex();
|
||||
|
||||
for (int ws = 0; ws < req.getWordSetCount(); ws++) {
|
||||
putWords(domainId, urlId, req.getWordSet(ws), idx);
|
||||
}
|
||||
|
||||
response.status(HttpStatus.SC_ACCEPTED);
|
||||
return "";
|
||||
}
|
||||
|
||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
||||
IndexPutKeywordsReq.WordSet words, int idx
|
||||
) {
|
||||
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
||||
|
||||
IndexBlock block = IndexBlock.values()[words.getIndex()];
|
||||
|
||||
for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
|
||||
|
||||
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
||||
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
|
||||
|
||||
indexWriter.put(header, entry);
|
||||
}
|
||||
}
|
||||
|
||||
private long[] getOrInsertWordIds(List<String> words) {
|
||||
long[] ids = new long[words.size()];
|
||||
int putIdx = 0;
|
||||
|
||||
for (String word : words) {
|
||||
long id = keywordLexicon.getOrInsert(word);
|
||||
if (id != DictionaryHashMap.NO_VALUE) {
|
||||
ids[putIdx++] = id;
|
||||
}
|
||||
}
|
||||
|
||||
if (putIdx != words.size()) {
|
||||
ids = Arrays.copyOf(ids, putIdx);
|
||||
}
|
||||
return ids;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,44 @@
package nu.marginalia.wmsa.edge.index.svc;

import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import spark.Request;
import spark.Response;
import spark.Spark;

@Singleton
public class EdgeIndexOpsService {

    private final SearchIndexes indexes;

    @Inject
    public EdgeIndexOpsService(SearchIndexes indexes) {
        this.indexes = indexes;
    }

    public Object repartitionEndpoint(Request request, Response response) {

        if (!indexes.repartition()) {
            Spark.halt(503, "Operations busy");
        }
        return "OK";
    }

    public Object preconvertEndpoint(Request request, Response response) {
        if (!indexes.preconvert()) {
            Spark.halt(503, "Operations busy");
        }
        return "OK";
    }

    public Object reindexEndpoint(Request request, Response response) {
        int id = Integer.parseInt(request.params("id"));

        if (!indexes.reindex(id)) {
            Spark.halt(503, "Operations busy");
        }
        return "OK";
    }

}
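The handlers above follow Spark's (Request, Response) route signature; a hedged sketch of registering them, where the paths are illustrative and the real routes are declared elsewhere in the index service:

Spark.post("/ops/repartition", opsService::repartitionEndpoint);
Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);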
@ -0,0 +1,325 @@
|
||||
package nu.marginalia.wmsa.edge.index.svc;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||
import nu.marginalia.wmsa.client.GsonFactory;
|
||||
import nu.marginalia.wmsa.configuration.WmsaHome;
|
||||
import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
|
||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
|
||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||
import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
|
||||
import nu.marginalia.wmsa.edge.model.search.*;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||
import org.apache.http.HttpStatus;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.HaltException;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.LongPredicate;
|
||||
|
||||
import static java.util.Comparator.comparing;
|
||||
import static spark.Spark.halt;
|
||||
|
||||
@Singleton
|
||||
public class EdgeIndexQueryService {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
|
||||
private static final int QUERY_FETCH_SIZE = 8192;
|
||||
private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
|
||||
|
||||
private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
|
||||
|
||||
private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
|
||||
private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register();
|
||||
|
||||
private final Gson gson = GsonFactory.get();
|
||||
|
||||
private final SearchIndexes indexes;
|
||||
|
||||
@Inject
|
||||
public EdgeIndexQueryService(SearchIndexes indexes) {
|
||||
this.indexes = indexes;
|
||||
}
|
||||
|
||||
public Object searchDomain(Request request, Response response) {
|
||||
if (indexes.getLexiconReader() == null) {
|
||||
logger.warn("Dictionary reader not yet initialized");
|
||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||
}
|
||||
|
||||
String json = request.body();
|
||||
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
|
||||
|
||||
try {
|
||||
return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
|
||||
}
|
||||
catch (HaltException ex) {
|
||||
logger.warn("Halt", ex);
|
||||
throw ex;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
||||
logger.info("Error", ex);
|
||||
Spark.halt(500, "Error");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public Object search(Request request, Response response) {
|
||||
if (indexes.getLexiconReader() == null) {
|
||||
logger.warn("Dictionary reader not yet initialized");
|
||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
||||
}
|
||||
|
||||
String json = request.body();
|
||||
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
||||
|
||||
try {
|
||||
return wmsa_edge_index_query_time.time(() -> query(specsSet));
|
||||
}
|
||||
catch (HaltException ex) {
|
||||
logger.warn("Halt", ex);
|
||||
throw ex;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
||||
logger.info("Error", ex);
|
||||
Spark.halt(500, "Error");
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) {
|
||||
List<EdgeSearchResultItem> results = new SearchQuery(specsSet).execute();
|
||||
return new EdgeSearchResultSet(results);
|
||||
}
|
||||
|
||||
public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
|
||||
|
||||
final OptionalInt wordId = lookUpWord(specsSet.keyword);
|
||||
EdgeIdList<EdgeUrl> urlIds;
|
||||
|
||||
if (wordId.isEmpty()) {
|
||||
urlIds = new EdgeIdList<>();
|
||||
} else {
|
||||
urlIds = indexes
|
||||
.getBucket(specsSet.bucket)
|
||||
.findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
|
||||
.mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL))
|
||||
.collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
|
||||
}
|
||||
|
||||
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
|
||||
}
|
||||
|
||||
private class SearchQuery {
|
||||
private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
|
||||
private final EdgeSearchSpecification specsSet;
|
||||
private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
||||
private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
|
||||
|
||||
public SearchQuery(EdgeSearchSpecification specsSet) {
|
||||
this.specsSet = specsSet;
|
||||
}
|
||||
|
||||
private List<EdgeSearchResultItem> execute() {
|
||||
final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
|
||||
|
||||
for (var sq : specsSet.subqueries) {
|
||||
results.addAll(performSearch(sq));
|
||||
}
|
||||
|
||||
for (var result : results) {
|
||||
addResultScores(result);
|
||||
}
|
||||
|
||||
if (!budget.hasTimeLeft()) {
|
||||
wmsa_edge_index_query_timeouts.inc();
|
||||
}
|
||||
|
||||
var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
|
||||
|
||||
if (WmsaHome.isDebug()) {
|
||||
cachePool.printSummary(logger);
|
||||
}
|
||||
cachePool.clear();
|
||||
|
||||
return results.stream()
|
||||
.sorted(
|
||||
comparing(EdgeSearchResultItem::getScore)
|
||||
.thenComparing(EdgeSearchResultItem::getRanking)
|
||||
.thenComparing(EdgeSearchResultItem::getUrlIdInt)
|
||||
)
|
||||
.filter(domainCountFilter::test)
|
||||
.limit(specsSet.getLimitTotal()).toList();
|
||||
}
|
||||
|
||||
|
||||
private List<EdgeSearchResultItem> performSearch(EdgeSearchSubquery sq)
|
||||
{
|
||||
|
||||
final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
|
||||
final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
|
||||
|
||||
if (searchTerms.isEmpty())
|
||||
return Collections.emptyList();
|
||||
|
||||
for (int indexBucket : specsSet.buckets) {
|
||||
final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
|
||||
|
||||
if (!budget.hasTimeLeft()) {
|
||||
logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (QUERY_FETCH_SIZE <= results.size())
|
||||
break;
|
||||
|
||||
IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
|
||||
long[] buf = new long[8192];
|
||||
|
||||
while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) {
|
||||
int cnt = query.getMoreResults(buf, budget);
|
||||
|
||||
for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) {
|
||||
final long id = buf[i];
|
||||
|
||||
if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
results.add(new EdgeSearchResultItem(indexBucket, id));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
|
||||
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
||||
|
||||
if (!indexes.isValidBucket(bucket)) {
|
||||
logger.warn("Invalid bucket {}", bucket);
|
||||
return new IndexQuery(Collections.emptyList());
|
||||
}
|
||||
|
||||
return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
|
||||
}
|
||||
|
||||
private void addResultScores(EdgeSearchResultItem searchResult) {
|
||||
final var reader = Objects.requireNonNull(indexes.getLexiconReader());
|
||||
|
||||
List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
|
||||
|
||||
// Memoize calls to getTermData, as they're somewhat expensive and highly redundant
|
||||
Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
|
||||
|
||||
double bestScore = 0;
|
||||
|
||||
for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
|
||||
double setScore = 0;
|
||||
int setSize = 0;
|
||||
for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
|
||||
|
||||
final int termId = reader.get(searchTerm);
|
||||
|
||||
ResultTermData data = termMetadata.computeIfAbsent(
|
||||
new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
|
||||
|
||||
var score = data.asScore(searchTermListIdx, searchTerm);
|
||||
searchResult.scores.add(score);
|
||||
setScore += score.value();
|
||||
setSize++;
|
||||
}
|
||||
bestScore = Math.min(bestScore, setScore/setSize);
|
||||
}
|
||||
|
||||
searchResult.setScore(bestScore);
|
||||
}
|
||||
|
||||
private ResultTermData getTermData(ResultTerm resultTerm) {
|
||||
final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
|
||||
final int termId = resultTerm.termId;
|
||||
final long combinedUrlId = resultTerm.combinedUrlId;
|
||||
|
||||
return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
|
||||
bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
|
||||
);
|
||||
}
|
||||
|
||||
record ResultTerm (int bucket, int termId, long combinedUrlId) {}
|
||||
record ResultTermData (IndexBlock index,
|
||||
boolean title,
|
||||
boolean link,
|
||||
boolean site,
|
||||
boolean subject,
|
||||
boolean name,
|
||||
boolean high,
|
||||
boolean mid,
|
||||
boolean low
|
||||
) {
|
||||
public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
|
||||
return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
|
||||
final List<Integer> excludes = new ArrayList<>();
|
||||
final List<Integer> includes = new ArrayList<>();
|
||||
|
||||
for (var include : request.searchTermsInclude) {
|
||||
var word = lookUpWord(include);
|
||||
if (word.isEmpty()) {
|
||||
logger.debug("Unknown search term: " + include);
|
||||
return new EdgeIndexSearchTerms(includes, excludes);
|
||||
}
|
||||
includes.add(word.getAsInt());
|
||||
}
|
||||
|
||||
for (var exclude : request.searchTermsExclude) {
|
||||
lookUpWord(exclude).ifPresent(excludes::add);
|
||||
}
|
||||
|
||||
return new EdgeIndexSearchTerms(includes, excludes);
|
||||
}
|
||||
|
||||
|
||||
private OptionalInt lookUpWord(String s) {
|
||||
int ret = indexes.getLexiconReader().get(s);
|
||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
return OptionalInt.of(ret);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,97 @@
|
||||
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
|
||||
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.min;
|
||||
|
||||
public class IndexQuery {
|
||||
private final List<EntrySource> sources;
|
||||
private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
|
||||
private final List<QueryFilterStepIf> priorityFilter = new ArrayList<>(10);
|
||||
|
||||
public IndexQuery(List<EntrySource> sources) {
|
||||
this.sources = sources;
|
||||
}
|
||||
|
||||
public void addInclusionFilter(QueryFilterStepIf filter) {
|
||||
inclusionFilter.add(filter);
|
||||
}
|
||||
|
||||
public void addPriorityFilter(QueryFilterStepIf filter) {
|
||||
priorityFilter.add(filter);
|
||||
}
|
||||
|
||||
private int si = 0;
|
||||
|
||||
public boolean hasMore() {
|
||||
return si < sources.size();
|
||||
}
|
||||
|
||||
public int getMoreResults(long[] dest, IndexSearchBudget budget) {
|
||||
final EntrySource source = sources.get(si);
|
||||
|
||||
int bufferUtilizedLength = source.read(dest, dest.length);
|
||||
|
||||
if (bufferUtilizedLength <= 0) {
|
||||
si++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
for (var filter : inclusionFilter) {
|
||||
bufferUtilizedLength = filter.retainDestructive(dest, bufferUtilizedLength);
|
||||
|
||||
if (bufferUtilizedLength <= 0) {
|
||||
si++;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (budget.hasTimeLeft()) {
|
||||
prioritizeBuffer(dest, source, bufferUtilizedLength, budget);
|
||||
}
|
||||
|
||||
int count = min(bufferUtilizedLength, dest.length);
|
||||
System.arraycopy(dest, 0, dest, 0, count);
|
||||
return count;
|
||||
}
|
||||
|
||||
private void prioritizeBuffer(long[] dest, EntrySource source, int remainingBufferSize, IndexSearchBudget budget) {
|
||||
int prioStart = 0;
|
||||
|
||||
for (var filter : priorityFilter) {
|
||||
if (!budget.hasTimeLeft())
|
||||
break;
|
||||
|
||||
if (filter.getIndex() == source.getIndex())
|
||||
continue;
|
||||
|
||||
prioStart += filter.retainReorder(dest, prioStart, remainingBufferSize);
|
||||
|
||||
if (prioStart >= remainingBufferSize) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("Sources:\n");
|
||||
|
||||
for (var source: sources) {
|
||||
sb.append("\t").append(source.getIndex().name).append("\n");
|
||||
}
|
||||
sb.append("Includes:\n");
|
||||
for (var include : inclusionFilter) {
|
||||
sb.append("\t").append(include.describe()).append("\n");
|
||||
}
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -0,0 +1,60 @@
|
||||
package nu.marginalia.wmsa.edge.index.svc.query;
|
||||
|
||||
import nu.marginalia.util.btree.CachingBTreeReader;
|
||||
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
|
||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class IndexQueryCachePool {
|
||||
private final Map<PoolKey, CachingBTreeReader.BTreeCachedIndex> indexCaches = new HashMap<>();
|
||||
private final Map<RangeKey, SearchIndex.IndexBTreeRange> rangeCache = new HashMap<>();
|
||||
private final Map<PoolKey, Integer> savedCounts = new HashMap<>();
|
||||
|
||||
public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) {
|
||||
var key = new PoolKey(index, range.dataOffset);
|
||||
var entry = indexCaches.get(key);
|
||||
|
||||
if (entry == null) {
|
||||
entry = range.createIndexCache();
|
||||
indexCaches.put(key, entry);
|
||||
}
|
||||
else {
|
||||
savedCounts.merge(key, 1, Integer::sum);
|
||||
}
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
|
||||
public boolean isUrlPresent(SearchIndex index, int term, long url) {
|
||||
var range = index.rangeForWord(this, term);
|
||||
return range.isPresent() && range.hasUrl(this, url);
|
||||
}
|
||||
|
||||
public void printSummary(Logger logger) {
|
||||
long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum();
|
||||
long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();
|
||||
|
||||
long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count();
|
||||
|
||||
logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
indexCaches.clear();
|
||||
}
|
||||
|
||||
public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) {
|
||||
return rangeCache.get(new RangeKey(words, wordId));
|
||||
}
|
||||
|
||||
public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) {
|
||||
rangeCache.put(new RangeKey(words, wordId), range);
|
||||
}
|
||||
|
||||
private record RangeKey(IndexWordsTable table, int wordId) {}
|
||||
private record PoolKey(SearchIndex index, long dataOffset) {}
|
||||
}
|
@@ -0,0 +1,103 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;

import java.util.*;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;

public class IndexQueryFactory {
    private final List<SearchIndex> requiredIndices;
    private final List<SearchIndex> excludeIndex;
    private final List<SearchIndex> priortyIndices;

    public Collection<SearchIndex> getIndicies() {
        return requiredIndices;
    }

    public IndexQueryFactory(List<SearchIndex> requiredIndices, List<SearchIndex> excludeIndex, List<SearchIndex> priortyIndices) {
        this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList());
        this.excludeIndex = excludeIndex;
        this.priortyIndices = priortyIndices;
    }

    public IndexQueryBuilder buildQuery(IndexQueryCachePool cachePool, int firstWordId) {
        List<EntrySource> sources = new ArrayList<>(requiredIndices.size());

        for (var ri : requiredIndices) {
            var range = ri.rangeForWord(cachePool, firstWordId);
            if (range.isPresent()) {
                sources.add(range.asEntrySource());
            }
        }

        return new IndexQueryBuilder(new IndexQuery(sources), cachePool);
    }

    public class IndexQueryBuilder {
        private final IndexQuery query;
        private final IndexQueryCachePool cachePool;

        IndexQueryBuilder(IndexQuery query,
                          IndexQueryCachePool cachePool) {
            this.query = query;
            this.cachePool = cachePool;
        }

        public void filter(LongPredicate predicate) {
            query.addInclusionFilter(new QueryFilterStepFromPredicate(predicate));
        }

        public IndexQueryBuilder also(int termId) {
            List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());

            for (var ri : requiredIndices) {
                var range = ri.rangeForWord(cachePool, termId);

                if (range.isPresent()) {
                    filters.add(new QueryFilterBTreeRange(ri, range, cachePool));
                }
                else {
                    filters.add(QueryFilterStepIf.noPass());
                }
            }

            filters.sort(Comparator.naturalOrder());
            query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));

            return this;
        }

        public IndexQueryBuilder not(int termId) {
            for (var ri : excludeIndex) {
                var range = ri.rangeForWord(cachePool, termId);
                if (range.isPresent()) {
                    query.addInclusionFilter(range.asExcludeFilterStep(cachePool));
                }
            }

            return this;
        }

        public void prioritize(int termId) {
            for (var idx : priortyIndices) {
                var range = idx.rangeForWord(cachePool, termId);
                if (range.isPresent()) {
                    query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool));
                }
            }
        }

        public IndexQuery build() {
            return query;
        }
    }
}
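
A usage sketch of the factory and its builder (illustrative only, not part of this commit; the factory, cache pool, and word ids are assumed inputs):

// Sketch: compose an IndexQuery from a first term, an additional term, an exclusion,
// and a priority boost, using only the methods shown above.
static IndexQuery sketchBuildQuery(IndexQueryFactory factory, IndexQueryCachePool pool,
                                   int firstWordId, int secondWordId, int excludedWordId) {
    var builder = factory.buildQuery(pool, firstWordId); // entry sources for the first term

    builder.also(secondWordId)       // require the second term in at least one required index
           .not(excludedWordId);     // exclude documents that carry the excluded term
    builder.prioritize(firstWordId); // boost hits found in the priority indices

    builder.filter(url -> url != 0L); // arbitrary example predicate over raw entries

    return builder.build();
}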
@@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import java.util.stream.LongStream;

public interface IndexQueryIf {
    IndexQueryIf also(int wordId);
    IndexQueryIf alsoCached(int wordId);

    IndexQueryIf not(int wordId);

    LongStream stream();
}
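
The interface composes the same way as the builder above; a short sketch (illustrative only, not part of this commit; `query`, `wordA`, and `wordB` are assumed stand-ins):

// Sketch: refine with an extra term, exclude another, then stream the matching entries.
static java.util.stream.LongStream sketchInterfaceUsage(IndexQueryIf query, int wordA, int wordB) {
    return query.also(wordA)
                .not(wordB)
                .stream();
}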
@@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.query;

public class IndexSearchBudget {
    private final long timeout;

    public IndexSearchBudget(long limitTime) {
        this.timeout = System.currentTimeMillis() + limitTime;
    }

    public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
}
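
A sketch of the budget gating a result-gathering loop (illustrative only, not part of this commit; the 50 ms limit and the batch-fetching callback are assumed placeholders):

// Sketch: keep gathering result batches until the wall-clock deadline passes.
static int sketchBudgetUsage(java.util.function.IntSupplier gatherBatch) {
    var budget = new IndexSearchBudget(50); // deadline = now + 50 ms (assumed limit)
    int gathered = 0;

    while (budget.hasTimeLeft()) {
        gathered += gatherBatch.getAsInt(); // assumed callback returning the batch size
    }
    return gathered;
}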
@@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;

public class ResultDomainDeduplicator {
    final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
    final int limitByDomain;

    public ResultDomainDeduplicator(int limitByDomain) {
        this.limitByDomain = limitByDomain;
    }

    public boolean filterRawValue(long value) {
        int rankingId = (int) (value >>> 32);

        if (rankingId == Integer.MAX_VALUE) {
            return true;
        }

        return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain;
    }

    long getKey(int rankingId) {
        return rankingId;
    }

    public boolean test(long value) {
        int ranking = (int) (value >>> 32);
        if (ranking == Integer.MAX_VALUE) {
            return true;
        }

        return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
    }

    public boolean test(EdgeSearchResultItem item) {
        final int ranking = item.getRanking();
        if (ranking == Integer.MAX_VALUE) {
            return true;
        }

        // For ResultItems, consider bucketId as well, since different buckets
        // may use different ranking algorithms
        final long key = ranking*32L + item.bucketId;

        return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
    }
}
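
A sketch of the deduplicator capping results per domain (illustrative only, not part of this commit; the limit of 3 and the candidates list are assumed inputs, java.util imports elided):

// Sketch: keep at most 3 results per ranking id / bucket, in candidate order.
static List<EdgeSearchResultItem> sketchDeduplication(List<EdgeSearchResultItem> candidates) {
    var deduper = new ResultDomainDeduplicator(3);

    List<EdgeSearchResultItem> kept = new ArrayList<>();
    for (var item : candidates) {
        if (deduper.test(item)) { // increments the per-domain count and checks the limit
            kept.add(item);
        }
    }
    return kept;
}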
@@ -0,0 +1,9 @@
package nu.marginalia.wmsa.edge.index.svc.query.types;

import nu.marginalia.wmsa.edge.index.reader.SearchIndex;

public interface EntrySource {
    SearchIndex getIndex();
    int read(long[] buffer, int n);

}
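
A sketch of draining an EntrySource in fixed-size batches (illustrative only, not part of this commit). It assumes read() returns the number of entries written and a non-positive value once the source is exhausted; that contract is not spelled out in this diff:

// Sketch: read entries into a reusable buffer and hand them to a consumer.
static void sketchEntrySourceUsage(EntrySource source, java.util.function.LongConsumer sink) {
    long[] buffer = new long[1024];

    int n;
    while ((n = source.read(buffer, buffer.length)) > 0) {
        for (int i = 0; i < n; i++) {
            sink.accept(buffer[i]);
        }
    }
}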
Some files were not shown because too many files have changed in this diff.