Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 13:19:02 +00:00)

Merge pull request 'Merge changes from experimental-22-08 into master' (#109) from master into release
Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/109
Commit: afb0c78e4d
@@ -58,7 +58,7 @@ jmhJar {
 }
 
 dependencies {
     implementation project(':third_party')
+    implementation project(':protocol')
 
     implementation 'org.projectlombok:lombok:1.18.24'
     annotationProcessor 'org.projectlombok:lombok:1.18.24'
@@ -157,6 +157,7 @@ dependencies {
 
     jmh 'org.openjdk.jmh:jmh-core:1.35'
     jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
 
 }
 
 configurations {
@@ -188,7 +188,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
 
         var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
+        assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query"));
     }
@@ -201,7 +201,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
 
         var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-        assertNotEquals(List.of("Bird"), getTitlesFromSearchResults(html));
+        assertNotEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-yes-js"));
     }
@@ -214,7 +214,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
         System.out.println(driver.getTitle());
 
         var html = driver.findElement(new By.ByXPath("//*")).getAttribute("outerHTML");
-        assertEquals(List.of("Bird"), getTitlesFromSearchResults(html));
+        assertEquals(List.of("Bird", "Washington, D.C."), getTitlesFromSearchResults(html));
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("query-no-js"));
     }
@@ -240,7 +240,7 @@ public class EdgeSearchE2ETest extends E2ETestBase {
 
         Files.move(driver.getScreenshotAs(OutputType.FILE).toPath(), screenshotFilename("site-search"));
 
-        assertEquals(List.of("Frog", "Binomial nomenclature", "Mantis", "Amphibian"), getTitlesFromSearchResults(html));
+        assertEquals(List.of("Frog", "Amphibian"), getTitlesFromSearchResults(html));
     }
 
     @Test
@@ -1,6 +1,10 @@ DenseBitMap
 package nu.marginalia.util;
 
+import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
 
 public class DenseBitMap {
     public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8;
@@ -15,6 +19,31 @@ public class DenseBitMap {
         this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0)));
     }
 
+    public static DenseBitMap loadFromFile(Path file) throws IOException {
+        long size = Files.size(file);
+        var dbm = new DenseBitMap(size/8);
+
+        try (var bc = Files.newByteChannel(file)) {
+            while (dbm.buffer.position() < dbm.buffer.capacity()) {
+                bc.read(dbm.buffer);
+            }
+        }
+        dbm.buffer.clear();
+
+        return dbm;
+    }
+
+    public void writeToFile(Path file) throws IOException {
+
+        try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) {
+            while (buffer.position() < buffer.capacity()) {
+                bc.write(buffer);
+            }
+        }
+
+        buffer.clear();
+    }
+
     public boolean get(long pos) {
         return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0;
     }
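The new persistence methods above simply stream the backing direct ByteBuffer through a byte channel and rewind it afterwards. A minimal round-trip sketch follows; it is not part of this changeset, and it assumes the DenseBitMap constructor takes the bit cardinality, as the existing allocation code suggests.

import nu.marginalia.util.DenseBitMap;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class DenseBitMapRoundTrip {
    public static void main(String[] args) throws IOException {
        Path tmp = Files.createTempFile("bitmap", ".dat");

        // assumed: constructor argument is the number of bits to track
        var original = new DenseBitMap(1024);
        original.writeToFile(tmp);                 // streams the buffer out, then clear()s it

        var copy = DenseBitMap.loadFromFile(tmp);  // reads until the buffer is full, then clear()s it
        System.out.println(copy.get(0));           // bits come back in the same positions
    }
}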
@@ -25,14 +25,16 @@ public class CachingBTreeReader {
         return new BTreeHeader(file.get(fileOffset), file.get(fileOffset+1), file.get(fileOffset+2));
     }
 
-    public Cache prepareCache() {
-        return new Cache();
+    public BTreeCachedIndex prepareCache(BTreeHeader header) {
+        return new BTreeCachedIndex(header);
     }
     /**
      *
      * @return file offset of entry matching keyRaw, negative if absent
      */
-    public long findEntry(BTreeHeader header, Cache cache, final long keyRaw) {
+    public long findEntry(BTreeCachedIndex cache, final long keyRaw) {
+        BTreeHeader header = cache.header;
+
         final int blockSize = ctx.BLOCK_SIZE_WORDS();
 
         final long key = keyRaw & ctx.equalityMask();
@@ -46,7 +48,7 @@ public class CachingBTreeReader {
             numEntries = header.numEntries();
         }
         else {
-            cache.load(header);
+            cache.load();
 
             long dataLayerOffset = searchIndex(header, cache, key);
             if (dataLayerOffset < 0) {
@@ -60,7 +62,7 @@ public class CachingBTreeReader {
         return dataSearcher.binarySearch(key, searchStart, numEntries);
     }
 
-    private long searchIndex(BTreeHeader header, Cache cache, long key) {
+    private long searchIndex(BTreeHeader header, BTreeCachedIndex cache, long key) {
         final int blockSize = ctx.BLOCK_SIZE_WORDS();
         long layerOffset = 0;
 
@@ -77,11 +79,22 @@ public class CachingBTreeReader {
         return layerOffset;
     }
 
-    public class Cache {
+    /** A cache for the BTree index data that will drastically reduce the number of disk reads
+     * for repeated queries against the same tree. The memory consumption is typically very low
+     * and the disk access pattern for reading the entire index relatively cheap.
+     */
+    public class BTreeCachedIndex {
        long[] indexData;
+       final BTreeHeader header;
 
-       public void load(BTreeHeader header) {
+       final int indexedDataSize;
+
+       public BTreeCachedIndex(BTreeHeader header) {
+           this.header = header;
+           indexedDataSize = header.numEntries();
+       }
+
+       public void load() {
            if (indexData != null)
                return;
 
@@ -107,5 +120,17 @@ public class CachingBTreeReader {
            }
            return low;
        }
+
+       public long sizeBytes() {
+           return isLoaded() ? 8L*indexData.length : 0;
+       }
+
+       public int getIndexedDataSize() {
+           return indexedDataSize;
+       }
+
+       public boolean isLoaded() {
+           return indexData != null;
+       }
     }
 }
@@ -1,18 +1,18 @@ DictionaryData
 package nu.marginalia.util.dict;
 
-import nu.marginalia.util.SeekDictionary;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.ByteBuffer;
 import java.nio.LongBuffer;
+import java.util.ArrayList;
 
 public class DictionaryData {
 
     private final int DICTIONARY_BANK_SIZE;
     private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
 
-    private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
+    private final ArrayList<DictionaryDataBank> banks = new ArrayList(100);
 
     public DictionaryData(int bankSize) {
         DICTIONARY_BANK_SIZE = bankSize;
@@ -20,12 +20,8 @@ public class DictionaryData {
         banks.add(new DictionaryDataBank(0, bankSize));
     }
 
-    public int size() {
-        return banks.end();
-    }
-
     public int add(long key) {
-        var activeBank = banks.last();
+        var activeBank = banks.get(banks.size()-1);
         int rb = activeBank.add(key);
 
         if (rb == -1) {
@@ -42,10 +38,10 @@ public class DictionaryData {
 
 
     public long getKey(int offset) {
-        return banks.bankForOffset(offset).getKey(offset);
+        return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
     }
     public boolean keyEquals(int offset, long otherKey) {
-        return banks.bankForOffset(offset).keyEquals(offset, otherKey);
+        return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
     }
 
     private static class DictionaryDataBank {
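The SeekDictionary wrapper is replaced by a plain ArrayList of fixed-size banks: since every bank holds exactly DICTIONARY_BANK_SIZE entries, an entry's bank is found by integer division of its offset. A standalone illustration of that arithmetic, with a bank size chosen for the example rather than taken from the changeset:

class BankOffsetDemo {
    public static void main(String[] args) {
        final int DICTIONARY_BANK_SIZE = 8192; // assumed value, for illustration only

        int[] offsets = {0, 8191, 8192, 20000};
        for (int offset : offsets) {
            int bank = offset / DICTIONARY_BANK_SIZE; // which bank the entry lives in
            System.out.println(offset + " -> bank " + bank);
        }
    }
}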
@@ -8,7 +8,7 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import org.jsoup.nodes.Document;
 
 import java.io.FileNotFoundException;
@@ -30,7 +30,7 @@ public class DocumentDebugger {
     Path tempDir;
     public DocumentDebugger(LanguageModels lm) throws IOException {
         se = new SentenceExtractor(lm);
-        var dict = new NGramDict(lm);
+        var dict = new TermFrequencyDict(lm);
         ke = new KeywordExtractor();
 
         kc = new KeywordCounter(dict, ke);
@@ -69,7 +69,7 @@ public class DocumentDebugger {
         Set<String> reps = new HashSet<>();
 
//        kc.count(languageData, 0.75).forEach(rep -> reps.add(rep.stemmed));
-        kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
+//        kc.count(languageData).forEach(rep -> reps.add(rep.stemmed));
 
         try (var pw = new PrintWriter(new FileOutputStream(output.toFile()))) {
 
@@ -19,7 +19,12 @@ public class WordPatterns {
     public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
     public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");
 
+    public static final Pattern singleWordAdditionalPattern =
+            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");
+
+    public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
     public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
 
     public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
     public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
     public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
@@ -58,7 +63,7 @@ public class WordPatterns {
         if (word.isBlank()) {
             return false;
         }
-        if (hasMoreThanTwo(word, '-', 2)) {
+        if (hasMoreThanTwo(word, '-', 4)) {
             return false;
         }
         if (hasMoreThanTwo(word, '+', 2)) {
@@ -75,7 +80,7 @@ public class WordPatterns {
             if (Character.isDigit(word.charAt(i))) {
                 numDigits++;
             }
-            if (numDigits > 6)
+            if (numDigits > 16)
                 return false;
         }
 
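The new singleWordAdditionalPattern accepts short alphanumeric tokens with at most four internal ./-_/: separators, which is what singleWordQualitiesPredicate later filters on in DocumentKeywordExtractor. A quick standalone check of what the regex admits; the sample inputs are chosen for the example and are not from the changeset:

import java.util.function.Predicate;
import java.util.regex.Pattern;

class SingleWordPatternDemo {
    public static void main(String[] args) {
        Pattern p = Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");
        Predicate<String> accepts = p.asMatchPredicate();

        System.out.println(accepts.test("log4j"));       // true: plain alphanumeric token
        System.out.println(accepts.test("v1.2.3"));      // true: dots between short segments
        System.out.println(accepts.test("a-b-c-d-e-f")); // false: more than four separators
        System.out.println(accepts.test("!!!"));         // false: disallowed characters
    }
}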
@@ -6,8 +6,9 @@ import java.nio.file.Path;
 
 @AllArgsConstructor
 public class LanguageModels {
-    public final Path ngramDictionary;
-    public final Path ngramFrequency;
+    public final Path ngramBloomFilter;
+    public final Path termFrequencies;
 
     public final Path openNLPSentenceDetectionData;
     public final Path posRules;
     public final Path posDict;
@@ -5,8 +5,8 @@ import java.util.regex.Pattern;
 
 public class AsciiFlattener {
 
-    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:]+");
-    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:]+$");
+    private static final Pattern nonAscii = Pattern.compile("[^a-zA-Z0-9_.'+@#:\\-]+");
+    private static final Pattern plainAsciiPattern = Pattern.compile("^[a-zA-Z0-9_.'+@#:\\-]+$");
     private static final Predicate<String> plainAscii = plainAsciiPattern.asMatchPredicate();
 
     public static String flattenUnicode(String s) {
@@ -1,99 +1,164 @@ DocumentKeywordExtractor
 package nu.marginalia.util.language.processing;
 
-import com.google.common.collect.Sets;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
-import org.jetbrains.annotations.NotNull;
 
 import javax.inject.Inject;
 import java.util.*;
 import java.util.stream.Collectors;
-import java.util.stream.Stream;
 
 public class DocumentKeywordExtractor {
 
     private final KeywordExtractor keywordExtractor;
     private final KeywordCounter tfIdfCounter;
     private final NameCounter nameCounter;
-    private final LongNameCounter longNameCounter;
     private final SubjectCounter subjectCounter;
 
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;
 
     @Inject
-    public DocumentKeywordExtractor(NGramDict dict) {
+    public DocumentKeywordExtractor(TermFrequencyDict dict) {
         this.dict = dict;
+        docCount = dict.docCount();
 
         keywordExtractor = new KeywordExtractor();
 
         tfIdfCounter = new KeywordCounter(dict, keywordExtractor);
         nameCounter = new NameCounter(keywordExtractor);
-        longNameCounter = new LongNameCounter(dict, keywordExtractor);
         subjectCounter = new SubjectCounter(keywordExtractor);
     }
 
 
+    public EdgePageWordSet extractKeywordsMinimal(DocumentLanguageData documentLanguageData) {
+
+        List<WordRep> titleWords = extractTitleWords(documentLanguageData);
+
+        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
+        List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 2);
+        List<WordRep> subjects = subjectCounter.count(documentLanguageData);
+
+        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
+        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
+
+        Collection<String> artifacts = getArtifacts(documentLanguageData);
+
+        return new EdgePageWordSet(
+                createWords(IndexBlock.Subjects, subjects),
+                createWords(IndexBlock.Title, titleWords),
+                createWords(IndexBlock.NamesWords, wordsNamesAll),
+                createWords(IndexBlock.Tfidf_Top, topKeywords),
+                createWords(IndexBlock.Tfidf_Middle, midKeywords),
+                new EdgePageWords(IndexBlock.Artifacts, artifacts)
+        );
+    }
+
+
     public EdgePageWordSet extractKeywords(DocumentLanguageData documentLanguageData) {
 
         List<WordRep> titleWords = extractTitleWords(documentLanguageData);
 
-        List<WordRep> wordsTfIdf = tfIdfCounter.count(documentLanguageData);
-        List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
+        KeywordCounter.WordHistogram wordsTfIdf = tfIdfCounter.countHisto(documentLanguageData);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);
-        List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);
 
-        int totalSize = wordsTfIdf.size();
-
-        List<WordRep> lowKeywords = new ArrayList<>(totalSize / 2);
-        List<WordRep> midKeywords = new ArrayList<>(totalSize / 2);
-        List<WordRep> topKeywords = new ArrayList<>(totalSize / 2);
-
-        for(var v : wordsTfIdf) {
-            if (topKeywords.size() <= totalSize / 10) topKeywords.add(v);
-            else if (midKeywords.size() <= totalSize / 5) midKeywords.add(v);
-            else lowKeywords.add(v);
-        }
-
-        var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
-
-        var words = getSimpleWords(documentLanguageData);
-
-        for (var w : wordsLongName)
-            words.add(w.word);
-        for (var w : lowKeywords)
-            words.remove(w.word);
-        for (var w : midKeywords)
-            words.remove(w.word);
-        for (var w : topKeywords)
-            words.remove(w.word);
+        List<WordRep> lowKeywords = new ArrayList<>(wordsTfIdf.lower());
+        List<WordRep> midKeywords = new ArrayList<>(wordsTfIdf.mid());
+        List<WordRep> topKeywords = new ArrayList<>(wordsTfIdf.top());
 
         Collection<String> artifacts = getArtifacts(documentLanguageData);
 
         var wordSet = new EdgePageWordSet(
-                createWords(IndexBlock.TitleKeywords, overlappingStems(titleWords, wordsToMatchWithTitle)),
-                createWords(IndexBlock.Topic, subjects),
+                createWords(IndexBlock.Subjects, subjects),
                 createWords(IndexBlock.Title, titleWords),
                 createWords(IndexBlock.NamesWords, wordsNamesAll),
-                createWords(IndexBlock.Top, topKeywords),
-                createWords(IndexBlock.Middle, midKeywords),
-                createWords(IndexBlock.Low, lowKeywords),
+                createWords(IndexBlock.Tfidf_Top, topKeywords),
+                createWords(IndexBlock.Tfidf_Middle, midKeywords),
+                createWords(IndexBlock.Tfidf_Lower, lowKeywords),
                 new EdgePageWords(IndexBlock.Artifacts, artifacts)
         );
 
-        wordSet.append(IndexBlock.Words, words);
+        getSimpleWords(wordSet, documentLanguageData,
+                IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
 
         return wordSet;
     }
 
+    private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
+
+        int start = 0;
+        int lengthGoal = 32;
+
+        for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
+            IndexBlock block = blocks[blockIdx];
+            Set<String> words = new HashSet<>(lengthGoal+100);
+
+            int pos;
+            int length = 0;
+            for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
+                var sent = documentLanguageData.sentences[pos];
+                length += sent.length();
+
+                for (var word : sent) {
+                    if (!word.isStopWord()) {
+                        String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
+                        if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
+                            words.add(w);
+                        }
+                    }
+                }
+            }
+            wordSet.append(block, words);
+            start = pos;
+            lengthGoal+=32;
+        }
+
+        if (start < documentLanguageData.sentences.length) {
+
+            Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
+            for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
+                var sent = documentLanguageData.sentences[pos];
+                for (var word : sent) {
+                    if (!word.isStopWord()) {
+                        String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
+                        if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
+                            counts.merge(w, 1, Integer::sum);
+                        }
+                    }
+                }
+            }
+
+            Set<String> lastSet;
+            if (counts.size() < 1024) {
+                lastSet = counts.keySet();
+            }
+            else {
+                lastSet = counts.entrySet().stream()
+                        .sorted(Comparator.comparing(e -> {
+                            double N = docCount; // Number of documents in term freq dictionary
+
+                            // Caveat: This is actually the *negated* term score, because the second logarithm has
+                            // its parameter inverted (log(a^b) = b log(a); here b = -1)
+                            return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
+                        }))
+                        .map(Map.Entry::getKey)
+                        .limit(1024)
+                        .collect(Collectors.toCollection(LinkedHashSet::new));
+            }
+
+            wordSet.append(blocks[blocks.length - 1], lastSet);
+        }
+    }
+
     private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
         Set<String> reps = new HashSet<>();
 
         for (var sent : documentLanguageData.sentences) {
             for (var word : sent) {
                 String lc = word.wordLowerCase();
@@ -123,57 +188,7 @@ public class DocumentKeywordExtractor {
                 .collect(Collectors.toList());
     }
 
-    private Collection<WordRep> joinWordLists(List<WordRep>... words) {
-        int size = 0;
-        for (var lst : words) {
-            size += lst.size();
-        }
-        if (size == 0)
-            return Collections.emptyList();
-
-        final LinkedHashSet<WordRep> ret = new LinkedHashSet<>(size);
-        for (var lst : words) {
-            ret.addAll(lst);
-        }
-        return ret;
-    }
-
-    @NotNull
-    private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
-        Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
-
-        for (var sent : documentLanguageData.sentences) {
-            for (int i = 0; i < sent.length(); i++) {
-                if (!sent.isStopWord(i)) {
-                    String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
-                    if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
-                        counts.merge(w, 1, Integer::sum);
-                    }
-                }
-            }
-        }
-
-        return counts.entrySet().stream()
-                .sorted(Comparator.comparing(e -> {
-                    double N = 11820118.; // Number of documents in term freq dictionary
-
-                    // Caveat: This is actually the *negated* term score, because the second logarithm has
-                    // its parameter inverted (log(a^b) = b log(a); here b = -1)
-                    return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
-                }))
-                .map(Map.Entry::getKey)
-                .limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
-    }
-
-
     public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
         return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
     }
 
-    private Set<WordRep> overlappingStems(Collection<WordRep> wordsA, Collection<WordRep> wordsB) {
-        Set<String> stemmedA = wordsA.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
-        Set<String> stemmedB = wordsB.stream().map(WordRep::getStemmed).collect(Collectors.toSet());
-        Set<String> stemmedIntersect = Sets.intersection(stemmedA, stemmedB);
-        return Stream.concat(wordsA.stream(), wordsB.stream()).filter(w -> stemmedIntersect.contains(w.getStemmed())).collect(Collectors.toSet());
-    }
 }
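The sort key used for the overflow block above, (1 + log(count)) * log((1 + freq)/N), is the negated TF-IDF-style score called out in the code comment: because freq is far smaller than N, the second factor is negative, so terms that are common in the document but rare in the corpus come out most negative and therefore sort first. A standalone numeric sketch; N here is an assumed stand-in for dict.docCount(), not a value from the changeset:

class NegatedTfIdfDemo {
    public static void main(String[] args) {
        double N = 1_000_000; // assumed corpus size, stands in for dict.docCount()

        // {count in this document, documents in the corpus containing the term}
        double[][] terms = { {5, 100}, {5, 100_000}, {1, 100} };

        for (double[] t : terms) {
            double count = t[0], freq = t[1];
            // Negated term score, matching the sort key: more negative = more salient
            double score = (1 + Math.log(count)) * Math.log((1. + freq) / N);
            System.out.printf("count=%.0f freq=%.0f -> %.2f%n", count, freq, score);
        }
        // Prints roughly -24.0, -6.0 and -9.2: the rare-but-repeated term ranks first.
    }
}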
@@ -1,65 +1,92 @@ KeywordCounter
 package nu.marginalia.util.language.processing;
 
+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 
-import java.util.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
 import java.util.regex.Pattern;
-import java.util.stream.Collectors;
 
 public class KeywordCounter {
     private final KeywordExtractor keywordExtractor;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;
 
-    public KeywordCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
+    public KeywordCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
         this.keywordExtractor = keywordExtractor;
+        this.docCount = (double) dict.docCount();
     }
 
-    public List<WordRep> count(DocumentLanguageData dld) {
-        HashMap<String, Double> counts = new HashMap<>(1000);
+    public WordHistogram countHisto(DocumentLanguageData dld) {
+        HashMap<String, Integer> counts = new HashMap<>(1000);
         HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000);
 
         for (var sent : dld.sentences) {
             var keywords = keywordExtractor.getKeywordsFromSentence(sent);
             for (var span : keywords) {
+                if (span.size() == 1 &&
+                        WordPatterns.isStopWord(sent.words[span.start]))
+                    continue;
+
                 String stemmed = sent.constructStemmedWordFromSpan(span);
 
-                counts.merge(stemmed, 1., Double::sum);
+                counts.merge(stemmed, 1, Integer::sum);
                 instances.computeIfAbsent(stemmed, k -> new HashSet<>()).add(new WordRep(sent, span));
             }
         }
 
-        return counts.entrySet().stream()
-                .filter(e -> e.getValue() > 1)
-                .sorted(Comparator.comparing(this::getTermValue))
-                .map(Map.Entry::getKey)
-                .flatMap(w -> instances.get(w).stream())
-                .filter(w -> w.word.length() > 1)
-                .limit(150)
-                .collect(Collectors.toList());
+        double maxC = counts.values().stream().mapToDouble(Double::valueOf).max().orElse(1);
+
+        Set<WordRep> h5 = new HashSet<>();
+        Set<WordRep> h10 = new HashSet<>();
+        Set<WordRep> h15 = new HashSet<>();
+
+        int doubleWordCount = 0;
+
+        for (var entry : counts.entrySet()) {
+            double value = getTermValue(entry, maxC);
+
+            double avgCnt = entry.getValue();
+            String wordStemmed = entry.getKey();
+
+            Set<WordRep> histogram;
+            if (value < -3 && avgCnt>1) histogram = h15;
+            else if (value < -1.75 && avgCnt>1) histogram = h10;
+            else if (value < -1 &&
+                    (!wordStemmed.contains("_") || doubleWordCount++ < 50))
+                histogram = h5;
+            else continue;
+
+            histogram.addAll(instances.get(wordStemmed));
+        }
+
+        return new WordHistogram(h5, h10, h15);
     }
 
     private static final Pattern separator = Pattern.compile("_");
 
-    public double getTermValue(Map.Entry<String, Double> e) {
+    public double getTermValue(Map.Entry<String, Integer> e, double maxValue) {
         String[] parts = separator.split(e.getKey());
         double totalValue = 0.;
         for (String part : parts) {
-            totalValue += value(part, e.getValue());
+            totalValue += value(part, e.getValue(), maxValue);
         }
-        return totalValue / Math.sqrt(parts.length);
+        return totalValue / parts.length;
     }
 
-    double value(String key, double value) {
+    double value(String key, double value, double maxValue) {
         double freq = dict.getTermFreqStemmed(key);
         if (freq < 1) {
-            freq = 10;
+            freq = 1;
         }
-        return (1+Math.log(value)) * Math.log((1.1+freq)/11820118.);
+        return (0.1 + 0.9*value/maxValue) * Math.log(freq/docCount);
     }
 
+    public record WordHistogram(Set<WordRep> lower, Set<WordRep> mid, Set<WordRep> top) { }
 }
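The rewritten weight, (0.1 + 0.9 * value/maxValue) * log(freq/docCount), is an augmented term frequency multiplied by a log of the term's relative document frequency; since freq is below docCount the product is negative, and countHisto buckets terms into the lower/mid/top histograms at roughly -1, -1.75 and -3. A standalone sketch of the bucketing with made-up numbers (it simplifies the thresholds and omits the avgCnt and double-word conditions from countHisto):

class KeywordBucketDemo {
    public static void main(String[] args) {
        double docCount = 1_000_000; // assumed corpus size, stands in for dict.docCount()
        double maxValue = 20;        // count of the most frequent stemmed term in this document

        // {occurrences in this document, documents containing the term}
        double[][] terms = { {20, 500}, {10, 5_000}, {6, 30_000}, {1, 900_000} };

        for (double[] t : terms) {
            double value = t[0], freq = t[1];
            double score = (0.1 + 0.9 * value / maxValue) * Math.log(freq / docCount);
            String bucket = score < -3 ? "top" : score < -1.75 ? "mid" : score < -1 ? "lower" : "dropped";
            System.out.printf("value=%.0f freq=%.0f -> %.2f (%s)%n", value, freq, score, bucket);
        }
        // Yields roughly -7.6 (top), -2.9 (mid), -1.3 (lower) and -0.02 (dropped).
    }
}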
@@ -1,93 +1,18 @@ KeywordExtractor
 package nu.marginalia.util.language.processing;
 
+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.util.language.WordPatterns;
 
 import java.lang.ref.SoftReference;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
-import java.util.function.Function;
-import java.util.stream.IntStream;
-import java.util.stream.Stream;
 
 public class KeywordExtractor {
 
-    public boolean isLegacy() {
-        return legacy;
-    }
-
-    public void setLegacy(boolean legacy) {
-        this.legacy = legacy;
-    }
-
-    private boolean legacy;
-
-    public WordSpan[] getNameLikes(DocumentSentence sentence) {
-        var direct = IntStream.range(0, sentence.length())
-                .filter(i -> sentence.posTags[i].startsWith("N"))
-                .mapToObj(i -> new WordSpan(i, i+1))
-                ;
-        var two = IntStream.range(1, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i -1, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-1, i+1))
-                ;
-
-        var a_in_b = IntStream.range(2, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1))
-                .filter(i -> isProperNoun(i-2, sentence))
-                .mapToObj(i -> new WordSpan(i-2, i+1))
-                ;
-
-        var a_in_det_b = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1))
-                .filter(i -> sentence.posTags[i-2].equals("DT"))
-                .filter(i -> isProperNoun(i-3, sentence))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-        var a_in_in_b = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isProperNoun(i, sentence))
-                .filter(i -> isJoiner(sentence, i-1) || isProperNoun(i-1, sentence))
-                .filter(i -> isJoiner(sentence, i-2) || isProperNoun(i-2, sentence))
-                .filter(i -> isProperNoun(i-3, sentence))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-        var three = IntStream.range(2, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i-1, sentence, Collections.emptySet()))
-                .filter(i -> isName(i-2, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-2, i+1))
-                ;
-        var four = IntStream.range(3, sentence.length())
-                .filter(i -> sentence.separators[i-1] == WordSeparator.SPACE
-                        && sentence.separators[i-2] == WordSeparator.SPACE
-                        && sentence.separators[i-3] == WordSeparator.SPACE)
-                .filter(i -> isName(i, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 1, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 2, sentence, Collections.emptySet()))
-                .filter(i -> isName(i - 3, sentence, Collections.emptySet()))
-                .mapToObj(i -> new WordSpan(i-3, i+1))
-                ;
-
-        return Stream.of(direct, two, a_in_b, a_in_in_b, a_in_det_b, three, four).flatMap(Function.identity())
-                .toArray(WordSpan[]::new);
-    }
-
     public WordSpan[] getNames(DocumentSentence sentence) {
         List<WordSpan> spans = new ArrayList<>(sentence.length());
 
@@ -214,7 +139,7 @@ public class KeywordExtractor {
         }
         String word = sentence.constructWordFromSpan(w);
 
-        if (word.isBlank() || WordPatterns.isStopWord(word)) return false;
+        if (word.isBlank() || !WordPatterns.filter(word)) return false;
         if (sentence.posTags[w.start].equals("CC")) return false;
         if (sentence.posTags[w.end-1].equals("IN")) return false;
         if (sentence.posTags[w.end-1].equals("DT")) return false;
@@ -377,4 +302,6 @@ public class KeywordExtractor {
 
         return (posTag.startsWith("JJ") || posTag.startsWith("R") || posTag.startsWith("VBG"));
     }
+
+
 }
@@ -3,7 +3,7 @@ package nu.marginalia.util.language.processing;
 import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.util.language.processing.model.DocumentSentence;
 import nu.marginalia.util.language.processing.model.WordRep;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 
 import java.util.*;
 import java.util.regex.Pattern;
@@ -11,10 +11,11 @@ import java.util.stream.Collectors;
 
 public class LongNameCounter {
     private final KeywordExtractor keywordExtractor;
-    private final NGramDict dict;
+    private final TermFrequencyDict dict;
+    private final double docCount;
-    public LongNameCounter(NGramDict dict, KeywordExtractor keywordExtractor) {
+    public LongNameCounter(TermFrequencyDict dict, KeywordExtractor keywordExtractor) {
         this.dict = dict;
+        docCount = (double) dict.docCount();
         this.keywordExtractor = keywordExtractor;
     }
 
@@ -22,6 +22,9 @@ public class NameCounter {
             DocumentSentence sent = dld.sentences[i];
             var keywords = keywordExtractor.getNames(sent);
             for (var span : keywords) {
+                if (span.size() <= 1)
+                    continue;
+
                 var stemmed = sent.constructStemmedWordFromSpan(span);
 
                 counts.merge(stemmed, 1., Double::sum);
@@ -2,11 +2,11 @@ package nu.marginalia.wmsa.api;
 
 import com.google.common.base.Strings;
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.wmsa.api.model.ApiLicense;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.server.*;
 import nu.marginalia.wmsa.edge.search.client.EdgeSearchClient;
 import org.slf4j.Logger;
@@ -20,7 +20,7 @@ import java.util.concurrent.ConcurrentHashMap;
 public class ApiService extends Service {
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final Gson gson = new GsonBuilder().create();
+    private final Gson gson = GsonFactory.get();
     private final EdgeSearchClient searchClient;
     private final HikariDataSource dataSource;
     private final ConcurrentHashMap<String, ApiLicense> licenseCache = new ConcurrentHashMap<>();
@@ -1,12 +1,11 @@ AbstractClient
 package nu.marginalia.wmsa.client;
 
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
+import com.google.protobuf.GeneratedMessageV3;
 import io.reactivex.rxjava3.core.Observable;
 import io.reactivex.rxjava3.core.ObservableSource;
 import io.reactivex.rxjava3.plugins.RxJavaPlugins;
 import lombok.SneakyThrows;
-import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
 import nu.marginalia.wmsa.client.exception.LocalException;
 import nu.marginalia.wmsa.client.exception.NetworkException;
 import nu.marginalia.wmsa.client.exception.RemoteException;
@@ -17,8 +16,6 @@ import org.apache.http.HttpHost;
 import org.apache.logging.log4j.ThreadContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.slf4j.Marker;
-import org.slf4j.MarkerFactory;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -32,9 +29,7 @@ import java.util.zip.GZIPOutputStream;
 public abstract class AbstractClient implements AutoCloseable {
     public static final String CONTEXT_OUTBOUND_REQUEST = "outbound-request";
 
-    private final Gson gson = new GsonBuilder()
-            .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
-            .create();
+    private final Gson gson = GsonFactory.get();
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
@@ -186,6 +181,31 @@ public abstract class AbstractClient implements AutoCloseable {
                 .doFinally(() -> ThreadContext.remove("outbound-request"));
     }
 
+    @SneakyThrows
+    protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {
+
+        ensureAlive();
+
+        RequestBody body = RequestBody.create(
+                MediaType.parse("application/protobuf"),
+                data.toByteArray());
+
+        var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
+        var call = client.newCall(req);
+
+        logInbound(call);
+        ThreadContext.put("outbound-request", url + endpoint);
+        try (var rsp = call.execute()) {
+            logOutbound(rsp);
+            int code = rsp.code();
+
+            return validateStatus(code, req).map(HttpStatusCode::new);
+        }
+        finally {
+            ThreadContext.remove("outbound-request");
+        }
+    }
+
+
     @SneakyThrows
     protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {
@@ -0,0 +1,29 @@ GsonFactory (new file)
+package nu.marginalia.wmsa.client;
+
+import com.google.gson.*;
+import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
+
+import java.net.URISyntaxException;
+
+public class GsonFactory {
+    public static Gson get() {
+        return new GsonBuilder()
+                .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
+                .registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
+                .registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
+                .registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
+                    try {
+                        return new EdgeUrl(json.getAsString());
+                    } catch (URISyntaxException e) {
+                        throw new JsonParseException("URL Parse Exception", e);
+                    }
+                })
+                .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
+                .registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
+                .registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
+                .create();
+    }
+}
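GsonFactory centralizes the Gson configuration that ApiService, AbstractClient and the assistant service previously built ad hoc, and registers compact string serializers for EdgeUrl, EdgeDomain and EdgeId. A rough usage sketch, not from the changeset; the example URL is made up:

import com.google.gson.Gson;
import nu.marginalia.wmsa.client.GsonFactory;
import nu.marginalia.wmsa.edge.model.EdgeUrl;

class GsonFactoryDemo {
    public static void main(String[] args) throws Exception {
        Gson gson = GsonFactory.get();

        // EdgeUrl is serialized as a plain JSON string rather than an object graph
        EdgeUrl url = new EdgeUrl("https://www.example.com/");
        String json = gson.toJson(url);
        System.out.println(json);                          // "https://www.example.com/"

        EdgeUrl back = gson.fromJson(json, EdgeUrl.class);  // goes through the registered deserializer
        System.out.println(back);
    }
}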
@@ -2,10 +2,7 @@ package nu.marginalia.wmsa.configuration;
 
 import nu.marginalia.wmsa.api.ApiMain;
 import nu.marginalia.wmsa.auth.AuthMain;
-import nu.marginalia.wmsa.configuration.command.Command;
-import nu.marginalia.wmsa.configuration.command.ListCommand;
-import nu.marginalia.wmsa.configuration.command.StartCommand;
-import nu.marginalia.wmsa.configuration.command.VersionCommand;
+import nu.marginalia.wmsa.configuration.command.*;
 import nu.marginalia.wmsa.edge.assistant.EdgeAssistantMain;
 import nu.marginalia.wmsa.edge.dating.DatingMain;
 import nu.marginalia.wmsa.edge.index.EdgeIndexMain;
@@ -82,6 +79,9 @@ public enum ServiceDescriptor {
         MainMapLookup.setMainArguments(args);
         Map<String, Command> functions = Stream.of(new ListCommand(),
                 new StartCommand(),
+                new ConvertCommand(),
+                new LoadCommand(),
+                new ReindexCommand(),
                 new VersionCommand()
         ).collect(Collectors.toMap(c -> c.name, c -> c));
 
@@ -87,7 +87,7 @@ public class WmsaHome {
         final Path home = getHomePath();
 
         return new LanguageModels(
-                home.resolve("model/ngrams-generous-emstr.bin"),
+                home.resolve("model/ngrams.bin"),
                 home.resolve("model/tfreq-new-algo3.bin"),
                 home.resolve("model/opennlp-sentence.bin"),
                 home.resolve("model/English.RDR"),
@@ -95,4 +95,8 @@ public class WmsaHome {
                 home.resolve("model/opennlp-tok.bin"));
     }
 
+    private static final boolean debugMode = Boolean.getBoolean("wmsa-debug");
+    public static boolean isDebug() {
+        return debugMode;
+    }
 }
@@ -0,0 +1,24 @@ ConvertCommand (new file)
+package nu.marginalia.wmsa.configuration.command;
+
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.converting.ConverterMain;
+
+import java.util.Arrays;
+
+public class ConvertCommand extends Command {
+    public ConvertCommand() {
+        super("convert");
+    }
+
+    @Override
+    @SneakyThrows
+    public void execute(String... args) {
+        if (args.length < 2) {
+            System.err.println("Usage: convert plan.yaml");
+            System.exit(255);
+        }
+
+        String[] args2 = Arrays.copyOfRange(args, 1, args.length);
+        ConverterMain.main(args2);
+    }
+}
@@ -0,0 +1,24 @@ LoadCommand (new file)
+package nu.marginalia.wmsa.configuration.command;
+
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.converting.LoaderMain;
+
+import java.util.Arrays;
+
+public class LoadCommand extends Command {
+    public LoadCommand() {
+        super("load");
+    }
+
+    @Override
+    @SneakyThrows
+    public void execute(String... args) {
+        if (args.length < 2) {
+            System.err.println("Usage: load plan.yaml");
+            System.exit(255);
+        }
+
+        String[] args2 = Arrays.copyOfRange(args, 1, args.length);
+        LoaderMain.main(args2);
+    }
+}
@ -0,0 +1,24 @@
+package nu.marginalia.wmsa.configuration.command;
+
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.converting.ReindexTriggerMain;
+
+import java.util.Arrays;
+
+public class ReindexCommand extends Command {
+    public ReindexCommand() {
+        super("reindex");
+    }
+
+    @Override
+    @SneakyThrows
+    public void execute(String... args) {
+        if (args.length < 2) {
+            System.err.println("Usage: reindex host");
+            System.exit(255);
+        }
+
+        String[] args2 = Arrays.copyOfRange(args, 1, args.length);
+        ReindexTriggerMain.main(args2);
+    }
+}

@ -1,6 +1,7 @@
 package nu.marginalia.wmsa.configuration.command;

 import lombok.SneakyThrows;
+import nu.marginalia.wmsa.configuration.ServiceDescriptor;

 import java.util.Arrays;

@ -14,6 +15,12 @@ public class StartCommand extends Command {
     public void execute(String... args) {
         if (args.length < 2) {
             System.err.println("Usage: start service-descriptor");
+            System.err.println();
+            System.err.println("Available services:");
+            System.err.println();
+            for (var d : ServiceDescriptor.values()) {
+                System.err.println("\t"+d.name);
+            }
             System.exit(255);
         }
         var mainMethod = getKind(args[1]).mainClass.getMethod("main", String[].class);

@ -84,6 +84,7 @@ public class DatabaseModule extends AbstractModule {
         config.addDataSourceProperty("cachePrepStmts", "true");
         config.addDataSourceProperty("prepStmtCacheSize", "250");
         config.addDataSourceProperty("prepStmtCacheSqlLimit", "2048");
+
         config.setMaximumPoolSize(100);
         config.setMinimumIdle(10);
         return new HikariDataSource(config);

@ -1,10 +1,10 @@
 package nu.marginalia.wmsa.edge.assistant;

 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
 import lombok.SneakyThrows;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.server.Initialization;
 import nu.marginalia.wmsa.configuration.server.MetricsServer;
 import nu.marginalia.wmsa.configuration.server.Service;
@ -22,7 +22,7 @@ import spark.Spark;
 public class EdgeAssistantService extends Service {

     private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final Gson gson = new GsonBuilder().create();
+    private final Gson gson = GsonFactory.get();
     private final Units units;
     private final MathParser mathParser;
     private final Suggestions suggestions;

@ -0,0 +1,93 @@
+package nu.marginalia.wmsa.edge.assistant.dict;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.google.inject.Inject;
+import nu.marginalia.util.DenseBitMap;
+import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournalFile;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+
+public class NGramBloomFilter {
+    private final DenseBitMap bitMap;
+    private static final PorterStemmer ps = new PorterStemmer();
+    private static final HashFunction hasher = Hashing.murmur3_128(0);
+
+    private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class);
+
+    @Inject
+    public NGramBloomFilter() throws IOException {
+        this(WmsaHome.getLanguageModels());
+    }
+
+    public NGramBloomFilter(LanguageModels lm) throws IOException {
+        this(loadSafely(lm.ngramBloomFilter));
+    }
+
+    private static DenseBitMap loadSafely(Path path) throws IOException {
+        if (Files.isRegularFile(path)) {
+            return DenseBitMap.loadFromFile(path);
+        }
+        else {
+            logger.warn("NGrams file missing " + path);
+            return new DenseBitMap(1);
+        }
+    }
+
+    public NGramBloomFilter(DenseBitMap bitMap) {
+        this.bitMap = bitMap;
+    }
+
+    public boolean isKnownNGram(String word) {
+        long bit = bitForWord(word, bitMap.cardinality);
+
+        return bitMap.get(bit);
+    }
+
+    public static void main(String... args) throws IOException {
+        var filter = convertFromDictionaryFile(new File(args[0]));
+        filter.bitMap.writeToFile(Path.of(args[1]));
+    }
+
+    public static NGramBloomFilter load(Path file) throws IOException {
+        return new NGramBloomFilter(DenseBitMap.loadFromFile(file));
+    }
+
+    public static NGramBloomFilter convertFromDictionaryFile(File file) throws IOException {
+        DenseBitMap bitMap = new DenseBitMap(1024*1024*1024L);
+        AtomicInteger popCount = new AtomicInteger();
+        try (var f = new KeywordLexiconJournalFile(file)) {
+            f.loadFile(data -> {
+                long bit = bitForWord(new String(data), bitMap.cardinality);
+                if (!bitMap.set(bit))
+                    popCount.incrementAndGet();
+            });
+        }
+
+        System.out.println("popcount = " + popCount.get());
+        return new NGramBloomFilter(bitMap);
+    }
+
+    private static final Pattern underscore = Pattern.compile("_");
+
+    private static long bitForWord(String s, long n) {
+        String[] parts = underscore.split(s);
+        long hc = 0;
+        for (String part : parts) {
+            hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong();
+        }
+        return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n;
+    }
+
+}

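A brief usage sketch for the new NGramBloomFilter, assuming the n-gram bloom filter file referenced by LanguageModels exists on disk; the class and method names are taken from the code above, while the demo wrapper itself is hypothetical:

    import nu.marginalia.wmsa.configuration.WmsaHome;
    import nu.marginalia.wmsa.edge.assistant.dict.NGramBloomFilter;

    import java.io.IOException;

    class NGramBloomFilterDemo {
        public static void main(String[] args) throws IOException {
            var filter = new NGramBloomFilter(WmsaHome.getLanguageModels());

            // Each underscore-separated part is Porter-stemmed before hashing,
            // so "search_engines" and "search_engine" should land on the same bit.
            System.out.println(filter.isKnownNGram("search_engine"));
        }
    }

Since this is a bloom filter over a single bitmap, a true result may be a false positive, but a false result reliably means the n-gram was not in the dictionary the filter was built from.
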
@ -1,137 +0,0 @@
-package nu.marginalia.wmsa.edge.assistant.dict;
-
-import ca.rmen.porterstemmer.PorterStemmer;
-import gnu.trove.map.hash.TLongIntHashMap;
-import nu.marginalia.util.language.conf.LanguageModels;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import javax.annotation.Nullable;
-import javax.inject.Inject;
-import javax.inject.Singleton;
-import java.io.*;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Arrays;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-@Singleton
-public class NGramDict {
-
-    private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
-
-    private final Logger logger = LoggerFactory.getLogger(getClass());
-    private static final Pattern separator = Pattern.compile("[_ ]+");
-    private static final PorterStemmer ps = new PorterStemmer();
-
-    private static long fileSize(Path p) throws IOException {
-        return Files.size(p);
-    }
-
-    @Inject
-    public NGramDict(@Nullable LanguageModels models) {
-        if (models == null) {
-            return;
-        }
-
-        if (models.ngramFrequency != null) {
-
-            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.ngramFrequency.toFile())))) {
-
-                wordRates.ensureCapacity((int)(fileSize(models.ngramFrequency)/16));
-
-                for (;;) {
-                    wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
-                }
-            } catch (EOFException eof) {
-                // ok
-            } catch (IOException e) {
-                logger.error("IO Exception reading " + models.ngramFrequency, e);
-            }
-        }
-
-        logger.info("Read {} N-grams frequencies", wordRates.size());
-    }
-
-
-    public static void main(String... args) {
-        if (args.length != 2) {
-            System.err.println("Expected arguments: in-file out-file");
-        }
-        String inFile = args[0];
-        String outFile = args[1];
-
-        var wordPattern = Pattern.compile("\\w+(_\\w+)*").asMatchPredicate();
-        try (var linesStr = Files.lines(Path.of(inFile));
-             var dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(outFile)))
-        ) {
-            linesStr
-                .filter(wordPattern)
-                .mapToLong(NGramDict::getStringHash).forEach(l ->
-            {
-                try {
-                    dos.writeLong(l);
-                } catch (IOException e) {
-                    e.printStackTrace();
-                }
-            });
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-    }
-
-    public static long getStringHash(String s) {
-        String[] strings = separator.split(s);
-        if (s.length() > 1) {
-            byte[][] parts = new byte[strings.length][];
-            for (int i = 0; i < parts.length; i++) {
-                parts[i] = ps.stemWord(strings[i]).getBytes();
-            }
-            return longHash(parts);
-        }
-        else {
-            return longHash(s.getBytes());
-        }
-    }
-    public long getTermFreqHash(long hash) {
-        return wordRates.get(hash);
-    }
-    public long getTermFreq(String s) {
-        return wordRates.get(getStringHash(s));
-    }
-    public long getTermFreqStemmed(String s) {
-        return wordRates.get(longHash(s.getBytes()));
-    }
-
-    public static String getStemmedString(String s) {
-        String[] strings = separator.split(s);
-        if (s.length() > 1) {
-            return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
-        }
-        else {
-            return s;
-        }
-
-    }
-
-    public static long longHash(byte[]... bytesSets) {
-        if (bytesSets == null || bytesSets.length == 0)
-            return 0;
-
-        // https://cp-algorithms.com/string/string-hashing.html
-        int p = 127;
-        long m = (1L<<61)-1;
-        long p_power = 1;
-        long hash_val = 0;
-
-        for (byte[] bytes: bytesSets) {
-            for (byte element : bytes) {
-                hash_val = (hash_val + (element + 1) * p_power) % m;
-                p_power = (p_power * p) % m;
-            }
-        }
-        return hash_val;
-    }
-
-}

@ -0,0 +1,221 @@
+package nu.marginalia.wmsa.edge.assistant.dict;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import gnu.trove.map.hash.TLongIntHashMap;
+import nu.marginalia.util.language.LanguageFilter;
+import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruningFilter;
+import nu.marginalia.wmsa.edge.crawling.CrawlPlanLoader;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+import javax.inject.Inject;
+import javax.inject.Singleton;
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+@Singleton
+public class TermFrequencyDict {
+
+    private final TLongIntHashMap wordRates = new TLongIntHashMap(1_000_000, 0.5f, 0, 0);
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+    private static final Pattern separator = Pattern.compile("[_ ]+");
+    private static final PorterStemmer ps = new PorterStemmer();
+
+    private static final long DOC_COUNT_KEY = ~0L;
+    private static long fileSize(Path p) throws IOException {
+        return Files.size(p);
+    }
+
+    @Inject
+    public TermFrequencyDict(@Nullable LanguageModels models) {
+        if (models == null) {
+            return;
+        }
+
+        if (models.termFrequencies != null) {
+
+            try (var frequencyData = new DataInputStream(new BufferedInputStream(new FileInputStream(models.termFrequencies.toFile())))) {
+
+                wordRates.ensureCapacity((int)(fileSize(models.termFrequencies)/16));
+
+                for (;;) {
+                    wordRates.put(frequencyData.readLong(), (int) frequencyData.readLong());
+                }
+            } catch (EOFException eof) {
+                // ok
+            } catch (IOException e) {
+                logger.error("IO Exception reading " + models.termFrequencies, e);
+            }
+        }
+
+        logger.info("Read {} N-grams frequencies", wordRates.size());
+    }
+
+
+    public int docCount() {
+        int cnt = wordRates.get(DOC_COUNT_KEY);
+
+        if (cnt == 0) {
+            cnt = 11820118; // legacy
+        }
+        return cnt;
+    }
+
+    public static void main(String... args) throws IOException, InterruptedException {
+        if (args.length != 2) {
+            System.err.println("Expected arguments: plan.yaml out-file");
+        }
+        String outFile = args[1];
+
+        var plan = new CrawlPlanLoader().load(Path.of(args[0]));
+
+        ThreadLocal<SentenceExtractor> se = ThreadLocal.withInitial(() -> new SentenceExtractor(WmsaHome.getLanguageModels()));
+        LanguageFilter lf = new LanguageFilter();
+
+        TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
+
+        ForkJoinPool fjp = new ForkJoinPool(24);
+        AtomicInteger docCount = new AtomicInteger();
+
+        for (var domain : plan.domainsIterable()) { // leaks file descriptor, is fine
+
+            if (domain.doc == null)
+                continue;
+
+            fjp.execute(() -> {
+
+                for (var doc : domain.doc) {
+                    if (doc.documentBody == null)
+                        continue;
+                    docCount.incrementAndGet();
+
+                    Document parsed = Jsoup.parse(doc.documentBody);
+                    parsed.body().filter(new DomPruningFilter(0.5));
+
+                    DocumentLanguageData dld = se.get().extractSentences(parsed);
+
+                    if (lf.dictionaryAgreement(dld) < 0.1) {
+                        return;
+                    }
+
+                    Set<String> words = new HashSet<>(10_000);
+
+                    for (var sent : dld.sentences) {
+                        for (var word : sent) {
+                            words.add(word.stemmed());
+                        }
+                    }
+
+                    fjp.execute(() -> {
+                        synchronized (counts) {
+                            for (var word : words) {
+                                counts.adjustOrPutValue(longHash(word.getBytes()), 1, 1);
+                            }
+                        }
+                    });
+
+                }
+            });
+        }
+
+        fjp.shutdown();
+        fjp.awaitTermination(10, TimeUnit.SECONDS);
+
+        try (var dos = new DataOutputStream(Files.newOutputStream(Path.of(outFile)))) {
+            synchronized (counts) {
+                counts.put(DOC_COUNT_KEY, docCount.get());
+
+                counts.forEachEntry((hash, cnt) -> {
+                    try {
+                        dos.writeLong(hash);
+                        dos.writeLong(cnt);
+                    } catch (IOException e) {
+                        throw new RuntimeException(e);
+                    }
+                    return true;
+                });
+            }
+        }
+
+        System.out.println(docCount.get());
+//
+//        counts.forEachEntry((w,c) -> {
+//            if (c > 3L) {
+//                System.out.println(w + ":" + c);
+//            }
+//            return true;
+//        });
+
+    }
+
+    public static long getStringHash(String s) {
+        String[] strings = separator.split(s);
+        if (s.length() > 1) {
+            byte[][] parts = new byte[strings.length][];
+            for (int i = 0; i < parts.length; i++) {
+                parts[i] = ps.stemWord(strings[i]).getBytes();
+            }
+            return longHash(parts);
+        }
+        else {
+            return longHash(s.getBytes());
+        }
+    }
+    public long getTermFreqHash(long hash) {
+        return wordRates.get(hash);
+    }
+    public long getTermFreq(String s) {
+        return wordRates.get(getStringHash(s));
+    }
+    public long getTermFreqStemmed(String s) {
+        return wordRates.get(longHash(s.getBytes()));
+    }
+
+    public static String getStemmedString(String s) {
+        String[] strings = separator.split(s);
+        if (s.length() > 1) {
+            return Arrays.stream(strings).map(ps::stemWord).collect(Collectors.joining("_"));
+        }
+        else {
+            return s;
+        }
+
+    }
+
+    public static long longHash(byte[]... bytesSets) {
+        if (bytesSets == null || bytesSets.length == 0)
+            return 0;
+
+        // https://cp-algorithms.com/string/string-hashing.html
+        int p = 127;
+        long m = (1L<<61)-1;
+        long p_power = 1;
+        long hash_val = 0;
+
+        for (byte[] bytes: bytesSets) {
+            for (byte element : bytes) {
+                hash_val = (hash_val + (element + 1) * p_power) % m;
+                p_power = (p_power * p) % m;
+            }
+        }
+        return hash_val;
+    }
+
+}

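TermFrequencyDict keys its counts by the polynomial string hash from the cp-algorithms page referenced in longHash. A standalone sketch of the same recurrence, reduced to a single byte array for clarity (the method name here is illustrative, not part of the repository):

    // hash = sum_i (b_i + 1) * p^i  (mod m), with p = 127 and m = 2^61 - 1.
    // Mirrors longHash above for a single byte[]; inherits the same Java integer semantics.
    static long polyHash(byte[] bytes) {
        int p = 127;
        long m = (1L << 61) - 1;
        long pPower = 1;
        long hash = 0;
        for (byte b : bytes) {
            hash = (hash + (b + 1) * pPower) % m;
            pPower = (pPower * p) % m;
        }
        return hash;
    }

For multi-word terms the class feeds each stemmed part into the same running power sequence, so the result is equivalent to hashing the concatenation of the stemmed parts without the underscore separators.
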
@ -6,7 +6,7 @@ import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
 import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -2,8 +2,8 @@ package nu.marginalia.wmsa.edge.assistant.suggest;

 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.assistant.dict.SpellChecker;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature;
 import org.apache.commons.collections4.trie.PatriciaTrie;
 import org.slf4j.Logger;
@ -21,7 +21,7 @@ import java.util.stream.Stream;

 public class Suggestions {
     private final PatriciaTrie<String> suggestionsTrie;
-    private final NGramDict nGramDict;
+    private final TermFrequencyDict termFrequencyDict;
     private final SpellChecker spellChecker;

     private static final Pattern suggestionPattern = Pattern.compile("^[a-zA-Z0-9]+( [a-zA-Z0-9]+)*$");
@ -31,12 +31,12 @@ public class Suggestions {
     @Inject
     public Suggestions(@Named("suggestions-file") Path suggestionsFile,
                        SpellChecker spellChecker,
-                       NGramDict dict
+                       TermFrequencyDict dict
                        ) {
         this.spellChecker = spellChecker;

         suggestionsTrie = loadSuggestions(suggestionsFile);
-        nGramDict = dict;
+        termFrequencyDict = dict;

         logger.info("Loaded {} suggestions", suggestionsTrie.size());
     }
@ -138,7 +138,7 @@ public class Suggestions {
         }

         Map<String, Long> scach = new HashMap<>(512);
-        Function<String, Long> valr = s -> -nGramDict.getTermFreqHash(scach.computeIfAbsent(s, NGramDict::getStringHash));
+        Function<String, Long> valr = s -> -termFrequencyDict.getTermFreqHash(scach.computeIfAbsent(s, TermFrequencyDict::getStringHash));

         return Stream.iterate(start.getKey(), Objects::nonNull, suggestionsTrie::nextKey)
                 .takeWhile(s -> s.startsWith(prefix))

@ -22,7 +22,7 @@ import java.util.List;
 public class ConverterMain {

     private final Logger logger = LoggerFactory.getLogger(getClass());
-    private final CrawledInstructionWriter instructionWriter;
+    private final LoadInstructionWriter instructionWriter;

     public static void main(String... args) throws IOException {

@ -47,12 +47,12 @@ public class ConverterMain {
             Gson gson
             ) throws Exception {

-        instructionWriter = new CrawledInstructionWriter(plan.process.getDir(), gson);
+        instructionWriter = new LoadInstructionWriter(plan.process.getDir(), gson);

         logger.info("Starting pipe");

         try (WorkLog processLog = plan.createProcessWorkLog()) {
-            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 48, 4, 2) {
+            var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Crawler", 20, 4, 2) {

                 @Override
                 protected ProcessingInstructions onProcess(CrawledDomain domainData) {
@ -73,12 +73,7 @@ public class ConverterMain {

             };

-            plan.forEachCrawledDomain(domain -> {
-                if (!processLog.isJobFinished(domain.id)) {
-                    logger.info("{} - {}", domain.domain, domain.id);
-                    pipe.accept(domain);
-                }
-            });
+            plan.forEachCrawledDomain(id -> !processLog.isJobFinished(id), pipe::accept);

             pipe.join();
         }

@ -1,16 +1,17 @@
 package nu.marginalia.wmsa.edge.converting;

-import com.google.gson.*;
+import com.google.gson.Gson;
 import com.google.inject.AbstractModule;
 import com.google.inject.name.Names;
-import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
 import nu.marginalia.util.language.conf.LanguageModels;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexLocalService;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
 import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan;
-import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;

-import java.net.URISyntaxException;
+import java.nio.file.Path;

 public class ConverterModule extends AbstractModule {

@ -31,24 +32,20 @@ public class ConverterModule extends AbstractModule {
         bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
         bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

+        if (null != System.getProperty("local-index-path")) {
+            bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path")));
+            bind(EdgeIndexWriterClient.class).to(EdgeIndexLocalService.class);
+        }
+        else {
+            bind(EdgeIndexWriterClient.class).to(EdgeIndexClient.class);
+        }
+
         bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
     }

     private Gson createGson() {
+        return GsonFactory.get();
-        return new GsonBuilder()
-                .registerTypeAdapter(EdgeUrl.class, (JsonSerializer<EdgeUrl>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
-                .registerTypeAdapter(EdgeDomain.class, (JsonSerializer<EdgeDomain>) (src, typeOfSrc, context) -> new JsonPrimitive(src.toString()))
-                .registerTypeAdapter(EdgeUrl.class, (JsonDeserializer<EdgeUrl>) (json, typeOfT, context) -> {
-                    try {
-                        return new EdgeUrl(json.getAsString());
-                    } catch (URISyntaxException e) {
-                        throw new JsonParseException("URL Parse Exception", e);
-                    }
-                })
-                .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
-                .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
-                .create();
     }

 }

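The new local-index-path branch above lets the converter write directly to a local index service instead of going through the remote index client. A hedged sketch of how that switch could be exercised; the property name and class bindings come from the diff, while the surrounding wiring shown here is illustrative only:

    // Normally this would be supplied on the JVM command line as -Dlocal-index-path=...;
    // setting it programmatically before the injector is created has the same effect.
    System.setProperty("local-index-path", "/where/the/local/index/lives");

    // With the property set, ConverterModule#configure binds EdgeIndexWriterClient to
    // EdgeIndexLocalService and exposes the path via @Named("local-index-path");
    // without it, the binding falls back to EdgeIndexClient.
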
@ -1,62 +0,0 @@
-package nu.marginalia.wmsa.edge.converting;
-
-import com.github.luben.zstd.ZstdOutputStream;
-import com.google.gson.Gson;
-import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.io.BufferedOutputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.List;
-
-public class CrawledInstructionWriter {
-    private final Path outputDir;
-    private final Gson gson;
-    private static final Logger logger = LoggerFactory.getLogger(CrawledInstructionWriter.class);
-
-    public CrawledInstructionWriter(Path outputDir, Gson gson) {
-        this.outputDir = outputDir;
-        this.gson = gson;
-
-        if (!Files.isDirectory(outputDir)) {
-            throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
-        }
-    }
-
-    public String accept(String id, List<Instruction> instructionList) throws IOException {
-        Path outputFile = getOutputFile(id);
-
-        if (Files.exists(outputFile)) {
-            Files.delete(outputFile);
-        }
-
-        try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
-            logger.info("Writing {} - {}", id, instructionList.size());
-
-            for (var instr : instructionList) {
-                outputStream.append(instr.tag().name());
-                outputStream.append(' ');
-                gson.toJson(instr, outputStream);
-                outputStream.append('\n');
-            }
-        }
-
-        return outputFile.getFileName().toString();
-    }
-
-    private Path getOutputFile(String id) throws IOException {
-        String first = id.substring(0, 2);
-        String second = id.substring(2, 4);
-
-        Path destDir = outputDir.resolve(first).resolve(second);
-        if (!Files.exists(destDir)) {
-            Files.createDirectories(destDir);
-        }
-        return destDir.resolve(id + ".pzstd");
-    }
-}

@ -2,11 +2,10 @@ package nu.marginalia.wmsa.edge.converting;

 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.configuration.server.Context;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
 import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;

 import java.io.IOException;
 import java.nio.file.Files;
@ -76,9 +75,8 @@ public class LinkKeywordLoaderMain {

            // System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);

-            indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
-                    new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
-            ).blockingSubscribe();
+            indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId),
+                    new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0);
         }

         lastLine = urlKeyword.url;

@ -0,0 +1,121 @@
+package nu.marginalia.wmsa.edge.converting;
+
+import com.github.luben.zstd.ZstdOutputStream;
+import com.google.gson.Gson;
+import nu.marginalia.wmsa.edge.converting.interpreter.Instruction;
+import nu.marginalia.wmsa.edge.converting.interpreter.Interpreter;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DomainLink;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocument;
+import nu.marginalia.wmsa.edge.converting.interpreter.instruction.LoadProcessedDocumentWithError;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedOutputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+public class LoadInstructionWriter {
+
+    private final Path outputDir;
+    private final Gson gson;
+    private static final Logger logger = LoggerFactory.getLogger(LoadInstructionWriter.class);
+    public LoadInstructionWriter(Path outputDir, Gson gson) {
+        this.outputDir = outputDir;
+        this.gson = gson;
+
+        if (!Files.isDirectory(outputDir)) {
+            throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
+        }
+    }
+    public String accept(String id, List<Instruction> instructionList) throws IOException {
+        Path outputFile = getOutputFile(id);
+
+        if (Files.exists(outputFile)) {
+            Files.delete(outputFile);
+        }
+
+        try (var outputStream = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outputFile.toFile()))))) {
+
+            SummarizingInterpreter summary = new SummarizingInterpreter(instructionList);
+            logger.info("Writing {} - {} - {}", id, instructionList.size(), summary);
+
+            for (var instr : instructionList) {
+                outputStream.append(instr.tag().name());
+                outputStream.append(' ');
+                gson.toJson(instr, outputStream);
+                outputStream.append('\n');
+            }
+        }
+
+        return outputFile.getFileName().toString();
+    }
+
+    private Path getOutputFile(String id) throws IOException {
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = outputDir.resolve(first).resolve(second);
+        if (!Files.exists(destDir)) {
+            Files.createDirectories(destDir);
+        }
+        return destDir.resolve(id + ".pzstd");
+    }
+
+    private static class SummarizingInterpreter implements Interpreter {
+
+        private SummarizingInterpreter(List<Instruction> instructions) {
+            for (var i : instructions) {
+                i.apply(this);
+            }
+        }
+
+        private String domainName;
+        private int ok = 0;
+        private int error = 0;
+
+        public String toString() {
+            return String.format("%s - %d %d", domainName, ok, error);
+        }
+
+        @Override
+        public void loadUrl(EdgeUrl[] url) {}
+
+        @Override
+        public void loadDomain(EdgeDomain[] domain) {}
+
+        @Override
+        public void loadRssFeed(EdgeUrl[] rssFeed) {}
+
+        @Override
+        public void loadDomainLink(DomainLink[] links) {}
+
+        @Override
+        public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
+            this.domainName = domain.toString();
+        }
+
+        @Override
+        public void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {
+            ok++;
+        }
+
+        @Override
+        public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {
+            error++;
+        }
+
+        @Override
+        public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {}
+
+        @Override
+        public void loadDomainRedirect(DomainLink link) {}
+    }
+}

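LoadInstructionWriter emits one instruction per line, tag name first and the gson body after a single space, inside a zstd stream. A minimal read-back sketch under those assumptions (the repository's actual reader is ConvertedDomainReader, whose implementation is not shown in this diff):

    import com.github.luben.zstd.ZstdInputStream;

    import java.io.BufferedReader;
    import java.io.FileInputStream;
    import java.io.IOException;
    import java.io.InputStreamReader;

    static void dumpInstructions(String path) throws IOException {
        try (var reader = new BufferedReader(new InputStreamReader(
                new ZstdInputStream(new FileInputStream(path))))) {
            String line;
            while ((line = reader.readLine()) != null) {
                int sep = line.indexOf(' ');
                String tag = line.substring(0, sep);     // instruction tag name
                String json = line.substring(sep + 1);   // gson-serialized instruction body
                System.out.println(tag + " -> " + json.length() + " bytes of JSON");
            }
        }
    }
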
@ -27,7 +27,6 @@ public class LoaderMain {

     private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class);

-    private final Path processDir;
     private final EdgeCrawlPlan plan;
     private final ConvertedDomainReader instructionsReader;
     private final LoaderFactory loaderFactory;
@ -59,7 +58,6 @@ public class LoaderMain {
                       LoaderFactory loaderFactory,
                       EdgeIndexClient indexClient) {

-        this.processDir = plan.process.getDir();
         this.plan = plan;
         this.instructionsReader = instructionsReader;
         this.loaderFactory = loaderFactory;
@ -106,8 +104,13 @@ public class LoaderMain {
         public void run() {
             long startTime = System.currentTimeMillis();
             for (var i : instructionList) {
+                try {
                     i.apply(loader);
                 }
+                catch (Exception ex) {
+                    logger.error("Failed to load instruction {}", i);
+                }
+            }

             loader.finish();
             long loadTime = System.currentTimeMillis() - startTime;

@ -6,7 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.util.DenseBitMap;
 import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.wmsa.configuration.WmsaHome;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.wmsa.edge.assistant.dict.TermFrequencyDict;
 import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.apache.logging.log4j.util.Strings;
@ -36,7 +36,7 @@ public class AnchorTextExtractor {
     // de-duplicating billions of shuffled (url, word) tuples on limited hardware
     private final DenseBitMap deduplicateHashBitset = new DenseBitMap(DenseBitMap.MAX_CAPACITY_2GB_16BN_ITEMS);

-    private final NGramDict ngramDict = new NGramDict(WmsaHome.getLanguageModels());
+    private final TermFrequencyDict ngramDict = new TermFrequencyDict(WmsaHome.getLanguageModels());

     public AnchorTextExtractor(Predicate<String> includeDomainPredicate,
                                Predicate<EdgeUrl> includeUrlPredicate,

@ -4,23 +4,22 @@ import com.google.inject.Inject;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.configuration.server.Context;
 import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
-import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
+import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient;
-import nu.marginalia.wmsa.edge.model.EdgeId;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.Arrays;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;

 public class IndexLoadKeywords implements Runnable {
-    private final EdgeIndexClient client;
     private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class);
-    private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);

-    private record InsertTask(int urlId, int domainId, EdgePageWordSet wordSet) {}
+    private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
+    private final EdgeIndexWriterClient client;
+
+    private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {}

     private final Thread runThread;
     private volatile boolean canceled = false;
@ -28,7 +27,7 @@ public class IndexLoadKeywords implements Runnable {
     private static final int index = Integer.getInteger("keyword-index", 1);

     @Inject
-    public IndexLoadKeywords(EdgeIndexClient client) {
+    public IndexLoadKeywords(EdgeIndexWriterClient client) {
         this.client = client;
         runThread = new Thread(this, getClass().getSimpleName());
         runThread.start();
@ -39,7 +38,7 @@ public class IndexLoadKeywords implements Runnable {
         while (!canceled) {
             var data = insertQueue.poll(1, TimeUnit.SECONDS);
             if (data != null) {
-                client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
+                client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index);
             }
         }
     }
@ -53,15 +52,13 @@ public class IndexLoadKeywords implements Runnable {
         int domainId = loaderData.getDomainId(url.domain);
         int urlId = loaderData.getUrlId(url);

-        if (urlId < 0 || domainId < 0) {
+        if (urlId <= 0 || domainId <= 0) {
             logger.warn("Failed to get IDs for {} -- d={},u={}", url, domainId, urlId);
+            return;
         }

-        var ws = new EdgePageWordSet();
-        for (var doc : words) {
-            ws.append(doc.block(), Arrays.asList(doc.keywords()));
-        }
-
+        for (var ws : words) {
             insertQueue.put(new InsertTask(urlId, domainId, ws));
         }
+    }
 }

@ -27,6 +27,9 @@ public class Loader implements Interpreter {
     private final List<LoadProcessedDocument> processedDocumentList;
     private final List<LoadProcessedDocumentWithError> processedDocumentWithErrorList;

+    private final List<EdgeDomain> deferredDomains = new ArrayList<>();
+    private final List<EdgeUrl> deferredUrls = new ArrayList<>();
+
     public final LoaderData data;

     public Loader(int sizeHint,
@ -72,28 +75,54 @@ public class Loader implements Interpreter {
     @Override
     public void loadDomainLink(DomainLink[] links) {
         logger.debug("loadDomainLink({})", links, null);
-        sqlLoadDomainLinks.load(links);
+        sqlLoadDomainLinks.load(data, links);
     }

     @Override
     public void loadProcessedDomain(EdgeDomain domain, EdgeDomainIndexingState state, String ip) {
         logger.debug("loadProcessedDomain({}, {}, {})", domain, state, ip);

         sqlLoadProcessedDomain.load(data, domain, state, ip);
     }

     @Override
     public void loadProcessedDocument(LoadProcessedDocument document) {
+        deferralCheck(document.url());
+
         processedDocumentList.add(document);
     }

     @Override
     public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) {
+        deferralCheck(document.url());
+
         processedDocumentWithErrorList.add(document);
     }

+    private void deferralCheck(EdgeUrl url) {
+        if (data.getDomainId(url.domain) <= 0)
+            deferredDomains.add(url.domain);
+
+        if (data.getUrlId(url) <= 0)
+            deferredUrls.add(url);
+    }
+
     @Override
     public void loadKeywords(EdgeUrl url, DocumentKeywords[] words) {
         logger.debug("loadKeywords(#{})", words.length);
+
+        // This is a bit of a bandaid safeguard against a bug in
+        // in the converter, shouldn't be necessary in the future
+        if (!deferredDomains.isEmpty()) {
+            loadDomain(deferredDomains.toArray(EdgeDomain[]::new));
+            deferredDomains.clear();
+        }
+
+        if (!deferredUrls.isEmpty()) {
+            loadUrl(deferredUrls.toArray(EdgeUrl[]::new));
+            deferredUrls.clear();
+        }
+
         try {
             indexLoadKeywords.load(data, url, words);
         } catch (InterruptedException e) {

@ -40,13 +40,21 @@ public class SqlLoadDomainLinks {
         }
     }

-    public void load(DomainLink[] links) {
+    public void load(LoaderData data, DomainLink[] links) {

         try (var connection = dataSource.getConnection();
+             var nukeExistingLinksForDomain =
+                     connection.prepareStatement("""
+                             DELETE FROM EC_DOMAIN_LINK WHERE SOURCE_DOMAIN_ID=?
+                             """);
             var stmt =
                     connection.prepareCall("CALL INSERT_LINK(?,?)"))
         {
+            connection.setAutoCommit(false);
+            nukeExistingLinksForDomain.setInt(1, data.getDomainId(links[0].from()));
+            nukeExistingLinksForDomain.executeUpdate();
+
             for (DomainLink link : links) {
                 stmt.setString(1, link.from().toString());
                 stmt.setString(2, link.to().toString());
@ -60,6 +68,10 @@ public class SqlLoadDomainLinks {
                     logger.warn("load({}) -- bad row count {}", links[rv], ret[rv]);
                 }
             }
+
+            connection.commit();
+            connection.setAutoCommit(true);
+
         }
         catch (SQLException ex) {
             logger.warn("SQL error inserting domain links", ex);

@ -41,16 +41,18 @@ public class SqlLoadDomains {

         try (var connection = dataSource.getConnection()) {
             try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {
+                connection.setAutoCommit(false);
                 insertCall.setString(1, domain.toString());
                 insertCall.setString(2, domain.domain);
-                insertCall.addBatch();

                 var ret = insertCall.executeUpdate();
+                connection.commit();
                 if (ret < 0) {
-                    logger.warn("load({}) -- bad row count {}", domain, ret);
+                    logger.warn("load({}) -- bad return status {}", domain, ret);
                 }

-                findIdForTargetDomain(connection, data);
+                findIdForDomain(connection, data, domain);
+                connection.setAutoCommit(true);
             }
         }
         catch (SQLException ex) {
@ -67,30 +69,48 @@ public class SqlLoadDomains {

             try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) {

+                int cnt = 0; int batchOffset = 0;
                 for (var domain : domains) {
                     insertCall.setString(1, domain.toString());
                     insertCall.setString(2, domain.domain);
                     insertCall.addBatch();
-                }
-                var ret = insertCall.executeBatch();

-                for (int rv = 0; rv < domains.length; rv++) {
-                    if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
-                        logger.warn("load({}) -- bad row count {}", domains[rv], ret[rv]);
+                    if (++cnt == 1000) {
+                        var ret = insertCall.executeBatch();
+                        connection.commit();
+
+                        for (int rv = 0; rv < cnt; rv++) {
+                            if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
+                                logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]);
+                            }
+                        }
+
+                        cnt = 0;
+                        batchOffset += 1000;
+                    }
+                }
+                if (cnt > 0) {
+                    var ret = insertCall.executeBatch();
+                    connection.commit();
+                    for (int rv = 0; rv < cnt; rv++) {
+                        if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
+                            logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]);
+                        }
                     }
                 }

             }
             connection.commit();
             connection.setAutoCommit(true);
-            findIdForTargetDomain(connection, data);
+            findIdForDomain(connection, data, domains);
         }
         catch (SQLException ex) {
             logger.warn("SQL error inserting domains", ex);
         }
     }

-    void findIdForTargetDomain(Connection connection, LoaderData data) {
+    void findIdForDomain(Connection connection, LoaderData data, EdgeDomain... domains) {
         if (data.getTargetDomain() == null || data.getDomainId(data.getTargetDomain()) > 0) {
             return;
         }
@ -98,14 +118,39 @@ public class SqlLoadDomains {
         try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
         {
-            var targetDomain = data.getTargetDomain();
-            query.setString(1, targetDomain.toString());
-            var rsp = query.executeQuery();
-            if (rsp.next()) {
-                data.addDomain(targetDomain, rsp.getInt(1));
-            }
-            else {
-                logger.warn("load() -- could not find ID for target domain {}", targetDomain);
+            for (var domain : domains) {
+                if (data.getDomainId(domain) > 0)
+                    continue;
+
+                query.setString(1, domain.toString());
+                var rsp = query.executeQuery();
+                if (rsp.next()) {
+                    data.addDomain(domain, rsp.getInt(1));
+                } else {
+                    logger.warn("load() -- could not find ID for target domain {}", domain);
+                }
+            }
+        }
+        catch (SQLException ex) {
+            logger.warn("SQL error finding id for domain", ex);
+        }
+    }
+
+    void loadAdditionalDomains(Connection connection, LoaderData data, EdgeDomain[] domains) {
+
+        try (var query = connection.prepareStatement("SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?"))
+        {
+            for (var domain : domains) {
+
+                if (data.getDomainId(domain) == 0) continue;
+
+                query.setString(1, domain.toString());
+                var rsp = query.executeQuery();
+                if (rsp.next()) {
+                    data.addDomain(domain, rsp.getInt(1));
+                } else {
+                    logger.warn("load() -- could not find ID for target domain {}", domain);
+                }
             }
         }
         catch (SQLException ex) {

@@ -60,13 +60,15 @@ public class SqlLoadProcessedDocument {
 }

 public void load(LoaderData data, List<LoadProcessedDocument> documents) {

     try (var conn = dataSource.getConnection();
          var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) {
         conn.setAutoCommit(false);

+        int cnt = 0; int batchOffset = 0;
         for (var doc : documents) {
             int urlId = data.getUrlId(doc.url());
-            if (urlId < 0) {
+            if (urlId <= 0) {
                 logger.warn("Failed to resolve ID for URL {}", doc.url());
                 return;
             }
@@ -81,25 +83,46 @@ public class SqlLoadProcessedDocument {
             stmt.setDouble(8, doc.quality());
             stmt.setInt(9, (int) doc.hash());
             stmt.addBatch();
-        }
+            if (++cnt == 100) {
                 var ret = stmt.executeBatch();

-        for (int rv = 0; rv < documents.size(); rv++) {
-            if (ret[rv] < 1 && ret[rv] != SUCCESS_NO_INFO) {
-                logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
-            }
-        }

                 conn.commit();

+                for (int rv = 0; rv < cnt; rv++) {
+                    if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
+                        logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
+                    }
+                }

+                cnt = 0;
+                batchOffset += 100;
+            }
+        }
+        if (cnt > 0) {
+            var ret = stmt.executeBatch();
+            conn.commit();
+            for (int rv = 0; rv < cnt; rv++) {
+                if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
+                    logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
+                }
+            }
+        }

+        conn.setAutoCommit(true);

     } catch (SQLException ex) {
         logger.warn("SQL error inserting document", ex);
     }
 }

 public void loadWithError(LoaderData data, List<LoadProcessedDocumentWithError> documents) {

     try (var conn = dataSource.getConnection();
          var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT_BAD(?, ?)")) {

+        conn.setAutoCommit(false);

+        int cnt = 0; int batchOffset = 0;
         for (var doc : documents) {
             int urlId = data.getUrlId(doc.url());
             if (urlId < 0) {
@@ -110,13 +133,32 @@ public class SqlLoadProcessedDocument {
             stmt.setInt(1, urlId);
             stmt.setString(2, doc.state().name());
             stmt.addBatch();
-        }
+            if (++cnt == 100) {
                 var ret = stmt.executeBatch();
-        for (int rv = 0; rv < documents.size(); rv++) {
+                conn.commit();

+                for (int rv = 0; rv < cnt; rv++) {
                     if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
-                        logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]);
+                        logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
                     }
                 }

+                cnt = 0;
+                batchOffset += 100;
+            }
+        }
+        if (cnt > 0) {
+            var ret = stmt.executeBatch();
+            conn.commit();
+            for (int rv = 0; rv < cnt; rv++) {
+                if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
+                    logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]);
+                }
+            }
+        }

+        conn.setAutoCommit(true);
     } catch (SQLException ex) {
         logger.warn("SQL error inserting failed document", ex);
     }
@@ -14,6 +14,7 @@ public class SqlLoadProcessedDomain {
     private final HikariDataSource dataSource;
     private final SqlLoadDomains loadDomains;
     private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class);

     @Inject
     public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) {
         this.dataSource = dataSource;
@@ -54,6 +55,7 @@ public class SqlLoadProcessedDomain {
             initCall.setInt(3, data.getDomainId(domain));
             initCall.setString(4, ip);
             int rc = initCall.executeUpdate();
+            conn.commit();
             if (rc < 1) {
                 logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc);
             }
@@ -75,6 +77,7 @@ public class SqlLoadProcessedDomain {
             stmt.setString(1, link.to().toString());
             stmt.setString(2, link.from().toString());
             int rc = stmt.executeUpdate();
+            conn.commit();
             if (rc != 1) {
                 logger.warn("loadAlias({}) - unexpected row count {}", link, rc);
             }
@@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import com.google.inject.Inject;
 import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -11,6 +12,8 @@ import org.slf4j.LoggerFactory;
 import java.nio.charset.StandardCharsets;
 import java.sql.SQLException;
 import java.sql.Types;
+import java.util.HashSet;
+import java.util.Set;

 import static java.sql.Statement.SUCCESS_NO_INFO;

@@ -46,17 +49,22 @@ public class SqlLoadUrls {
 }

 public void load(LoaderData data, EdgeUrl[] urls) {
+    Set<EdgeDomain> affectedDomains = new HashSet<>();

     try (var conn = dataSource.getConnection();
          var insertCall = conn.prepareCall("CALL INSERT_URL(?,?,?,?,?,?)");
          var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=?")
          )
     {
         conn.setAutoCommit(false);

+        int cnt = 0; int batchOffset = 0;
         for (var url : urls) {
             if (url.path.length() >= 255) {
                 logger.warn("Skipping bad URL {}", url);
                 continue;
             }
+            affectedDomains.add(url.domain);

             insertCall.setString(1, url.proto);
             insertCall.setString(2, url.domain.toString());
@@ -70,22 +78,39 @@ public class SqlLoadUrls {
             insertCall.setString(5, url.param);
             insertCall.setLong(6, hashPath(url.path, url.param));
             insertCall.addBatch();
-        }
+            if (cnt++ == 250) {
                 var ret = insertCall.executeBatch();
-        for (int rv = 0; rv < ret.length; rv++) {
+                conn.commit();

+                for (int rv = 0; rv < cnt; rv++) {
                     if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
-                        logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]);
+                        logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]);
                     }
                 }

+                batchOffset += cnt;
+                cnt = 0;
+            }
+        }
+        if (cnt > 0) {
+            var ret = insertCall.executeBatch();
             conn.commit();

+            for (int rv = 0; rv < cnt; rv++) {
+                if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) {
+                    logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]);
+                }
+            }
+        }

         conn.setAutoCommit(true);

-        var targetDomain = data.getTargetDomain();
-        queryCall.setInt(1, data.getDomainId(targetDomain));
+        for (var domain : affectedDomains) {
+            queryCall.setInt(1, data.getDomainId(domain));

             var rsp = queryCall.executeQuery();
+            rsp.setFetchSize(1000);

             while (rsp.next()) {
                 int urlId = rsp.getInt(1);
@@ -93,7 +118,8 @@ public class SqlLoadUrls {
                 String path = rsp.getString(3);
                 String param = rsp.getString(4);

-                data.addUrl(new EdgeUrl(proto, targetDomain, null, path, param), urlId);
+                data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId);
+            }
         }

     }
@@ -17,6 +17,7 @@ public class DisqualifiedException extends Exception {
         LANGUAGE,
         STATUS,
         QUALITY,
-        ACCEPTABLE_ADS
+        ACCEPTABLE_ADS,
+        FORBIDDEN
     }
 }
@@ -15,6 +15,7 @@ public class ProcessedDocument {
     public EdgePageWordSet words;

     public EdgeUrlState state;
+    public String stateReason;

     public OptionalDouble quality() {
         if (details != null) {
@@ -70,11 +70,22 @@ public class DocumentProcessor {
         this.summaryExtractor = summaryExtractor;
     }

+    public ProcessedDocument makeDisqualifiedStub(CrawledDocument crawledDocument) {
+        ProcessedDocument ret = new ProcessedDocument();

+        try {
+            ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.url = getDocumentUrl(crawledDocument);
+        }
+        catch (Exception ex) {}

+        return ret;
+    }
     public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
         ProcessedDocument ret = new ProcessedDocument();

         try {
-            ret.url = new EdgeUrl(crawledDocument.url);
+            ret.url = getDocumentUrl(crawledDocument);
             ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);

             if (ret.state == EdgeUrlState.OK) {
@@ -86,10 +97,6 @@ public class DocumentProcessor {
                 if (isAcceptedContentType(crawledDocument)) {
                     var detailsWords = createDetails(crawledDomain, crawledDocument);

-                    if (detailsWords.details().quality < minDocumentQuality) {
-                        throw new DisqualifiedException(DisqualificationReason.QUALITY);
-                    }

                     ret.details = detailsWords.details();
                     ret.words = detailsWords.words();
                 }
@@ -103,17 +110,31 @@ public class DocumentProcessor {
         }
         catch (DisqualifiedException ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
+            ret.stateReason = ex.reason.toString();
             logger.debug("Disqualified {}: {}", ret.url, ex.reason);
         }
         catch (Exception ex) {
             ret.state = EdgeUrlState.DISQUALIFIED;
-            logger.info("Failed to convert " + ret.url, ex);
+            logger.info("Failed to convert " + crawledDocument.url, ex);
             ex.printStackTrace();
         }

         return ret;
     }

+    private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument)
+            throws URISyntaxException
+    {
+        if (crawledDocument.canonicalUrl != null) {
+            try {
+                return new EdgeUrl(crawledDocument.canonicalUrl);
+            }
+            catch (URISyntaxException ex) { /* fallthrough */ }
+        }

+        return new EdgeUrl(crawledDocument.url);
+    }

     public static boolean isAcceptedContentType(CrawledDocument crawledDocument) {
         if (crawledDocument.contentType == null) {
             return false;
@@ -141,27 +162,44 @@ public class DocumentProcessor {
     private DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
             throws DisqualifiedException, URISyntaxException {

-        var doc = Jsoup.parse(crawledDocument.documentBody);
+        Document doc = Jsoup.parse(crawledDocument.documentBody);

         if (AcceptableAds.hasAcceptableAdsTag(doc)) {
             throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
         }
+        if (doc.select("meta[name=robots]").attr("content").contains("noindex")) {
+            throw new DisqualifiedException(DisqualificationReason.FORBIDDEN);
+        }

-        var dld = sentenceExtractor.extractSentences(doc.clone());
+        Document prunedDoc = doc.clone();
+        prunedDoc.body().filter(new DomPruningFilter(0.5));

+        var dld = sentenceExtractor.extractSentences(prunedDoc);

         checkDocumentLanguage(dld);

         var ret = new ProcessedDocumentDetails();

-        ret.description = getDescription(doc);
         ret.length = getLength(doc);
         ret.standard = getHtmlStandard(doc);
         ret.title = titleExtractor.getTitleAbbreviated(doc, dld, crawledDocument.url);
-        ret.features = featureExtractor.getFeatures(crawledDomain, doc);
         ret.quality = documentValuator.getQuality(ret.standard, doc, dld);
         ret.hashCode = HashCode.fromString(crawledDocument.documentBodyHash).asLong();

-        var words = getWords(dld);
+        final boolean doSimpleProcessing = ret.quality < minDocumentQuality;

+        EdgePageWordSet words;
+        if (doSimpleProcessing) {
+            ret.features = Set.of(HtmlFeature.UNKNOWN);
+            words = keywordExtractor.extractKeywordsMinimal(dld);
+            ret.description = "";
+        }
+        else {
+            ret.features = featureExtractor.getFeatures(crawledDomain, doc, dld);
+            words = keywordExtractor.extractKeywords(dld);
+            ret.description = getDescription(doc);
+        }

         var url = new EdgeUrl(crawledDocument.url);
         addMetaWords(ret, url, crawledDomain, words);
@@ -192,7 +230,6 @@ public class DocumentProcessor {
         ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

         words.append(IndexBlock.Meta, tagWords);
-        words.append(IndexBlock.Words, tagWords);
     }

     private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -208,13 +245,12 @@ public class DocumentProcessor {
             if (linkParser.shouldIndexLink(atag)) {
                 linkOpt.ifPresent(lp::accept);
             }
-            else if (linkOpt.isPresent()) {
-                if (linkParser.hasBinarySuffix(linkOpt.get().toString())) {
-                    linkOpt.ifPresent(lp::acceptNonIndexable);
+            else {
+                linkOpt
+                        .filter(url -> linkParser.hasBinarySuffix(url.path.toLowerCase()))
+                        .ifPresent(lp::acceptNonIndexable);
             }
         }

-        }
         for (var frame : doc.getElementsByTag("frame")) {
             linkParser.parseFrame(baseUrl, frame).ifPresent(lp::accept);
         }
@@ -233,26 +269,24 @@ public class DocumentProcessor {
             linkTerms.add("links:"+fd.toString().toLowerCase());
             linkTerms.add("links:"+fd.getDomain().toLowerCase());
         }

         words.append(IndexBlock.Meta, linkTerms);

         Set<String> fileKeywords = new HashSet<>(100);
         for (var link : lp.getNonIndexableUrls()) {

-            if (!Objects.equals(domain, link.domain)) {
+            if (!domain.hasSameTopDomain(link.domain)) {
                 continue;
             }

             synthesizeFilenameKeyword(fileKeywords, link);

         }

         words.append(IndexBlock.Artifacts, fileKeywords);

     }

     private void synthesizeFilenameKeyword(Set<String> fileKeywords, EdgeUrl link) {

         Path pFilename = Path.of(link.path.toLowerCase()).getFileName();

         if (pFilename == null) return;
@@ -289,10 +323,6 @@ public class DocumentProcessor {
         return htmlStandard;
     }

-    private EdgePageWordSet getWords(DocumentLanguageData dld) {
-        return keywordExtractor.extractKeywords(dld);
-    }

     private String getDescription(Document doc) {
         return summaryExtractor.extractSummary(doc);
     }
@@ -1,23 +1,29 @@
 package nu.marginalia.wmsa.edge.converting.processor;

+import com.google.common.base.Strings;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
-import nu.marginalia.wmsa.edge.converting.model.ProcessedDocument;
 import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
+import nu.marginalia.wmsa.edge.converting.processor.logic.CommonKeywordExtractor;
+import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDomainStatus;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
-import nu.marginalia.wmsa.edge.model.crawl.EdgeUrlState;

-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
+import java.util.*;

+import static nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus.BAD_CANONICAL;

 public class DomainProcessor {
+    private static final CommonKeywordExtractor commonKeywordExtractor = new CommonKeywordExtractor();

     private final DocumentProcessor documentProcessor;
     private final Double minAvgDocumentQuality;

     @Inject
     public DomainProcessor(DocumentProcessor documentProcessor,
                            @Named("min-avg-document-quality") Double minAvgDocumentQuality
@@ -39,10 +45,39 @@ public class DomainProcessor {
         if (crawledDomain.doc != null) {
             ret.documents = new ArrayList<>(crawledDomain.doc.size());

+            fixBadCanonicalTags(crawledDomain.doc);

+            DocumentDisqualifier disqualifier = new DocumentDisqualifier();
             for (var doc : crawledDomain.doc) {
+                if (disqualifier.isQualified()) {
                     var processedDoc = documentProcessor.process(doc, crawledDomain);

                     if (processedDoc.url != null) {
                         ret.documents.add(processedDoc);
+                        processedDoc.quality().ifPresent(disqualifier::offer);
+                    }
+                    else if ("LANGUAGE".equals(processedDoc.stateReason)) {
+                        disqualifier.offer(-100);
+                    }
+                }
+                else { // Short-circuit processing if quality is too low
+                    var stub = documentProcessor.makeDisqualifiedStub(doc);
+                    if (stub.url != null) {
+                        ret.documents.add(stub);
+                    }
+                }
+            }

+            Set<String> commonSiteWords = new HashSet<>(10);

+            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Tfidf_Top, IndexBlock.Subjects));
+            commonSiteWords.addAll(commonKeywordExtractor.getCommonSiteWords(ret, IndexBlock.Title));

+            if (!commonSiteWords.isEmpty()) {
+                for (var doc : ret.documents) {
+                    if (doc.words != null) {
+                        doc.words.get(IndexBlock.Site).addAll(commonSiteWords);
+                    }
                 }
             }
         }
@@ -50,30 +85,60 @@ public class DomainProcessor {
             ret.documents = Collections.emptyList();
         }

-        double averageQuality = getAverageQuality(ret.documents);
-        if (averageQuality < minAvgDocumentQuality) {
-            ret.documents.forEach(doc -> doc.state = EdgeUrlState.DISQUALIFIED);
-        }

         ret.state = getState(crawledDomain.crawlerStatus);

         return ret;
     }

-    private double getAverageQuality(List<ProcessedDocument> documents) {
-        int n = 0;
-        double q = 0.;
-        for (var doc : documents) {
-            if (doc.quality().isPresent()) {
-                n++;
-                q += doc.quality().getAsDouble();
+    private void fixBadCanonicalTags(List<CrawledDocument> docs) {
+        Map<String, Set<String>> seenCanonicals = new HashMap<>();
+        Set<String> seenUrls = new HashSet<>();

+        // Sometimes sites set a blanket canonical link to their root page
+        // this removes such links from consideration

+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl) && !Objects.equals(document.canonicalUrl, document.url)) {
+                seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
+            }
+            seenUrls.add(document.url);
+        }

+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl)
+                    && !Objects.equals(document.canonicalUrl, document.url)
+                    && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {

+                if (seenUrls.add(document.canonicalUrl)) {
+                    document.canonicalUrl = document.url;
+                }
+                else {
+                    document.crawlerStatus = BAD_CANONICAL.name();
+                }
             }
         }

-        if (n > 0) {
-            return q / n;
+        for (var document : docs) {
+            if (!Strings.isNullOrEmpty(document.canonicalUrl)
+                    && !Objects.equals(document.canonicalUrl, document.url)
+                    && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
+                document.canonicalUrl = document.url;
+            }
+        }

+        // Ignore canonical URL if it points to a different domain
+        // ... this confuses the hell out of the loader
+        for (var document : docs) {
+            if (Strings.isNullOrEmpty(document.canonicalUrl))
+                continue;

+            Optional<EdgeUrl> cUrl = EdgeUrl.parse(document.canonicalUrl);
+            Optional<EdgeUrl> dUrl = EdgeUrl.parse(document.url);

+            if (cUrl.isPresent() && dUrl.isPresent() && !Objects.equals(cUrl.get().domain, dUrl.get().domain)) {
+                document.canonicalUrl = document.url;
+            }
         }
-        return -5.;
     }

     private EdgeDomainIndexingState getState(String crawlerStatus) {
@@ -84,4 +149,20 @@ public class DomainProcessor {
             default -> EdgeDomainIndexingState.ERROR;
         };
     }

+    class DocumentDisqualifier {
+        int count;
+        int goodCount;

+        void offer(double quality) {
+            count++;
+            if (quality > minAvgDocumentQuality) {
+                goodCount++;
+            }
+        }

+        boolean isQualified() {
+            return count < 25 || goodCount*10 >= count;
+        }
+    }
 }
|
@ -26,7 +26,6 @@ public class InstructionsCompiler {
|
|||||||
}
|
}
|
||||||
if (domain.redirect != null) {
|
if (domain.redirect != null) {
|
||||||
compileRedirect(ret, domain.domain, domain.redirect);
|
compileRedirect(ret, domain.domain, domain.redirect);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
|
@@ -0,0 +1,71 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import ca.rmen.porterstemmer.PorterStemmer;
+import nu.marginalia.wmsa.edge.converting.model.ProcessedDomain;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+
+import java.util.*;
+
+public class CommonKeywordExtractor {
+    private final PorterStemmer ps = new PorterStemmer();
+
+    private static final int MIN_REQUIRED_DOCUMENTS = 25;
+
+    private static final int REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION = 100;
+    private static final double QUALIFYING_PROPORTION_FOR_KEYWORD = .25;
+
+    private static final int MAX_SITE_KEYWORDS_TO_EXTRACT = 5;
+
+    public List<String> getCommonSiteWords(ProcessedDomain ret, IndexBlock... sourceBlocks) {
+
+        if (ret.documents.size() < MIN_REQUIRED_DOCUMENTS)
+            return Collections.emptyList();
+
+        final Map<String, String> wordToStemmedMemoized = new HashMap<>(ret.documents.size()*10);
+
+        final Map<String, Integer> topStemmedKeywordCount = new HashMap<>(ret.documents.size()*10);
+        final Map<String, Set<String>> stemmedToNonstemmedVariants = new HashMap<>(ret.documents.size()*10);
+
+        int qualifiedDocCount = 0;
+        for (var doc : ret.documents) {
+            if (doc.words == null)
+                continue;
+
+            qualifiedDocCount++;
+
+            for (var block : sourceBlocks) {
+                for (var word : doc.words.get(block).words) {
+                    String wordStemmed = wordToStemmedMemoized.computeIfAbsent(word, ps::stemWord);
+
+                    // Count by negative values to sort by Map.Entry.comparingByValue() in reverse
+                    topStemmedKeywordCount.merge(wordStemmed, -1, Integer::sum);
+
+                    stemmedToNonstemmedVariants.computeIfAbsent(wordStemmed, w -> new HashSet<>()).add(word);
+                }
+            }
+        }
+
+        int totalValue = 0;
+        for (int value : topStemmedKeywordCount.values()) {
+            totalValue += value;
+        }
+
+        if (totalValue > -REQUIRED_TOTAL_COUNT_FOR_CONSIDERATION)
+            return Collections.emptyList();
+
+        List<String> topWords = new ArrayList<>(MAX_SITE_KEYWORDS_TO_EXTRACT);
+
+        double qualifyingValue = -qualifiedDocCount * QUALIFYING_PROPORTION_FOR_KEYWORD;
+
+        topStemmedKeywordCount.entrySet().stream()
+                .filter(e -> e.getValue() < qualifyingValue)
+                .sorted(Map.Entry.comparingByValue())
+                .limit(MAX_SITE_KEYWORDS_TO_EXTRACT)
+                .forEach(e -> topWords.addAll(stemmedToNonstemmedVariants.get(e.getKey())));
+
+        return topWords;
+
+    }
+
+}
@@ -0,0 +1,105 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeFilter;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DomPruningFilter implements NodeFilter {
+
+    private final double pruneThreshold;
+
+    private final Map<Node, NodeData> data = new HashMap<>();
+    private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
+
+    public DomPruningFilter(double pruneThreshold) {
+        this.pruneThreshold = pruneThreshold;
+    }
+
+    @Override
+    public FilterResult head(Node node, int depth) {
+        return FilterResult.CONTINUE;
+    }
+
+    @Override
+    public FilterResult tail(Node node, int depth) {
+        final NodeData dataForNode;
+
+        if (node instanceof TextNode tn) {
+            dataForNode = new NodeData(depth, tn.text().length(), 0);
+        }
+        else if (isSignal(node)) {
+            dataForNode = new NodeData(depth, 0,0);
+            for (var childNode : node.childNodes()) {
+                dataForNode.add(data.getOrDefault(childNode, dummy));
+            }
+        }
+        else {
+            dataForNode = new NodeData(depth, 0,0);
+            for (var childNode : node.childNodes()) {
+                dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
+            }
+        }
+
+        data.put(node, dataForNode);
+
+        if (dataForNode.depth <= 1)
+            return FilterResult.CONTINUE;
+
+        if (dataForNode.signalNodeSize == 0)
+            return FilterResult.REMOVE;
+        if (dataForNode.noiseNodeSize > 0
+                && dataForNode.signalRate() < pruneThreshold
+                && dataForNode.treeSize > 3)
+            return FilterResult.REMOVE;
+
+        return FilterResult.CONTINUE;
+    }
+
+    public boolean isSignal(Node node) {
+
+        if (node instanceof Element e) {
+            if ("a".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("nav".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("footer".equalsIgnoreCase(e.tagName()))
+                return false;
+            if ("header".equalsIgnoreCase(e.tagName()))
+                return false;
+        }
+
+        return true;
+    }
+}
+
+class NodeData {
+    int signalNodeSize;
+    int noiseNodeSize;
+    int treeSize = 1;
+    int depth;
+
+    NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
+        this.depth = depth;
+        this.signalNodeSize = signalNodeSize;
+        this.noiseNodeSize = noiseNodeSize;
+    }
+
+    public void add(NodeData other) {
+        signalNodeSize += other.signalNodeSize;
+        noiseNodeSize += other.noiseNodeSize;
+        treeSize += other.treeSize;
+    }
+
+    public void addAsNoise(NodeData other) {
+        noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
+        treeSize += other.treeSize;
+    }
+
+    public double signalRate() {
+        return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
+    }
+}
@@ -2,7 +2,11 @@ package nu.marginalia.wmsa.edge.converting.processor.logic;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.util.language.processing.model.DocumentLanguageData;
 import nu.marginalia.wmsa.edge.converting.processor.logic.topic.AdblockSimulator;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.RecipeDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.TextileCraftDetector;
+import nu.marginalia.wmsa.edge.converting.processor.logic.topic.WoodworkingDetector;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
@@ -35,14 +39,20 @@ public class FeatureExtractor {
             "d31qbv1cthcecs.cloudfront.net",
             "linkedin.com");

-    private AdblockSimulator adblockSimulator;
+    private final AdblockSimulator adblockSimulator;
+    private final RecipeDetector recipeDetector;
+    private final TextileCraftDetector textileCraftDetector;
+    private final WoodworkingDetector woodworkingDetector;

     @Inject
-    public FeatureExtractor(AdblockSimulator adblockSimulator) {
+    public FeatureExtractor(AdblockSimulator adblockSimulator, RecipeDetector recipeDetector, TextileCraftDetector textileCraftDetector, WoodworkingDetector woodworkingDetector) {
         this.adblockSimulator = adblockSimulator;
+        this.recipeDetector = recipeDetector;
+        this.textileCraftDetector = textileCraftDetector;
+        this.woodworkingDetector = woodworkingDetector;
     }

-    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc) {
+    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
         final Set<HtmlFeature> features = new HashSet<>();

         final Elements scriptTags = doc.getElementsByTag("script");
@@ -81,9 +91,14 @@ public class FeatureExtractor {
             }
         }

-        if (!domain.cookies.isEmpty()) {
+        if (!domain.cookies.isEmpty())
             features.add(HtmlFeature.COOKIES);
-        }
+        if (recipeDetector.testP(dld) > 0.5)
+            features.add(HtmlFeature.CATEGORY_FOOD);
+        // these should be mutually exclusive
+        else if (woodworkingDetector.testP(dld) > 0.3 || textileCraftDetector.testP(dld) > 0.3)
+            features.add(HtmlFeature.CATEGORY_CRAFTS);

         return features;
     }
@@ -12,6 +12,10 @@ public enum HtmlFeature {
     CATEGORY_FOOD("category:food"),

     ADVERTISEMENT("special:ads"),

+    CATEGORY_CRAFTS("category:crafts"),

+    UNKNOWN("special:uncategorized")
     ;

     private final String keyword;
@@ -19,10 +19,14 @@ import java.util.regex.Pattern;

 public class LinkParser {
     private final Logger logger = LoggerFactory.getLogger(getClass());

     private final List<String> blockPrefixList = List.of(
             "mailto:", "javascript:", "tel:", "itpc:", "#", "file:");
-    private final List<String> blockSuffixList = List.of(
+    private final List<String> binarySuffixList = List.of(
             ".pdf", ".mp3", ".wmv", ".avi", ".zip", ".7z",
+            ".mpv", ".mp4", ".avi", ".mkv", ".tiff", ".dat", ".tar",
+            ".com", ".bat", ".sh",
             ".bin", ".exe", ".tar.gz", ".tar.bz2", ".xml", ".swf",
             ".wav", ".ogg", ".jpg", ".jpeg", ".png", ".gif", ".webp",
             ".webm", ".bmp", ".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx",
@@ -33,7 +37,7 @@ public class LinkParser {
         return Optional.of(l)
                 .filter(this::shouldIndexLink)
                 .map(this::getUrl)
-                .map(link -> resolveUrl(relativeBaseUrl, link))
+                .map(link -> resolveRelativeUrl(relativeBaseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -44,7 +48,7 @@ public class LinkParser {
     public Optional<EdgeUrl> parseLinkPermissive(EdgeUrl relativeBaseUrl, Element l) {
         return Optional.of(l)
                 .map(this::getUrl)
-                .map(link -> resolveUrl(relativeBaseUrl, link))
+                .map(link -> resolveRelativeUrl(relativeBaseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -74,7 +78,7 @@ public class LinkParser {
     @Contract(pure=true)
     public Optional<EdgeUrl> parseLink(EdgeUrl baseUrl, String str) {
         return Optional.of(str)
-                .map(link -> resolveUrl(baseUrl, link))
+                .map(link -> resolveRelativeUrl(baseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -85,7 +89,7 @@ public class LinkParser {
     public Optional<EdgeUrl> parseFrame(EdgeUrl baseUrl, Element frame) {
         return Optional.of(frame)
                 .map(l -> l.attr("src"))
-                .map(link -> resolveUrl(baseUrl, link))
+                .map(link -> resolveRelativeUrl(baseUrl, link))
                 .flatMap(this::createURI)
                 .map(URI::normalize)
                 .map(this::renormalize)
@@ -95,10 +99,10 @@ public class LinkParser {
     @SneakyThrows
     private URI renormalize(URI uri) {
         if (uri.getPath() == null) {
-            return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getFragment()));
+            return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
         }
         if (uri.getPath().startsWith("/../")) {
-            return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getFragment()));
+            return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
         }
         return uri;
     }
@@ -117,10 +121,10 @@ public class LinkParser {
     private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");

     @SneakyThrows
-    private String resolveUrl(EdgeUrl baseUrl, String s) {
+    private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {

         // url looks like http://www.marginalia.nu/
-        if (isAbsoluteDomain(s)) {
+        if (doesUrlStringHaveProtocol(s)) {
             return s;
         }

@@ -154,8 +158,15 @@ public class LinkParser {
         return url.path.substring(0, lastSlash+1);
     }

-    private boolean isAbsoluteDomain(String s) {
-        return s.matches("^[a-zA-Z]+:.*$");
+    private boolean doesUrlStringHaveProtocol(String s) {
+        int i = 0;
+        for (; i < s.length(); i++) {
+            if (!Character.isAlphabetic(s.charAt(i)))
+                break;
+        }
+        if (i == 0 || i == s.length())
+            return false;
+        return ':' == s.charAt(i);
     }

     public boolean shouldIndexLink(Element link) {
@@ -168,26 +179,29 @@ public class LinkParser {
         return !"noindex".equalsIgnoreCase(rel);
     }

-    public boolean hasBinarySuffix(String href) {
-        return blockSuffixList.stream().anyMatch(href::endsWith);
-    }

     private boolean isUrlRelevant(String href) {
         if (null == href || "".equals(href)) {
             return false;
         }
+        if (href.length() > 128) {
+            return false;
+        }
+        href = href.toLowerCase();

         if (blockPrefixList.stream().anyMatch(href::startsWith)) {
             return false;
         }
         if (hasBinarySuffix(href)) {
             return false;
         }
-        if (href.length() > 128) {
-            return false;
-        }
         return true;
     }

+    public boolean hasBinarySuffix(String str) {
+        return binarySuffixList.stream().anyMatch(str::endsWith);
+    }

     @Nullable
     public EdgeUrl getBaseLink(Document parsed, EdgeUrl documentUrl) {
         var baseTags = parsed.getElementsByTag("base");
@@ -196,7 +210,7 @@ public class LinkParser {
         for (var tag : baseTags) {
             String href = tag.attr("href");
             if (!Strings.isNullOrEmpty(href)) {
-                return new EdgeUrl(resolveUrl(documentUrl, href));
+                return new EdgeUrl(resolveRelativeUrl(documentUrl, href));
             }
         }
     }
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -14,6 +15,7 @@ public class RecipeDetector {
|
|||||||
|
|
||||||
private final Map<String, Double> termValues = new HashMap<>();
|
private final Map<String, Double> termValues = new HashMap<>();
|
||||||
|
|
||||||
|
@Inject
|
||||||
public RecipeDetector() {
|
public RecipeDetector() {
|
||||||
PorterStemmer ps = new PorterStemmer();
|
PorterStemmer ps = new PorterStemmer();
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -14,6 +15,7 @@ public class TextileCraftDetector {
|
|||||||
|
|
||||||
private final Map<String, Double> termValues = new HashMap<>();
|
private final Map<String, Double> termValues = new HashMap<>();
|
||||||
|
|
||||||
|
@Inject
|
||||||
public TextileCraftDetector() {
|
public TextileCraftDetector() {
|
||||||
PorterStemmer ps = new PorterStemmer();
|
PorterStemmer ps = new PorterStemmer();
|
||||||
|
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
package nu.marginalia.wmsa.edge.converting.processor.logic.topic;
|
||||||
|
|
||||||
import ca.rmen.porterstemmer.PorterStemmer;
|
import ca.rmen.porterstemmer.PorterStemmer;
|
||||||
|
import com.google.inject.Inject;
|
||||||
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
import nu.marginalia.util.language.processing.model.DocumentLanguageData;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@ -14,6 +15,7 @@ public class WoodworkingDetector {
|
|||||||
|
|
||||||
private final Map<String, Double> termValues = new HashMap<>();
|
private final Map<String, Double> termValues = new HashMap<>();
|
||||||
|
|
||||||
|
@Inject
|
||||||
public WoodworkingDetector() {
|
public WoodworkingDetector() {
|
||||||
PorterStemmer ps = new PorterStemmer();
|
PorterStemmer ps = new PorterStemmer();
|
||||||
|
|
||||||
|
@@ -4,8 +4,8 @@ import com.github.luben.zstd.ZstdOutputStream;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
@@ -84,7 +84,7 @@ public class CrawlJobExtractorMain {
         Driver driver = new Driver();
         var outFile = Path.of(args[0]);

-        Gson gson = new GsonBuilder().create();
+        Gson gson = GsonFactory.get();
         String[] targetDomains = Arrays.stream(args).skip(1).toArray(String[]::new);

@@ -103,7 +103,7 @@ public class CrawlJobExtractorMain {
     }

     public static void writeSpec(Path outFile, String domain, List<String> urls) throws IOException {
-        Gson gson = new GsonBuilder().create();
+        Gson gson = GsonFactory.get();

         try (var out = new PrintWriter(new ZstdOutputStream(new BufferedOutputStream(new FileOutputStream(outFile.toFile()))))) {
             var job = new CrawlingSpecification();
@@ -4,15 +4,15 @@ import com.github.luben.zstd.ZstdOutputStream;
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
 import com.zaxxer.hikari.HikariDataSource;
+import nu.marginalia.util.ranking.BetterReversePageRank;
 import nu.marginalia.util.ranking.RankingDomainFetcher;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.configuration.module.DatabaseModule;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklistImpl;
-import nu.marginalia.util.ranking.BetterReversePageRank;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
 import org.mariadb.jdbc.Driver;

 import java.io.BufferedOutputStream;
@@ -23,7 +23,7 @@ import java.nio.file.Path;
 import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
-import java.util.*;
+import java.util.ArrayList;

 public class CrawlJobExtractorPageRankMain {

@@ -72,7 +72,7 @@ public class CrawlJobExtractorPageRankMain {
         Driver driver = new Driver();
         var outFile = Path.of(args[0]);

-        Gson gson = new GsonBuilder().create();
+        Gson gson = GsonFactory.get();

         var ds = new DatabaseModule().provideConnection();
         var domains = new RankingDomainFetcher(ds, new EdgeDomainBlacklistImpl(ds));
@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
 
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDomain;
 
@@ -13,9 +13,13 @@ import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.TimeUnit;
 
 public class CrawledDomainReader {
-    private final Gson gson = new GsonBuilder().create();
+    private final Gson gson = GsonFactory.get();
 
+    private final ForkJoinPool pool = new ForkJoinPool(4);
+
     public CrawledDomainReader() {
     }
@@ -43,7 +47,12 @@ public class CrawledDomainReader {
                 if (line.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                     domain = gson.fromJson(nextLine, CrawledDomain.class);
                 } else if (line.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
-                    docs.add(gson.fromJson(nextLine, CrawledDocument.class));
+                    pool.execute(() -> {
+                        var doc = gson.fromJson(nextLine, CrawledDocument.class);
+                        synchronized (docs) {
+                            docs.add(doc);
+                        }
+                    });
                 }
             } else if (line.charAt(0) == '{') {
                 domain = gson.fromJson(line, CrawledDomain.class);
@@ -52,6 +61,8 @@ public class CrawledDomainReader {
             }
         }
 
+        pool.awaitQuiescence(10, TimeUnit.SECONDS);
+
         if (domain == null) {
             return null;
         }
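The two CrawledDomainReader hunks above parallelize document deserialization: each CrawledDocument JSON line is parsed on a small ForkJoinPool, results are appended under a lock, and the reader waits for the pool to go quiescent before returning. A condensed sketch of the pattern, with field names as in the diff and the rest (method wrapper, documentLines) purely illustrative; imports for ForkJoinPool, TimeUnit, ArrayList and Gson are assumed:

    // Condensed sketch of the new read path, not the literal method from the file.
    static List<CrawledDocument> readDocuments(List<String> documentLines, Gson gson) {
        ForkJoinPool pool = new ForkJoinPool(4);
        List<CrawledDocument> docs = new ArrayList<>();

        for (String json : documentLines) {
            pool.execute(() -> {
                var doc = gson.fromJson(json, CrawledDocument.class);
                synchronized (docs) {                   // ArrayList is not thread-safe by itself
                    docs.add(doc);
                }
            });
        }

        pool.awaitQuiescence(10, TimeUnit.SECONDS);      // returns false on timeout instead of throwing
        return docs;
    }

Worth noting: awaitQuiescence does not raise an error if the ten seconds elapse, so an extremely slow parse would yield a silently truncated document list.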
@@ -2,7 +2,7 @@ package nu.marginalia.wmsa.edge.crawling;
 
 import com.github.luben.zstd.ZstdOutputStream;
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.SerializableCrawlData;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -16,7 +16,7 @@ import java.nio.file.Path;
 
 public class CrawledDomainWriter implements AutoCloseable {
     private final Path outputDir;
-    private final Gson gson = new GsonBuilder().create();
+    private final Gson gson = GsonFactory.get();
     private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class);
     private final Writer writer;
    private final Path outputFile;
@@ -2,16 +2,19 @@ package nu.marginalia.wmsa.edge.crawling;
 
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
-import com.google.gson.GsonBuilder;
+import nu.marginalia.wmsa.client.GsonFactory;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlingSpecification;
 import org.apache.logging.log4j.util.Strings;
 
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.function.Consumer;
 
 public class CrawlerSpecificationLoader {
-    private final static Gson gson = new GsonBuilder().create();
+    private final static Gson gson = GsonFactory.get();
 
     public static void readInputSpec(Path inputSpec, Consumer<CrawlingSpecification> consumer) {
         try (var inputStream = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(inputSpec.toFile()))))) {
@@ -1,7 +1,5 @@
 package nu.marginalia.wmsa.edge.crawling.blocklist;
 
-import com.google.common.cache.Cache;
-import com.google.common.cache.CacheBuilder;
 import com.google.inject.Singleton;
 import com.opencsv.CSVReader;
 import com.opencsv.exceptions.CsvValidationException;
@@ -13,10 +11,7 @@ import org.slf4j.LoggerFactory;
 
 import java.io.FileReader;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.net.InetAddress;
-import java.nio.charset.StandardCharsets;
-import java.util.Objects;
 import java.util.Set;
 import java.util.TreeMap;
 
@@ -6,5 +6,7 @@ public enum CrawlerDocumentStatus {
     BAD_CHARSET,
     REDIRECT,
     ROBOTS_TXT,
-    ERROR
+    ERROR,
+    BAD_CANONICAL,
+    Timeout
 }
@@ -3,8 +3,9 @@ package nu.marginalia.wmsa.edge.data.dao;
 import com.google.inject.ImplementedBy;
 import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
+import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection;
 import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
 import nu.marginalia.wmsa.edge.search.model.BrowseResult;
 
@@ -18,9 +19,9 @@ public interface EdgeDataStoreDao {
 
     List<BrowseResult> getRandomDomains(int count, EdgeDomainBlacklist backlist, int set);
 
-    List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlId);
+    List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlId);
 
-    List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids);
+    List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids);
 
     EdgeDomain getDomain(EdgeId<EdgeDomain> id);
 
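EdgeIdCollection<EdgeUrl> replaces List<EdgeId<EdgeUrl>> in the DAO interface. The new type is not shown in this diff; judging from the call sites below (isEmpty(), and values() yielding plain ints), it is a thin wrapper over primitive ids that avoids allocating one EdgeId object per entry. A plausible shape, stated purely as an assumption:

    // Hypothetical sketch inferred from usage in EdgeDataStoreDaoImpl; not the actual definition.
    public interface EdgeIdCollection<T> {
        int[] values();                                       // raw ids, no boxing
        default int size()        { return values().length; }
        default boolean isEmpty() { return values().length == 0; }
    }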
@@ -1,5 +1,6 @@
 package nu.marginalia.wmsa.edge.data.dao;
 
+import com.google.common.base.Strings;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.google.common.util.concurrent.UncheckedExecutionException;
|
|||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainIndexingState;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeIdCollection;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgePageScoreAdjustment;
|
import nu.marginalia.wmsa.edge.model.search.EdgePageScoreAdjustment;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
import nu.marginalia.wmsa.edge.model.search.EdgeUrlDetails;
|
||||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
@ -63,17 +65,17 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private <T> String idList(List<EdgeId<T>> ids) {
|
private <T> String idList(EdgeIdCollection<EdgeUrl> ids) {
|
||||||
StringJoiner j = new StringJoiner(",", "(", ")");
|
StringJoiner j = new StringJoiner(",", "(", ")");
|
||||||
for (var id : ids) {
|
for (var id : ids.values()) {
|
||||||
j.add(Integer.toString(id.id()));
|
j.add(Integer.toString(id));
|
||||||
}
|
}
|
||||||
return j.toString();
|
return j.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
@Override
|
@Override
|
||||||
public List<EdgeUrlDetails> getUrlDetailsMulti(List<EdgeId<EdgeUrl>> ids) {
|
public List<EdgeUrlDetails> getUrlDetailsMulti(EdgeIdCollection<EdgeUrl> ids) {
|
||||||
if (ids.isEmpty()) {
|
if (ids.isEmpty()) {
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
}
|
}
|
||||||
@ -110,12 +112,14 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
rsp.getInt(11), // dataHash
|
rsp.getInt(11), // dataHash
|
||||||
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
|
EdgePageScoreAdjustment.zero(), // urlQualityAdjustment
|
||||||
Integer.MAX_VALUE, // rankingId
|
Integer.MAX_VALUE, // rankingId
|
||||||
Double.MAX_VALUE, // termScore
|
Double.MAX_VALUE // termScore
|
||||||
0 // queryLength
|
|
||||||
);
|
);
|
||||||
if (val.urlQuality >= QUALITY_LOWER_BOUND_CUTOFF) {
|
if (val.urlQuality <= QUALITY_LOWER_BOUND_CUTOFF
|
||||||
result.add(val);
|
&& Strings.isNullOrEmpty(val.description)
|
||||||
|
&& val.url.path.length() > 1) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
result.add(val);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -267,7 +271,7 @@ public class EdgeDataStoreDaoImpl implements EdgeDataStoreDao {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<BrowseResult> getBrowseResultFromUrlIds(List<EdgeId<EdgeUrl>> urlIds) {
|
public List<BrowseResult> getBrowseResultFromUrlIds(EdgeIdCollection<EdgeUrl> urlIds) {
|
||||||
if (urlIds.isEmpty())
|
if (urlIds.isEmpty())
|
||||||
return Collections.emptyList();
|
return Collections.emptyList();
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.data.dao.task;
|
|||||||
import com.google.inject.ImplementedBy;
|
import com.google.inject.ImplementedBy;
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
import gnu.trove.set.hash.TIntHashSet;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
|
||||||
@ImplementedBy(EdgeDomainBlacklistImpl.class)
|
@ImplementedBy(EdgeDomainBlacklistImpl.class)
|
||||||
public interface EdgeDomainBlacklist {
|
public interface EdgeDomainBlacklist {
|
||||||
|
@ -9,7 +9,7 @@ import nu.marginalia.wmsa.configuration.server.Service;
|
|||||||
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
import nu.marginalia.wmsa.edge.assistant.screenshot.ScreenshotService;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
import nu.marginalia.wmsa.renderer.mustache.MustacheRenderer;
|
||||||
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
import nu.marginalia.wmsa.renderer.mustache.RendererFactory;
|
||||||
|
@ -3,7 +3,7 @@ package nu.marginalia.wmsa.edge.dating;
|
|||||||
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
import nu.marginalia.wmsa.edge.data.dao.EdgeDataStoreDao;
|
||||||
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
import nu.marginalia.wmsa.edge.data.dao.task.EdgeDomainBlacklist;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
import nu.marginalia.wmsa.edge.search.model.BrowseResult;
|
||||||
|
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
|
@ -4,12 +4,14 @@ import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriter;
|
|||||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexReader;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.Query;
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
|
||||||
|
import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.concurrent.locks.Lock;
|
import java.util.concurrent.locks.Lock;
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
import java.util.concurrent.locks.ReadWriteLock;
|
||||||
@ -104,47 +106,49 @@ public class EdgeIndexBucket {
|
|||||||
return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
return indexReader.findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
||||||
}
|
}
|
||||||
|
|
||||||
public LongStream getQuery(IndexBlock block, LongPredicate filter, IndexSearchBudget budget, EdgeIndexSearchTerms searchTerms) {
|
public IndexQuery getQuery(IndexQueryCachePool cachePool, IndexBlock block, LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
||||||
if (null == indexReader) {
|
if (null == indexReader) {
|
||||||
logger.warn("Index reader not neady {}", block);
|
logger.warn("Index reader not neady {}", block);
|
||||||
return LongStream.empty();
|
return new IndexQuery(Collections.emptyList());
|
||||||
}
|
}
|
||||||
|
|
||||||
var orderedIncludes = searchTerms.includes
|
final int[] orderedIncludes = searchTerms.includes
|
||||||
.stream()
|
.stream()
|
||||||
.sorted(Comparator.comparingLong(i -> indexReader.numHits(block, i)))
|
.sorted(Comparator.comparingLong(i -> indexReader.numHits(cachePool, block, i)))
|
||||||
.distinct()
|
.distinct()
|
||||||
.mapToInt(Integer::intValue)
|
.mapToInt(Integer::intValue)
|
||||||
.toArray();
|
.toArray();
|
||||||
|
|
||||||
Query query;
|
IndexQueryFactory.IndexQueryBuilder query;
|
||||||
|
|
||||||
if (orderedIncludes.length == 1) {
|
query = indexReader.findWord(cachePool, block, orderedIncludes[0]);
|
||||||
query = indexReader.findUnderspecified(block, budget, filter, orderedIncludes[0]);
|
if (query == null) {
|
||||||
|
return new IndexQuery(Collections.emptyList());
|
||||||
}
|
}
|
||||||
else {
|
|
||||||
query = indexReader.findWord(block, budget, filter, orderedIncludes[0]);
|
query.filter(filter);
|
||||||
}
|
|
||||||
int i;
|
for (int i = 1; i < orderedIncludes.length; i++) {
|
||||||
for (i = 1; (i < 3 && i < orderedIncludes.length) || i < orderedIncludes.length-1; i++) {
|
|
||||||
query = query.alsoCached(orderedIncludes[i]);
|
|
||||||
}
|
|
||||||
for (; i < orderedIncludes.length; i++) {
|
|
||||||
query = query.also(orderedIncludes[i]);
|
query = query.also(orderedIncludes[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int term : searchTerms.excludes) {
|
for (int term : searchTerms.excludes) {
|
||||||
query = query.not(term);
|
query = query.not(term);
|
||||||
}
|
}
|
||||||
|
|
||||||
return query.stream();
|
for (int term : orderedIncludes) {
|
||||||
|
query.prioritize(term);
|
||||||
|
}
|
||||||
|
|
||||||
|
return query.build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public IndexBlock getTermScore(int termId, long urlId) {
|
public IndexBlock getTermScore(IndexQueryCachePool cachePool, int termId, long urlId) {
|
||||||
return indexReader.getBlockForResult(termId, urlId);
|
return indexReader.getBlockForResult(cachePool, termId, urlId);
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isTermInBucket(IndexBlock block, int termId, long urlId) {
|
public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int termId, long urlId) {
|
||||||
return indexReader.isTermInBucket(block, termId, urlId);
|
return indexReader.isTermInBucket(cachePool, block, termId, urlId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
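The getQuery rewrite above swaps the old LongStream pipeline for a builder obtained from SearchIndexReader: the include terms are still ordered by ascending hit count so the rarest term drives the initial lookup, but filtering, conjunction, exclusion and prioritization are now expressed on an IndexQueryFactory.IndexQueryBuilder and only materialized by build(). Condensed flow, with method names as they appear in the diff and the comments as interpretation rather than documented behaviour:

    var query = indexReader.findWord(cachePool, block, orderedIncludes[0]);  // rarest term first
    if (query == null) return new IndexQuery(Collections.emptyList());

    query.filter(filter);                            // caller-supplied LongPredicate over result ids
    for (int i = 1; i < orderedIncludes.length; i++)
        query = query.also(orderedIncludes[i]);      // require every remaining include term
    for (int term : searchTerms.excludes)
        query = query.not(term);                     // reject documents containing excluded terms
    for (int term : orderedIncludes)
        query.prioritize(term);                      // presumably surfaces hits for these terms earlier
    return query.build();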
@ -1,76 +1,31 @@
|
|||||||
package nu.marginalia.wmsa.edge.index;
|
package nu.marginalia.wmsa.edge.index;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import com.google.gson.GsonBuilder;
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.name.Named;
|
import com.google.inject.name.Named;
|
||||||
import gnu.trove.map.TLongIntMap;
|
|
||||||
import gnu.trove.map.hash.TIntIntHashMap;
|
|
||||||
import gnu.trove.map.hash.TLongIntHashMap;
|
|
||||||
import gnu.trove.set.hash.TIntHashSet;
|
|
||||||
import io.prometheus.client.Counter;
|
|
||||||
import io.prometheus.client.Histogram;
|
|
||||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||||
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
|
import nu.marginalia.wmsa.client.GsonFactory;
|
||||||
import nu.marginalia.util.ListChunker;
|
|
||||||
import nu.marginalia.util.dict.DictionaryHashMap;
|
|
||||||
import nu.marginalia.wmsa.configuration.server.Initialization;
|
import nu.marginalia.wmsa.configuration.server.Initialization;
|
||||||
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
import nu.marginalia.wmsa.configuration.server.MetricsServer;
|
||||||
import nu.marginalia.wmsa.configuration.server.Service;
|
import nu.marginalia.wmsa.configuration.server.Service;
|
||||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
|
||||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
|
||||||
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
|
||||||
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
|
||||||
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
|
|
||||||
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
|
|
||||||
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
|
|
||||||
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
|
||||||
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexLexiconService;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexOpsService;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
import nu.marginalia.wmsa.edge.index.svc.EdgeIndexQueryService;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.*;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
|
||||||
import org.apache.http.HttpStatus;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import spark.HaltException;
|
|
||||||
import spark.Request;
|
import spark.Request;
|
||||||
import spark.Response;
|
import spark.Response;
|
||||||
import spark.Spark;
|
import spark.Spark;
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.function.LongPredicate;
|
|
||||||
import java.util.stream.LongStream;
|
|
||||||
|
|
||||||
import static spark.Spark.get;
|
import static spark.Spark.get;
|
||||||
import static spark.Spark.halt;
|
|
||||||
|
|
||||||
public class EdgeIndexService extends Service {
|
public class EdgeIndexService extends Service {
|
||||||
private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
private final Initialization init;
|
private final Initialization init;
|
||||||
private final SearchIndexes indexes;
|
private final SearchIndexes indexes;
|
||||||
private final KeywordLexicon keywordLexicon;
|
|
||||||
|
|
||||||
private final Gson gson = new GsonBuilder()
|
|
||||||
.registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
|
|
||||||
.create();
|
|
||||||
|
|
||||||
private static final Histogram wmsa_edge_index_query_time
|
|
||||||
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
|
|
||||||
private static final Counter wmsa_edge_index_query_count
|
|
||||||
= Counter.build().name("wmsa_edge_index_query_count").help("-").register();
|
|
||||||
private static final Histogram wmsa_edge_index_put_words_time
|
|
||||||
= Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
|
|
||||||
|
|
||||||
public static final int DYNAMIC_BUCKET_LENGTH = 7;
|
public static final int DYNAMIC_BUCKET_LENGTH = 7;
|
||||||
|
|
||||||
@ -81,71 +36,34 @@ public class EdgeIndexService extends Service {
|
|||||||
Initialization init,
|
Initialization init,
|
||||||
MetricsServer metricsServer,
|
MetricsServer metricsServer,
|
||||||
SearchIndexes indexes,
|
SearchIndexes indexes,
|
||||||
IndexServicesFactory servicesFactory) {
|
|
||||||
|
EdgeIndexOpsService opsService,
|
||||||
|
EdgeIndexLexiconService lexiconService,
|
||||||
|
EdgeIndexQueryService indexQueryService)
|
||||||
|
{
|
||||||
super(ip, port, init, metricsServer);
|
super(ip, port, init, metricsServer);
|
||||||
|
|
||||||
|
final Gson gson = GsonFactory.get();
|
||||||
|
|
||||||
this.init = init;
|
this.init = init;
|
||||||
this.indexes = indexes;
|
this.indexes = indexes;
|
||||||
this.keywordLexicon = servicesFactory.getKeywordLexicon();
|
|
||||||
|
|
||||||
Spark.post("/words/", this::putWords);
|
Spark.post("/words/", lexiconService::putWords);
|
||||||
Spark.post("/search/", this::search, gson::toJson);
|
|
||||||
Spark.post("/search-domain/", this::searchDomain, gson::toJson);
|
|
||||||
|
|
||||||
Spark.post("/dictionary/*", this::getWordId, gson::toJson);
|
Spark.post("/search/", indexQueryService::search, gson::toJson);
|
||||||
|
Spark.post("/search-domain/", indexQueryService::searchDomain, gson::toJson);
|
||||||
|
|
||||||
Spark.post("/ops/repartition", this::repartitionEndpoint);
|
Spark.get("/dictionary/*", lexiconService::getWordId, gson::toJson);
|
||||||
Spark.post("/ops/preconvert", this::preconvertEndpoint);
|
|
||||||
Spark.post("/ops/reindex/:id", this::reindexEndpoint);
|
Spark.post("/ops/repartition", opsService::repartitionEndpoint);
|
||||||
|
Spark.post("/ops/preconvert", opsService::preconvertEndpoint);
|
||||||
|
Spark.post("/ops/reindex/:id", opsService::reindexEndpoint);
|
||||||
|
|
||||||
get("/is-blocked", this::isBlocked, gson::toJson);
|
get("/is-blocked", this::isBlocked, gson::toJson);
|
||||||
|
|
||||||
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
|
Schedulers.newThread().scheduleDirect(this::initialize, 1, TimeUnit.MICROSECONDS);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object getWordId(Request request, Response response) {
|
|
||||||
final String word = request.splat()[0];
|
|
||||||
|
|
||||||
var dr = indexes.getDictionaryReader();
|
|
||||||
if (null == dr) {
|
|
||||||
response.status(HttpStatus.SC_FAILED_DEPENDENCY);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
final int wordId = dr.get(word);
|
|
||||||
|
|
||||||
if (DictionaryHashMap.NO_VALUE == wordId) {
|
|
||||||
response.status(404);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
return wordId;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object repartitionEndpoint(Request request, Response response) {
|
|
||||||
|
|
||||||
if (!indexes.repartition()) {
|
|
||||||
Spark.halt(503, "Operations busy");
|
|
||||||
}
|
|
||||||
return "OK";
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object preconvertEndpoint(Request request, Response response) {
|
|
||||||
if (!indexes.preconvert()) {
|
|
||||||
Spark.halt(503, "Operations busy");
|
|
||||||
}
|
|
||||||
return "OK";
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object reindexEndpoint(Request request, Response response) {
|
|
||||||
int id = Integer.parseInt(request.params("id"));
|
|
||||||
|
|
||||||
if (!indexes.reindex(id)) {
|
|
||||||
Spark.halt(503, "Operations busy");
|
|
||||||
}
|
|
||||||
return "OK";
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object isBlocked(Request request, Response response) {
|
private Object isBlocked(Request request, Response response) {
|
||||||
return indexes.isBusy() || !initialized;
|
return indexes.isBusy() || !initialized;
|
||||||
}
|
}
|
||||||
@ -162,352 +80,7 @@ public class EdgeIndexService extends Service {
|
|||||||
indexes.initialize(init);
|
indexes.initialize(init);
|
||||||
}
|
}
|
||||||
|
|
||||||
private Object putWords(Request request, Response response) {
|
|
||||||
var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
|
|
||||||
|
|
||||||
synchronized (this) {
|
|
||||||
putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
|
|
||||||
putWordsRequest.wordSet, putWordsRequest.getIndex());
|
|
||||||
}
|
|
||||||
|
|
||||||
response.status(HttpStatus.SC_ACCEPTED);
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
|
|
||||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
|
||||||
EdgePageWordSet wordSet, int idx
|
|
||||||
) {
|
|
||||||
|
|
||||||
wmsa_edge_index_put_words_time.time(() -> {
|
|
||||||
for (EdgePageWords words : wordSet.values()) {
|
|
||||||
putWords(domainId, urlId, words, idx);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
|
|
||||||
EdgePageWords words, int idx
|
|
||||||
) {
|
|
||||||
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
|
|
||||||
|
|
||||||
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
|
|
||||||
|
|
||||||
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
|
||||||
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
|
|
||||||
|
|
||||||
indexWriter.put(header, entry);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
private long[] getOrInsertWordIds(List<String> words) {
|
|
||||||
return words.stream()
|
|
||||||
.filter(w -> w.getBytes().length < Byte.MAX_VALUE)
|
|
||||||
.mapToLong(keywordLexicon::getOrInsert)
|
|
||||||
.toArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object searchDomain(Request request, Response response) {
|
|
||||||
if (indexes.getDictionaryReader() == null) {
|
|
||||||
logger.warn("Dictionary reader not yet initialized");
|
|
||||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
|
||||||
}
|
|
||||||
|
|
||||||
String json = request.body();
|
|
||||||
EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
|
|
||||||
|
|
||||||
final int wordId = keywordLexicon.getReadOnly(specsSet.keyword);
|
|
||||||
|
|
||||||
List<EdgeId<EdgeUrl>> urlIds = indexes
|
|
||||||
.getBucket(specsSet.bucket)
|
|
||||||
.findHotDomainsForKeyword(specsSet.block, wordId, specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
|
|
||||||
.mapToObj(lv -> new EdgeId<EdgeUrl>((int)(lv & 0xFFFF_FFFFL)))
|
|
||||||
.toList();
|
|
||||||
|
|
||||||
return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
|
|
||||||
}
|
|
||||||
|
|
||||||
private Object search(Request request, Response response) {
|
|
||||||
if (indexes.getDictionaryReader() == null) {
|
|
||||||
logger.warn("Dictionary reader not yet initialized");
|
|
||||||
halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
|
|
||||||
}
|
|
||||||
|
|
||||||
String json = request.body();
|
|
||||||
EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
try {
|
|
||||||
if (specsSet.isStagger()) {
|
|
||||||
return new EdgeSearchResultSet(searchStaggered(specsSet));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return new EdgeSearchResultSet(searchStraight(specsSet));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (HaltException ex) {
|
|
||||||
logger.warn("Halt", ex);
|
|
||||||
throw ex;
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
|
|
||||||
logger.info("Error", ex);
|
|
||||||
Spark.halt(500, "Error");
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
|
|
||||||
wmsa_edge_index_query_count.inc();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Map<IndexBlock, List<EdgeSearchResults>> searchStaggered(EdgeSearchSpecification specsSet) {
|
|
||||||
int count = 0;
|
|
||||||
|
|
||||||
final Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
|
|
||||||
final TIntHashSet seenResults = new TIntHashSet();
|
|
||||||
|
|
||||||
final DomainResultCountFilter[] domainCountFilter = new DomainResultCountFilter[] {
|
|
||||||
new DomainResultCountFilter(specsSet.limitByDomain),
|
|
||||||
new DomainResultCountFilter(specsSet.limitByDomain)
|
|
||||||
};
|
|
||||||
|
|
||||||
final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
|
||||||
final TIntIntHashMap limitsPerBucketRemaining = new TIntIntHashMap(6, 0.7f, 0, specsSet.limitByBucket);
|
|
||||||
|
|
||||||
for (int i = 0; i < specsSet.buckets.size(); i+=2) {
|
|
||||||
for (var sq : specsSet.subqueries) {
|
|
||||||
for (int j = 0; j < 2 && i + j < specsSet.buckets.size(); j++) {
|
|
||||||
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
|
|
||||||
|
|
||||||
if (searchTerms.isEmpty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
var result = performSearch(searchTerms.get(),
|
|
||||||
budget,
|
|
||||||
seenResults,
|
|
||||||
domainCountFilter[j],
|
|
||||||
sq,
|
|
||||||
List.of(specsSet.buckets.get(i+j)),
|
|
||||||
specsSet,
|
|
||||||
Math.min(limitsPerBucketRemaining.get(i+j), specsSet.limitTotal - count)
|
|
||||||
);
|
|
||||||
|
|
||||||
if (logger.isDebugEnabled()) {
|
|
||||||
logger.debug("{} -> {} {} {}", sq.block, specsSet.buckets.get(i+j), sq.searchTermsInclude, result.results.values().stream().mapToInt(List::size).sum());
|
|
||||||
}
|
|
||||||
|
|
||||||
int sz = result.size();
|
|
||||||
count += sz;
|
|
||||||
limitsPerBucketRemaining.adjustOrPutValue(i+j, -sz, specsSet.limitByBucket-sz);
|
|
||||||
|
|
||||||
if (sz > 0) {
|
|
||||||
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
@NotNull
|
|
||||||
private Map<IndexBlock, List<EdgeSearchResults>> searchStraight(EdgeSearchSpecification specsSet) {
|
|
||||||
Map<IndexBlock, List<EdgeSearchResults>> results = new HashMap<>();
|
|
||||||
int count = 0;
|
|
||||||
TIntHashSet seenResults = new TIntHashSet();
|
|
||||||
|
|
||||||
final DomainResultCountFilter domainCountFilter = new DomainResultCountFilter(specsSet.limitByDomain);
|
|
||||||
|
|
||||||
IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
|
|
||||||
for (var sq : specsSet.subqueries) {
|
|
||||||
Optional<EdgeIndexSearchTerms> searchTerms = getSearchTerms(sq);
|
|
||||||
|
|
||||||
if (searchTerms.isEmpty())
|
|
||||||
continue;
|
|
||||||
|
|
||||||
var result = performSearch(searchTerms.get(),
|
|
||||||
budget, seenResults, domainCountFilter,
|
|
||||||
sq, specsSet.buckets, specsSet,
|
|
||||||
specsSet.limitTotal - count);
|
|
||||||
|
|
||||||
if (logger.isDebugEnabled()) {
|
|
||||||
logger.debug("{} -> {} {}", sq.block, sq.searchTermsInclude, result.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
count += result.size();
|
|
||||||
if (result.size() > 0) {
|
|
||||||
results.computeIfAbsent(sq.block, s -> new ArrayList<>()).add(result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return results;
|
|
||||||
}
|
|
||||||
|
|
||||||
private EdgeSearchResults performSearch(EdgeIndexSearchTerms searchTerms,
|
|
||||||
IndexSearchBudget budget,
|
|
||||||
TIntHashSet seenResults,
|
|
||||||
DomainResultCountFilter domainCountFilter,
|
|
||||||
EdgeSearchSubquery sq,
|
|
||||||
List<Integer> specBuckets,
|
|
||||||
EdgeSearchSpecification specs,
|
|
||||||
int limit)
|
|
||||||
{
|
|
||||||
if (limit <= 0) {
|
|
||||||
return new EdgeSearchResults();
|
|
||||||
}
|
|
||||||
|
|
||||||
final Map<Integer, List<EdgeSearchResultItem>> results = new HashMap<>();
|
|
||||||
final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain);
|
|
||||||
|
|
||||||
for (int i : specBuckets) {
|
|
||||||
int foundResultsCount = results.values().stream().mapToInt(List::size).sum();
|
|
||||||
|
|
||||||
if (foundResultsCount >= specs.limitTotal || foundResultsCount >= limit)
|
|
||||||
break;
|
|
||||||
|
|
||||||
List<EdgeSearchResultItem> resultsForBucket = new ArrayList<>(specs.limitByBucket);
|
|
||||||
|
|
||||||
getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms)
|
|
||||||
.mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id))
|
|
||||||
.filter(ri -> !seenResults.contains(ri.url.id()) && localFilter.test(i, domainCountFilter, ri))
|
|
||||||
.limit(specs.limitTotal * 3L)
|
|
||||||
.distinct()
|
|
||||||
.limit(Math.min(specs.limitByBucket
|
|
||||||
- results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount))
|
|
||||||
.forEach(resultsForBucket::add);
|
|
||||||
|
|
||||||
|
|
||||||
for (var result : resultsForBucket) {
|
|
||||||
seenResults.add(result.url.id());
|
|
||||||
}
|
|
||||||
for (var result : resultsForBucket) {
|
|
||||||
for (var searchTerm : sq.searchTermsInclude) {
|
|
||||||
result.scores.add(getSearchTermScore(i, searchTerm, result.getCombinedId()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
domainCountFilter.addAll(i, resultsForBucket);
|
|
||||||
|
|
||||||
if (!resultsForBucket.isEmpty()) {
|
|
||||||
results.put(i, resultsForBucket);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new EdgeSearchResults(results);
|
|
||||||
}
|
|
||||||
|
|
||||||
private EdgeSearchResultKeywordScore getSearchTermScore(int bucketId, String term, long urlId) {
|
|
||||||
final int termId = indexes.getDictionaryReader().get(term);
|
|
||||||
|
|
||||||
var bucket = indexes.getBucket(bucketId);
|
|
||||||
|
|
||||||
return new EdgeSearchResultKeywordScore(term,
|
|
||||||
bucket.getTermScore(termId, urlId),
|
|
||||||
bucket.isTermInBucket(IndexBlock.Title, termId, urlId),
|
|
||||||
bucket.isTermInBucket(IndexBlock.Link, termId, urlId)
|
|
||||||
);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
|
|
||||||
int queryDepth, int minHitCount, int maxResults) {
|
|
||||||
if (!indexes.isValidBucket(bucket)) {
|
|
||||||
logger.warn("Invalid bucket {}", bucket);
|
|
||||||
return LongStream.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
|
|
||||||
}
|
|
||||||
|
|
||||||
private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
|
|
||||||
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
|
|
||||||
if (!indexes.isValidBucket(bucket)) {
|
|
||||||
logger.warn("Invalid bucket {}", bucket);
|
|
||||||
return LongStream.empty();
|
|
||||||
}
|
|
||||||
return indexes.getBucket(bucket).getQuery(block, filter, budget, searchTerms);
|
|
||||||
}
|
|
||||||
|
|
||||||
static class DomainResultCountFilter {
|
|
||||||
final TLongIntMap resultsByDomain = new TLongIntHashMap(200, 0.75f, -1, 0);
|
|
||||||
final int limitByDomain;
|
|
||||||
|
|
||||||
DomainResultCountFilter(int limitByDomain) {
|
|
||||||
this.limitByDomain = limitByDomain;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean filterRawValue(int bucket, long value) {
|
|
||||||
var domain = new EdgeId<EdgeDomain>((int)(value >>> 32));
|
|
||||||
|
|
||||||
if (domain.id() == Integer.MAX_VALUE) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return resultsByDomain.get(getKey(bucket, domain)) <= limitByDomain;
|
|
||||||
}
|
|
||||||
|
|
||||||
long getKey(int bucket, EdgeId<EdgeDomain> id) {
|
|
||||||
return ((long)bucket) << 32 | id.id();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean test(int bucket, EdgeSearchResultItem item) {
|
|
||||||
if (item.domain.id() == Integer.MAX_VALUE) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
return resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
|
|
||||||
}
|
|
||||||
|
|
||||||
int getCount(int bucket, EdgeSearchResultItem item) {
|
|
||||||
return resultsByDomain.get(getKey(bucket, item.domain));
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addAll(int bucket, List<EdgeSearchResultItem> items) {
|
|
||||||
items.forEach(item -> {
|
|
||||||
resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean test(int bucket, DomainResultCountFilter root, EdgeSearchResultItem item) {
|
|
||||||
if (item.domain.id() == Integer.MAX_VALUE) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
return root.getCount(bucket, item) + resultsByDomain.adjustOrPutValue(getKey(bucket, item.domain), 1, 1) <= limitByDomain;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Optional<EdgeIndexSearchTerms> getSearchTerms(EdgeSearchSubquery request) {
|
|
||||||
final List<Integer> excludes = new ArrayList<>();
|
|
||||||
final List<Integer> includes = new ArrayList<>();
|
|
||||||
|
|
||||||
for (var include : request.searchTermsInclude) {
|
|
||||||
var word = lookUpWord(include);
|
|
||||||
if (word.isEmpty()) {
|
|
||||||
logger.debug("Unknown search term: " + include);
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
includes.add(word.getAsInt());
|
|
||||||
}
|
|
||||||
|
|
||||||
for (var exclude : request.searchTermsExclude) {
|
|
||||||
lookUpWord(exclude).ifPresent(excludes::add);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (includes.isEmpty()) {
|
|
||||||
return Optional.empty();
|
|
||||||
}
|
|
||||||
|
|
||||||
return Optional.of(new EdgeIndexSearchTerms(includes, excludes));
|
|
||||||
}
|
|
||||||
|
|
||||||
private OptionalInt lookUpWord(String s) {
|
|
||||||
int ret = indexes.getDictionaryReader().get(s);
|
|
||||||
if (ret == DictionaryHashMap.NO_VALUE) {
|
|
||||||
return OptionalInt.empty();
|
|
||||||
}
|
|
||||||
return OptionalInt.of(ret);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,67 +1,65 @@
|
|||||||
package nu.marginalia.wmsa.edge.index.client;
|
package nu.marginalia.wmsa.edge.index.client;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
|
||||||
import com.google.gson.GsonBuilder;
|
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import io.prometheus.client.Summary;
|
||||||
import io.reactivex.rxjava3.core.Observable;
|
import io.reactivex.rxjava3.core.Observable;
|
||||||
import io.reactivex.rxjava3.schedulers.Schedulers;
|
import io.reactivex.rxjava3.schedulers.Schedulers;
|
||||||
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
import nu.marginalia.wmsa.client.AbstractDynamicClient;
|
||||||
import nu.marginalia.wmsa.client.HttpStatusCode;
|
|
||||||
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
|
||||||
import nu.marginalia.wmsa.configuration.server.Context;
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeId;
|
|
||||||
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
|
||||||
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
|
||||||
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
|
||||||
import org.slf4j.Logger;
|
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import javax.annotation.CheckReturnValue;
|
import javax.annotation.CheckReturnValue;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class EdgeIndexClient extends AbstractDynamicClient {
|
public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient {
|
||||||
private final Gson gson = new GsonBuilder()
|
|
||||||
.create();
|
private static final Summary wmsa_search_index_api_time = Summary.build().name("wmsa_search_index_api_time").help("-").register();
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
public EdgeIndexClient() {
|
public EdgeIndexClient() {
|
||||||
super(ServiceDescriptor.EDGE_INDEX);
|
super(ServiceDescriptor.EDGE_INDEX);
|
||||||
setTimeout(30);
|
setTimeout(30);
|
||||||
}
|
}
|
||||||
|
|
||||||
@CheckReturnValue
|
@Override
|
||||||
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
|
public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
|
||||||
EdgePageWordSet wordSet, int writer
|
DocumentKeywords wordSet, int writer
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);
|
|
||||||
|
|
||||||
return this.post(ctx, "/words/", request);
|
var keywordBuilder =
|
||||||
|
IndexPutKeywordsReq.newBuilder()
|
||||||
|
.setDomain(domain.id())
|
||||||
|
.setUrl(url.id())
|
||||||
|
.setIndex(writer);
|
||||||
|
|
||||||
|
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
|
||||||
|
wordSetBuilder.setIndex(wordSet.block().ordinal());
|
||||||
|
wordSetBuilder.addAllWords(List.of(wordSet.keywords()));
|
||||||
|
keywordBuilder.addWordSet(wordSetBuilder.build());
|
||||||
|
|
||||||
|
var req = keywordBuilder.build();
|
||||||
|
|
||||||
|
this.post(ctx, "/words/", req).blockingSubscribe();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@CheckReturnValue
|
@CheckReturnValue
|
||||||
public EdgeSearchResultSet query(Context ctx, EdgeSearchSpecification specs) {
|
public List<EdgeSearchResultItem> query(Context ctx, EdgeSearchSpecification specs) {
|
||||||
return this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst();
|
return wmsa_search_index_api_time.time(
|
||||||
}
|
() -> this.postGet(ctx, "/search/", specs, EdgeSearchResultSet.class).blockingFirst().getResults()
|
||||||
|
);
|
||||||
@CheckReturnValue
|
|
||||||
public List<EdgeSearchResultSet> multiQuery(Context ctx, EdgeSearchSpecification... specs) {
|
|
||||||
|
|
||||||
return Observable.fromArray(specs)
|
|
||||||
.concatMap(s -> postGet(ctx, "/search/", s, EdgeSearchResultSet.class)
|
|
||||||
.subscribeOn(Schedulers.io())
|
|
||||||
.timeout(1, TimeUnit.SECONDS)
|
|
||||||
.onErrorComplete())
|
|
||||||
.toList()
|
|
||||||
.blockingGet();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@CheckReturnValue
|
@CheckReturnValue
|
||||||
|
@ -0,0 +1,88 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.client;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import nu.marginalia.util.ListChunker;
|
||||||
|
import nu.marginalia.util.dict.DictionaryHashMap;
|
||||||
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||||
|
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
|
||||||
|
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
|
||||||
|
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
|
||||||
|
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
|
||||||
|
import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class EdgeIndexLocalService implements EdgeIndexWriterClient {
|
||||||
|
|
||||||
|
private final KeywordLexicon lexicon;
|
||||||
|
private final SearchIndexJournalWriterImpl indexWriter;
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(EdgeIndexLocalService.class);
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException {
|
||||||
|
long hashMapSize = 1L << 31;
|
||||||
|
|
||||||
|
if (Boolean.getBoolean("small-ram")) {
|
||||||
|
hashMapSize = 1L << 27;
|
||||||
|
}
|
||||||
|
|
||||||
|
var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile());
|
||||||
|
lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize));
|
||||||
|
indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
public void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
|
||||||
|
DocumentKeywords wordSet, int writer) {
|
||||||
|
if (wordSet.keywords().length == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
if (domain.id() <= 0 || url.id() <= 0) {
|
||||||
|
logger.warn("Bad ID: {}:{}", domain, url);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) {
|
||||||
|
|
||||||
|
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
|
||||||
|
var header = new SearchIndexJournalEntryHeader(domain, url, wordSet.block());
|
||||||
|
|
||||||
|
indexWriter.put(header, entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] getOrInsertWordIds(List<String> words) {
|
||||||
|
long[] ids = new long[words.size()];
|
||||||
|
int putId = 0;
|
||||||
|
|
||||||
|
for (String word : words) {
|
||||||
|
long id = lexicon.getOrInsert(word);
|
||||||
|
if (id != DictionaryHashMap.NO_VALUE) {
|
||||||
|
ids[putId++] = id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (putId != words.size()) {
|
||||||
|
ids = Arrays.copyOf(ids, putId);
|
||||||
|
}
|
||||||
|
return ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void close() throws Exception {
|
||||||
|
indexWriter.close();
|
||||||
|
lexicon.close();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,13 @@
|
|||||||
|
package nu.marginalia.wmsa.edge.index.client;
|
||||||
|
|
||||||
|
import nu.marginalia.wmsa.configuration.server.Context;
|
||||||
|
import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeDomain;
|
||||||
|
import nu.marginalia.wmsa.edge.model.EdgeUrl;
|
||||||
|
import nu.marginalia.wmsa.edge.model.id.EdgeId;
|
||||||
|
|
||||||
|
public interface EdgeIndexWriterClient extends AutoCloseable {
|
||||||
|
|
||||||
|
void putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
|
||||||
|
DocumentKeywords wordSets, int writer);
|
||||||
|
}
|
@ -15,10 +15,11 @@ import java.io.IOException;
|
|||||||
import java.io.RandomAccessFile;
|
import java.io.RandomAccessFile;
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.channels.FileChannel;
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.util.concurrent.LinkedBlockingQueue;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
 public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
-    private final KeywordLexicon dictionaryWriter;
+    private final KeywordLexicon lexicon;

     private final Logger logger = LoggerFactory.getLogger(getClass());

     private final Disposable writerTask;

@ -30,12 +31,14 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
     private long pos;

     @SneakyThrows
-    public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) {
+    public SearchIndexJournalWriterImpl(KeywordLexicon lexicon, File indexFile) {
-        this.dictionaryWriter = dictionaryWriter;
+        this.lexicon = lexicon;

         initializeIndexFile(indexFile);

         byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);

+        new Thread(this::journalWriterThread, "Journal Writer").start();
+
         writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
         Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
     }

@ -56,10 +59,27 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
         }
     }

+    private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
+    private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);

     @Override
     @SneakyThrows
-    public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
+    public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
+        writeQueue.put(new WriteJob(header, entryData));
+    }

+    @SneakyThrows
+    public void journalWriterThread() {
+
+        while (true) {
+            var job = writeQueue.take();
+
+            writeEntry(job.header, job.entryData);
+        }
+    }
+
+    private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
+
+        try {
             byteBuffer.clear();

             byteBuffer.putInt(entryData.size());

@ -75,6 +95,9 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
             channel.write(byteBuffer);

             writePositionMarker();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
     }

     @Override

@ -90,17 +113,15 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {

     @Override
     public void flushWords() {
-        dictionaryWriter.commitToDisk();
+        lexicon.commitToDisk();
     }

     private void writePositionMarker() throws IOException {
-        var lock = channel.lock(0, 16, false);
         pos = channel.size();
         raf.seek(0);
         raf.writeLong(pos);
-        raf.writeLong(dictionaryWriter.size());
+        raf.writeLong(lexicon.size());
         raf.seek(pos);
-        lock.release();
     }

     public synchronized void close() throws IOException {
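The change above moves journal writes off the calling thread: put() now only enqueues a WriteJob on a bounded LinkedBlockingQueue, and a dedicated "Journal Writer" thread drains the queue and performs the synchronized disk write. A minimal standalone sketch of the same producer/consumer pattern, with hypothetical names (QueueingWriter, drain and write are illustrative, not part of the patch):

    import java.util.concurrent.LinkedBlockingQueue;

    class QueueingWriter {
        // Bounded queue: producers block when the writer falls behind,
        // which applies back-pressure instead of growing memory without limit.
        private final LinkedBlockingQueue<String> queue = new LinkedBlockingQueue<>(512);

        QueueingWriter() {
            Thread writer = new Thread(this::drain, "writer");
            writer.setDaemon(true);
            writer.start();
        }

        void put(String entry) throws InterruptedException {
            queue.put(entry); // blocks if 512 entries are already pending
        }

        private void drain() {
            try {
                while (true) {
                    write(queue.take()); // blocks until work arrives
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }

        private void write(String entry) { /* flush to disk */ }
    }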
@ -2,15 +2,22 @@ package nu.marginalia.wmsa.edge.index.journal.model;

 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;

 public record SearchIndexJournalEntryHeader(int entrySize, long documentId, IndexBlock block) {

     public static final int HEADER_SIZE_LONGS = 2;

     public SearchIndexJournalEntryHeader( EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId, IndexBlock block) {
-        this(-1, (long) domainId.id() << 32 | urlId.id(), block);
+        this(-1, combineIds(domainId, urlId), block);
+    }
+
+    private static long combineIds(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId) {
+        long did = domainId.id();
+        long uid = urlId.id();
+
+        return (did << 32L) | uid;
     }

 }
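combineIds() above packs the domain id into the upper 32 bits of the combined document id and the url id into the lower 32 bits. A small illustration of the packing and the corresponding unpacking (the unpack expressions are illustrative, not part of the patch):

    long combined = (42L << 32L) | 7L;               // domainId = 42, urlId = 7
    int domainId  = (int) (combined >>> 32);          // 42
    int urlId     = (int) (combined & 0xFFFF_FFFFL);  // 7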
@ -53,7 +53,7 @@ public class KeywordLexicon implements AutoCloseable {
     @SneakyThrows
     private int getOrInsert(byte[] bytes) {
         if (bytes.length >= Byte.MAX_VALUE) {
-            logger.warn("getOrInsert({}), illegal length {}", bytes, bytes.length);
+            logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
             return DictionaryHashMap.NO_VALUE;
         }

@ -15,7 +15,7 @@ import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.function.Consumer;

-public class KeywordLexiconJournalFile {
+public class KeywordLexiconJournalFile implements AutoCloseable {
     private final RandomAccessFile journalFileRAF;
     private final File journalFile;
     private final Logger logger = LoggerFactory.getLogger(getClass());
@ -9,4 +9,8 @@ import java.util.List;
 public class EdgeIndexSearchTerms {
     public List<Integer> includes = new ArrayList<>();
     public List<Integer> excludes = new ArrayList<>();
+
+    public boolean isEmpty() {
+        return includes.isEmpty();
+    }
 }
@ -4,17 +4,17 @@ import lombok.AllArgsConstructor;
 import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeId;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;

 @AllArgsConstructor @Getter
 @ToString
 public class EdgePutWordsRequest {
-    public final EdgeId<EdgeDomain> domainId;
+    public EdgeId<EdgeDomain> domainId;
-    public final EdgeId<EdgeUrl> urlId;
+    public EdgeId<EdgeUrl> urlId;
-    public final double quality;
+    public double quality;

-    public final EdgePageWordSet wordSet;
+    public EdgePageWordSet wordSet;
     private int index = 0;
 }
@ -1,23 +1,36 @@
 package nu.marginalia.wmsa.edge.index.model;

 public enum IndexBlock {
-    TitleKeywords(0, 0),
+    TitleKeywords(IndexBlockType.QUALITY_SIGNAL, 0, 0),
-    Title(1, 1),
+    Title(IndexBlockType.QUALITY_SIGNAL, 1, 1),
-    Link(2, 1.25),
-    Top(3, 2),
-    Middle(4, 3),
-    Low(5, 4),
-    Words(6, 6),
-    Meta(7, 7),
-    PositionWords(8, 4.5),
-    NamesWords(9, 5),
-    Artifacts(10, 10),
-    Topic(11, 0.5);

+    Link(IndexBlockType.QUALITY_SIGNAL, 2, 1.15),
+
+    Subjects(IndexBlockType.QUALITY_SIGNAL, 3, 1.0),
+    NamesWords(IndexBlockType.QUALITY_SIGNAL, 4, 3.0),
+
+    Artifacts(IndexBlockType.PAGE_DATA, 5, 10),
+    Meta(IndexBlockType.PAGE_DATA, 6, 7),
+
+    Tfidf_Top(IndexBlockType.TF_IDF, 7, 1.5),
+    Tfidf_Middle(IndexBlockType.TF_IDF, 8, 2),
+    Tfidf_Lower(IndexBlockType.TF_IDF, 9, 3.5),
+
+    Words_1(IndexBlockType.PAGE_DATA, 10, 2.0),
+    Words_2(IndexBlockType.PAGE_DATA,11, 3.5),
+    Words_4(IndexBlockType.PAGE_DATA,12, 4.0),
+    Words_8(IndexBlockType.PAGE_DATA,13, 4.5),
+    Words_16Plus(IndexBlockType.PAGE_DATA,14, 7.0),
+
+    Site(IndexBlockType.QUALITY_SIGNAL, 15, 1.2)
+    ;
+
+    public final IndexBlockType type;
     public final int id;
     public final double sortOrder;

-    IndexBlock(int id, double sortOrder) {
+    IndexBlock(IndexBlockType type, int id, double sortOrder) {
+        this.type = type;
         this.sortOrder = sortOrder;
         this.id = id;
     }

@ -31,3 +44,5 @@ public enum IndexBlock {
         throw new IllegalArgumentException("Bad block id");
     }
 }
+
+
@ -0,0 +1,7 @@
+package nu.marginalia.wmsa.edge.index.model;
+
+public enum IndexBlockType {
+    QUALITY_SIGNAL,
+    TF_IDF,
+    PAGE_DATA
+}
@ -49,8 +49,8 @@ public class IndexWordsTable implements AutoCloseable {
     }

     public long positionForWord(int wordId) {

         long offset = reader.findEntry(header, wordId);

         if (offset < 0) {
             return -1L;
         }
@ -0,0 +1,43 @@
+package nu.marginalia.wmsa.edge.index.reader;
+
+import java.util.Arrays;
+
+public class MicroCache {
+    private final int[] keys;
+    private final long[] data;
+    private int pos = 0;
+
+    public int hit;
+    public int miss;
+    public int full;
+
+    public static final long BAD_VALUE = Long.MIN_VALUE;
+
+    public MicroCache(int size) {
+        keys = new int[size];
+        data = new long[size];
+
+        Arrays.fill(data, BAD_VALUE);
+    }
+
+    public long get(int key) {
+        for (int i = 0; i < keys.length && data[i] != BAD_VALUE; i++) {
+            if (keys[i] == key) {
+                hit++;
+                return data[i];
+            }
+        }
+        miss++;
+        return BAD_VALUE;
+    }
+
+    public void set(int key, long val) {
+        keys[pos] = key;
+        data[pos] = val;
+
+        if (++pos >= keys.length) {
+            full++;
+            pos = 0;
+        }
+    }
+}
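MicroCache above is a small fixed-size lookup cache: set() fills slots round-robin and overwrites the oldest entry once full, while get() scans the key array linearly and stops at the first unused slot, counting hits and misses. A short usage sketch (the values are made up):

    MicroCache cache = new MicroCache(8);   // room for 8 key/value pairs
    cache.set(1234, 99L);
    long found   = cache.get(1234);         // 99
    long missing = cache.get(5678);         // MicroCache.BAD_VALUE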
@ -9,6 +9,9 @@ import nu.marginalia.util.btree.CachingBTreeReader;
 import nu.marginalia.util.btree.model.BTreeHeader;
 import nu.marginalia.util.multimap.MultimapFileLong;
 import nu.marginalia.wmsa.edge.index.conversion.SearchIndexConverter;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
+import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -22,6 +25,7 @@ public class SearchIndex implements AutoCloseable {

     private final MultimapFileLong urls;
     private final IndexWordsTable words;
+    public final String name;
     private final RandomAccessFile wordsFile;
     private final BTreeReader bTreeReader;
     private final CachingBTreeReader cachingBTreeReader;

@ -36,6 +40,7 @@ public class SearchIndex implements AutoCloseable {
             throws IOException {

         logger = LoggerFactory.getLogger(name);
+        this.name = name;
         wordsFile = new RandomAccessFile(inWords, "r");

         logger.info("{} : Loading {}", name, inUrls);

@ -65,26 +70,37 @@ public class SearchIndex implements AutoCloseable {
     }


-    public long numUrls(int wordId) {
+    public long numUrls(IndexQueryCachePool pool, int wordId) {
         int length = words.wordLength(wordId);
         if (length < 0) return 0;
         if (length > 0) return length;

-        return rangeForWord(wordId).numEntries();
+        return rangeForWord(pool, wordId).numEntries();
     }

-    public UrlIndexTree rangeForWord(int wordId) {
-        return new UrlIndexTree(words.positionForWord(wordId));
+    public IndexBTreeRange rangeForWord(IndexQueryCachePool pool, int wordId) {
+        IndexBTreeRange range = pool.getRange(words, wordId);
+
+        if (range == null) {
+            range = new IndexBTreeRange(words.positionForWord(wordId));
+            pool.cacheRange(words, wordId, range);
+        }
+
+        return range;
+    }
+
+    public IndexBTreeRange rangeForWord(int wordId) {
+        return new IndexBTreeRange(words.positionForWord(wordId));
     }

-    public class UrlIndexTree {
-        final long dataOffset;
+    public class IndexBTreeRange {
+        public final long dataOffset;
         private BTreeHeader header;
-        public UrlIndexTree(long dataOffset) {
+        public IndexBTreeRange(long dataOffset) {
             this.dataOffset = dataOffset;
         }

-        public LongStream stream() {
+        public LongStream stream(int bufferSize) {
             if (dataOffset < 0) {
                 return LongStream.empty();
             }

@ -94,7 +110,7 @@ public class SearchIndex implements AutoCloseable {

             long urlOffset = header.dataOffsetLongs();
             long endOffset = header.dataOffsetLongs() + header.numEntries();
-            int stepSize = Math.min(1024, header.numEntries());
+            int stepSize = Math.min(bufferSize, header.numEntries());

             long[] buffer = new long[stepSize];

@ -107,6 +123,19 @@ public class SearchIndex implements AutoCloseable {
             });
         }

+        public EntrySource asEntrySource() {
+            return new AsEntrySource();
+        }
+
+        public QueryFilterStepIf asExcludeFilterStep(IndexQueryCachePool pool) {
+            return new AsExcludeQueryFilterStep(pool);
+        }
+
+
+        public LongStream stream() {
+            return stream(1024);
+        }
+
         public boolean isPresent() {
             return dataOffset >= 0;
         }

@ -122,34 +151,94 @@ public class SearchIndex implements AutoCloseable {
             }
         }

-        public boolean hasUrl(long url) {
-            if (header != null) {
-                return bTreeReader.findEntry(header, url) >= 0;
-            }
-            else if (dataOffset < 0) return false;
-            else {
-                header = bTreeReader.getHeader(dataOffset);
-                return bTreeReader.findEntry(header, url) >= 0;
-            }
-        }
-
-        public boolean hasUrl(CachingBTreeReader.Cache cache, long url) {
-            if (header != null) {
-                return cachingBTreeReader.findEntry(header, cache, url) >= 0;
-            }
-            else if (dataOffset < 0) return false;
-            else {
-                header = bTreeReader.getHeader(dataOffset);
-                return cachingBTreeReader.findEntry(header, cache, url) >= 0;
-            }
-        }
-
-        public CachingBTreeReader.Cache createIndexCache() {
-            return cachingBTreeReader.prepareCache();
-        }
+        public boolean hasUrl(CachingBTreeReader.BTreeCachedIndex cache, long url) {
+            if (dataOffset < 0) return false;
+
+            return cachingBTreeReader.findEntry(cache, url) >= 0;
+        }
+
+        public boolean hasUrl(IndexQueryCachePool pool, long url) {
+            if (dataOffset < 0)
+                return false;
+
+            CachingBTreeReader.BTreeCachedIndex cache = pool.getIndexCache(SearchIndex.this, this);
+
+            return cachingBTreeReader.findEntry(cache, url) >= 0;
+        }
+
+        public CachingBTreeReader.BTreeCachedIndex createIndexCache() {
+            if (dataOffset < 0)
+                return null;
+
+            if (header == null) {
+                header = cachingBTreeReader.getHeader(dataOffset);
+            }
+
+            return cachingBTreeReader.prepareCache(header);
+        }
+
+        class AsEntrySource implements EntrySource {
+            long pos;
+            final long endOffset;
+
+            public SearchIndex getIndex() {
+                return SearchIndex.this;
+            };
+
+            public AsEntrySource() {
+                if (dataOffset <= 0) {
+                    pos = -1;
+                    endOffset = -1;
+                    return;
+                }
+
+                if (header == null) {
                     header = bTreeReader.getHeader(dataOffset);
+                }
+
+                pos = header.dataOffsetLongs();
+                endOffset = header.dataOffsetLongs() + header.numEntries();
+            }
+
+            @Override
+            public int read(long[] buffer, int n) {
+                if (pos >= endOffset) {
+                    return 0;
+                }
+
+                int rb = Math.min(n, (int)(endOffset - pos));
+                urls.read(buffer, rb, pos);
+                pos += rb;
+                return rb;
+            }
+        }
+
+        class AsExcludeQueryFilterStep implements QueryFilterStepIf {
+            private final CachingBTreeReader.BTreeCachedIndex cache;
+
+            public AsExcludeQueryFilterStep(IndexQueryCachePool pool) {
+                cache = pool.getIndexCache(SearchIndex.this, IndexBTreeRange.this);
+            }
+
+            public SearchIndex getIndex() {
+                return SearchIndex.this;
+            };
+            public double cost() {
+                return cache.getIndexedDataSize();
+            }
+
+            @Override
+            public boolean test(long value) {
+                return !hasUrl(cache, value);
+            }
+
+            public String describe() {
+                return "Exclude["+name+"]";
+            }
+        }
     }

     @Override
     public void close() throws Exception {
@ -3,9 +3,8 @@ package nu.marginalia.wmsa.edge.index.reader;
 import com.google.inject.Inject;
 import lombok.SneakyThrows;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexQueryBuilder;
-import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
-import nu.marginalia.wmsa.edge.index.reader.query.Query;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@ -19,18 +18,20 @@ import java.util.stream.Stream;
 public class SearchIndexReader implements AutoCloseable {

     private final EnumMap<IndexBlock, SearchIndex> indices;
-    private final EnumMap<IndexBlock, IndexQueryBuilder> queryBuilders;
-    private final EnumMap<IndexBlock, IndexQueryBuilder> underspecifiedQueryBuilders;
+    private final EnumMap<IndexBlock, IndexQueryFactory> queryBuilders;

     private final Logger logger = LoggerFactory.getLogger(getClass());

     private static final IndexBlock[] indicesBySearchOrder = new IndexBlock[] {
-            IndexBlock.Top,
-            IndexBlock.Middle,
-            IndexBlock.Low,
-            IndexBlock.Words,
-            IndexBlock.NamesWords,
+            IndexBlock.Title,
+            IndexBlock.Tfidf_Top,
+            IndexBlock.Tfidf_Middle,
+            IndexBlock.Tfidf_Lower,
+            IndexBlock.Words_1,
+            IndexBlock.Words_2,
+            IndexBlock.Words_4,
+            IndexBlock.Words_8,
+            IndexBlock.Words_16Plus,
     };

     @Inject
@ -38,30 +39,33 @@ public class SearchIndexReader implements AutoCloseable {
                             EnumMap<IndexBlock, SearchIndex> indices) {
         this.indices = indices;

-        var lowIndex = indices.get(IndexBlock.Low);
+        var lowIndex = indices.get(IndexBlock.Tfidf_Lower);
-        var midIndex = indices.get(IndexBlock.Middle);
+        var midIndex = indices.get(IndexBlock.Tfidf_Middle);
-        var topIndex = indices.get(IndexBlock.Top);
+        var topIndex = indices.get(IndexBlock.Tfidf_Top);
         var linkIndex = indices.get(IndexBlock.Link);
         var titleIndex = indices.get(IndexBlock.Title);
-        var namesIndex = indices.get(IndexBlock.NamesWords);
+        var siteIndex = indices.get(IndexBlock.Site);
-        var positionIndex = indices.get(IndexBlock.PositionWords);
-        var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
-        var wordsIndex = indices.get(IndexBlock.Words);
         var metaIndex = indices.get(IndexBlock.Meta);
-        var topicIndex = indices.get(IndexBlock.Topic);
+        var topicIndex = indices.get(IndexBlock.Subjects);

+        var words1 = indices.get(IndexBlock.Words_1);
+        var words2 = indices.get(IndexBlock.Words_2);
+        var words4 = indices.get(IndexBlock.Words_4);
+        var words8 = indices.get(IndexBlock.Words_8);
+        var words16 = indices.get(IndexBlock.Words_16Plus);
+        var artifacts = indices.get(IndexBlock.Artifacts);
+
         queryBuilders = new EnumMap<>(IndexBlock.class);
-        underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

-        queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
-
-        underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
-        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
-        underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
+        List<SearchIndex> excludeIndices = listOfNonNulls(metaIndex, titleIndex, topIndex, midIndex, lowIndex, words1);
+        List<SearchIndex> priorityIndices = listOfNonNulls(titleIndex, linkIndex, siteIndex, topIndex, topicIndex);
+
+        queryBuilders.put(IndexBlock.Title, new IndexQueryFactory(listOfNonNulls(metaIndex, titleIndex, linkIndex), excludeIndices, priorityIndices));
+        queryBuilders.put(IndexBlock.Words_1, new IndexQueryFactory(listOfNonNulls(metaIndex, words1), excludeIndices, priorityIndices));
+        queryBuilders.put(IndexBlock.Words_2, new IndexQueryFactory(listOfNonNulls(metaIndex, words2), excludeIndices, priorityIndices));
+        queryBuilders.put(IndexBlock.Words_4, new IndexQueryFactory(listOfNonNulls(metaIndex, words4), excludeIndices, priorityIndices));
+        queryBuilders.put(IndexBlock.Words_8, new IndexQueryFactory(listOfNonNulls(metaIndex, words8), excludeIndices, priorityIndices));
+        queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryFactory(listOfNonNulls(metaIndex, words16, artifacts), excludeIndices, priorityIndices));
     }

     @SafeVarargs
@ -99,27 +103,13 @@ public class SearchIndexReader implements AutoCloseable {
                 .limit(maxResults);
     }

-    public Query findUnderspecified(
-            IndexBlock block,
-            IndexSearchBudget budget,
-            LongPredicate filter,
-            int wordId) {
-
-        var builder = underspecifiedQueryBuilders.get(block);
-
-        if (null != builder) {
-            return builder.buildUnderspecified(budget, filter, wordId);
-        }
-        return findWord(block, budget, filter, wordId);
-    }
-
-    public Query findWord(IndexBlock block, IndexSearchBudget budget, LongPredicate filter, int wordId) {
+    public IndexQueryFactory.IndexQueryBuilder findWord(IndexQueryCachePool cachePool, IndexBlock block, int wordId) {
         var builder = queryBuilders.get(block);

         if (builder == null)
-            return Query.EMPTY;
+            return null;

-        return builder.build(budget, filter, wordId);
+        return builder.buildQuery(cachePool, wordId);
     }

     @Override
@ -130,20 +120,20 @@ public class SearchIndexReader implements AutoCloseable {
     }

     @SneakyThrows
-    public long numHits(IndexBlock block, int word) {
+    public long numHits(IndexQueryCachePool pool, IndexBlock block, int word) {
-        IndexQueryBuilder builder = queryBuilders.get(block);
+        IndexQueryFactory builder = queryBuilders.get(block);

         if (builder == null)
             return 0L;

         long hits = 0;
         for (var index : builder.getIndicies()) {
-            hits += index.numUrls(word);
+            hits += index.numUrls(pool, word);
         }
         return hits;
     }

-    public IndexBlock getBlockForResult(int searchTerm, long urlId) {
+    public IndexBlock getBlockForResult(IndexQueryCachePool cachePool, int searchTerm, long urlId) {
         for (var block : indicesBySearchOrder) {
             var index = indices.get(block);

@ -151,21 +141,18 @@ public class SearchIndexReader implements AutoCloseable {
                 continue;
             }

-            var range = index.rangeForWord(searchTerm);
-
-            if (range.hasUrl(urlId)) {
+            if (cachePool.isUrlPresent(index, searchTerm, urlId))
                 return block;
-            }
-        }
-        return IndexBlock.Words;
+
+        }
+
+        return IndexBlock.Words_16Plus;
     }

-    public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {
+    public boolean isTermInBucket(IndexQueryCachePool cachePool, IndexBlock block, int searchTerm, long urlId) {
         final var index = indices.get(block);
         if (null == index) return false;

-        return index
-                .rangeForWord(searchTerm)
-                .hasUrl(urlId);
+        return cachePool.isUrlPresent(index, searchTerm, urlId);
     }
 }
@ -105,7 +105,7 @@ public class SearchIndexes {
     }

     @Nullable
-    public KeywordLexiconReadOnlyView getDictionaryReader() {
+    public KeywordLexiconReadOnlyView getLexiconReader() {
         return keywordLexiconReadOnlyView;
     }

@ -146,6 +146,7 @@ public class SearchIndexes {
     public EdgeIndexBucket getBucket(int bucketId) {
         return buckets[bucketId];
     }

     public boolean isValidBucket(int bucketId) {
         return bucketId >= 0 && bucketId < buckets.length;
     }
@ -1,151 +0,0 @@
-package nu.marginalia.wmsa.edge.index.reader.query;
-
-import com.google.common.collect.Streams;
-import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.Objects;
-import java.util.function.LongPredicate;
-import java.util.function.Supplier;
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-import java.util.stream.LongStream;
-
-public class IndexQueryBuilder {
-    private final List<SearchIndex> requiredIndices;
-    private final SearchIndex excludeIndex;
-
-    public Collection<SearchIndex> getIndicies() {
-        return requiredIndices;
-    }
-
-    public IndexQueryBuilder(List<SearchIndex> requiredIndices, SearchIndex excludeIndex) {
-        this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList());
-        this.excludeIndex = excludeIndex;
-    }
-
-    public Query build(IndexSearchBudget budget,
-                       LongPredicate filter,
-                       int wordId) {
-        return new QueryForIndices(budget, filter, wordId);
-    }
-
-    // Special treatment for queries with few terms, prefer hits that appear in multiple buckets
-    public Query buildUnderspecified(IndexSearchBudget budget, LongPredicate filter, int wordId) {
-
-        if (requiredIndices.size() == 1) {
-            return build(budget, filter, wordId);
-        }
-
-        var ranges = requiredIndices.stream().map(idx -> idx.rangeForWord(wordId)).toArray(SearchIndex.UrlIndexTree[]::new);
-        var relevantIndices = IntStream.range(0, requiredIndices.size()).filter(i -> ranges[i].isPresent()).toArray();
-
-        if (relevantIndices.length == 0) {
-            return new QueryForIndices(budget, LongStream::empty);
-        }
-        else if (relevantIndices.length == 1 || relevantIndices[0] != 0) {
-            return build(budget, filter, wordId);
-        }
-
-        var fstRange = requiredIndices.get(relevantIndices[0]).rangeForWord(wordId);
-
-        LongStream priorityStream = underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[0], wordId);
-        for (int i = 1; i < relevantIndices.length; i++) {
-            priorityStream = Streams.concat(priorityStream, underspecifiedPairStream(budget, 1000, relevantIndices[0], relevantIndices[i], wordId));
-        }
-        LongStream stream = LongStream.concat(priorityStream, fstRange.stream().takeWhile(budget::take)).filter(filter);
-
-        return new QueryForIndices(budget, () -> stream);
-    }
-
-    private LongStream underspecifiedPairStream(IndexSearchBudget budget, int limit, int firstIdx, int otherIdx, int wordId) {
-        SearchIndex firstTmp = requiredIndices.get(firstIdx),
-                secondTmp = requiredIndices.get(otherIdx);
-
-        final SearchIndex fst;
-        final SearchIndex snd;
-
-        if (firstTmp.numUrls(wordId) > secondTmp.numUrls(wordId)) {
-            fst = secondTmp;
-            snd = firstTmp;
-        }
-        else {
-            fst = firstTmp;
-            snd = secondTmp;
-        }
-
-        var sndRange = snd.rangeForWord(wordId);
-        var cache = sndRange.createIndexCache();
-
-        return fst.rangeForWord(wordId).stream().takeWhile(budget::take).limit(limit).filter(data -> sndRange.hasUrl(cache, data));
-    }
-
-
-    private class QueryForIndices implements Query {
-        private final Supplier<LongStream> supp;
-        private final IndexSearchBudget budget;
-
-        private QueryForIndices(IndexSearchBudget budget, LongPredicate filter, int wordId) {
-            this.budget = budget;
-            supp = () ->
-                requiredIndices.stream().flatMapToLong(idx -> {
-                    var range = idx.rangeForWord(wordId);
-                    return range.stream().takeWhile(budget::take);
-                })
-                .filter(filter);
-        }
-
-        private QueryForIndices(IndexSearchBudget budget, Supplier<LongStream> supp) {
-            this.budget = budget;
-            this.supp = supp;
-        }
-
-        @Override
-        public Query also(int wordId) {
-            return new QueryForIndices(budget,
-                    () -> requiredIndices.stream().flatMapToLong(idx -> alsoStream(idx, wordId)));
-        }
-
-        @Override
-        public Query alsoCached(int wordId) {
-            return new QueryForIndices(budget,
-                    () -> requiredIndices.stream().flatMapToLong(idx -> alsoStreamCached(idx, wordId)));
-        }
-
-        @Override
-        public Query not(int wordId) {
-            // Happens when an index simply isn't present, won't find data anyway
-            // so it's safe to no-op the query
-            if (excludeIndex == null)
-                return new QueryForIndices(budget, LongStream::empty);
-
-            return new QueryForIndices(budget, () -> notStream(wordId));
-        }
-
-        private LongStream alsoStream(SearchIndex idx, int wordId) {
-            var range = idx.rangeForWord(wordId);
-
-            return stream().filter(range::hasUrl).takeWhile(budget::take);
-        }
-
-        private LongStream alsoStreamCached(SearchIndex idx, int wordId) {
-            var range = idx.rangeForWord(wordId);
-            var cache = range.createIndexCache();
-
-            return stream().filter(data -> range.hasUrl(cache, data)).takeWhile(budget::take);
-        }
-
-        private LongStream notStream(int wordId) {
-            var bodyRange = excludeIndex.rangeForWord(wordId);
-            var cache = bodyRange.createIndexCache();
-
-            return stream().filter(url -> !bodyRange.hasUrl(cache, url)).takeWhile(budget::take);
-        }
-
-        public LongStream stream() {
-            return supp.get();
-        }
-    }
-}
@ -1,16 +0,0 @@
-package nu.marginalia.wmsa.edge.index.reader.query;
-
-
-public class IndexSearchBudget {
-    private long timeout;
-
-    public IndexSearchBudget(long limitTime) {
-        this.timeout = System.currentTimeMillis() + limitTime;
-    }
-
-    // Used for short-circuiting Stream-objects using takeWhile, we don't care
-    public boolean take(long unused) {
-        return System.currentTimeMillis() < timeout;
-    }
-
-}
@ -1,26 +0,0 @@
-package nu.marginalia.wmsa.edge.index.reader.query;
-
-import java.util.stream.LongStream;
-
-public interface Query {
-    Query EMPTY = new Query() {
-        @Override
-        public Query also(int wordId) { return this; }
-
-        @Override
-        public Query alsoCached(int wordId) { return this; }
-
-        @Override
-        public Query not(int wordId) { return this; }
-
-        @Override
-        public LongStream stream() { return LongStream.empty(); }
-    };
-
-    Query also(int wordId);
-    Query alsoCached(int wordId);
-
-    Query not(int wordId);
-
-    LongStream stream();
-}
@ -0,0 +1,107 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.protobuf.InvalidProtocolBufferException;
+import nu.marginalia.util.ListChunker;
+import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.edge.index.IndexServicesFactory;
+import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
+import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
+import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.model.EdgeDomain;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeId;
+import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
+import org.apache.http.HttpStatus;
+import spark.Request;
+import spark.Response;
+
+import java.util.Arrays;
+import java.util.List;
+
+@Singleton
+public class EdgeIndexLexiconService {
+
+    private final SearchIndexes indexes;
+    private final KeywordLexicon keywordLexicon;
+
+    @Inject
+    public EdgeIndexLexiconService(SearchIndexes indexes, IndexServicesFactory servicesFactory) {
+        this.indexes = indexes;
+        this.keywordLexicon = servicesFactory.getKeywordLexicon();
+    }
+
+    public Object getWordId(Request request, Response response) {
+        final String word = request.splat()[0];
+
+        var lr = indexes.getLexiconReader();
+        if (null == lr) {
+            response.status(HttpStatus.SC_FAILED_DEPENDENCY);
+            return "";
+        }
+
+        final int wordId = lr.get(word);
+
+        if (DictionaryHashMap.NO_VALUE == wordId) {
+            response.status(404);
+            return "";
+        }
+
+        return wordId;
+    }
+
+    public Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
+        var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
+
+        EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
+        EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
+        int idx = req.getIndex();
+
+        for (int ws = 0; ws < req.getWordSetCount(); ws++) {
+            putWords(domainId, urlId, req.getWordSet(ws), idx);
+        }
+
+        response.status(HttpStatus.SC_ACCEPTED);
+        return "";
+    }
+
+    public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
+                         IndexPutKeywordsReq.WordSet words, int idx
+    ) {
+        SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
+
+        IndexBlock block = IndexBlock.values()[words.getIndex()];
+
+        for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
+
+            var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
+            var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
+
+            indexWriter.put(header, entry);
+        }
+    }
+
+    private long[] getOrInsertWordIds(List<String> words) {
+        long[] ids = new long[words.size()];
+        int putIdx = 0;
+
+        for (String word : words) {
+            long id = keywordLexicon.getOrInsert(word);
+            if (id != DictionaryHashMap.NO_VALUE) {
+                ids[putIdx++] = id;
+            }
+        }
+
+        if (putIdx != words.size()) {
+            ids = Arrays.copyOf(ids, putIdx);
+        }
+        return ids;
+    }
+
+}
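putWords() above slices the incoming keyword list into chunks of at most SearchIndexJournalEntry.MAX_LENGTH before writing, so a single document may produce several journal entries. A rough illustration of just the chunking step, assuming ListChunker.chopList splits a list into sublists of at most the given size:

    List<String> words = List.of("a", "b", "c", "d", "e");
    for (var chunk : ListChunker.chopList(words, 2)) {
        System.out.println(chunk);   // prints [a, b], then [c, d], then [e]
    }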
@ -0,0 +1,44 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+@Singleton
+public class EdgeIndexOpsService {
+
+    private final SearchIndexes indexes;
+
+    @Inject
+    public EdgeIndexOpsService(SearchIndexes indexes) {
+        this.indexes = indexes;
+    }
+
+    public Object repartitionEndpoint(Request request, Response response) {
+
+        if (!indexes.repartition()) {
+            Spark.halt(503, "Operations busy");
+        }
+        return "OK";
+    }
+
+    public Object preconvertEndpoint(Request request, Response response) {
+        if (!indexes.preconvert()) {
+            Spark.halt(503, "Operations busy");
+        }
+        return "OK";
+    }
+
+    public Object reindexEndpoint(Request request, Response response) {
+        int id = Integer.parseInt(request.params("id"));
+
+        if (!indexes.reindex(id)) {
+            Spark.halt(503, "Operations busy");
+        }
+        return "OK";
+    }
+
+}
@ -0,0 +1,325 @@
+package nu.marginalia.wmsa.edge.index.svc;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import gnu.trove.set.hash.TIntHashSet;
+import io.prometheus.client.Counter;
+import io.prometheus.client.Histogram;
+import nu.marginalia.util.dict.DictionaryHashMap;
+import nu.marginalia.wmsa.client.GsonFactory;
+import nu.marginalia.wmsa.configuration.WmsaHome;
+import nu.marginalia.wmsa.edge.index.EdgeIndexBucket;
+import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
+import nu.marginalia.wmsa.edge.index.model.IndexBlock;
+import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexQueryCachePool;
+import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;
+import nu.marginalia.wmsa.edge.index.svc.query.ResultDomainDeduplicator;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.id.EdgeIdList;
+import nu.marginalia.wmsa.edge.model.search.*;
+import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
+import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
+import org.apache.http.HttpStatus;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import spark.HaltException;
+import spark.Request;
+import spark.Response;
+import spark.Spark;
+
+import java.util.*;
+import java.util.function.LongPredicate;
+
+import static java.util.Comparator.comparing;
+import static spark.Spark.halt;
+
+@Singleton
+public class EdgeIndexQueryService {
+
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+    private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
+    private static final int QUERY_FETCH_SIZE = 8192;
+    private static final int QUERY_FIRST_PASS_DOMAIN_LIMIT = 64;
+
+    private static final Counter wmsa_edge_index_query_timeouts = Counter.build().name("wmsa_edge_index_query_timeouts").help("-").register();
+
+    private static final Histogram wmsa_edge_index_query_time = Histogram.build().name("wmsa_edge_index_query_time").linearBuckets(50, 50, 15).help("-").register();
+    private static final Histogram wmsa_edge_index_domain_query_time = Histogram.build().name("wmsa_edge_index_domain_query_time").linearBuckets(50, 50, 15).help("-").register();
+
+    private final Gson gson = GsonFactory.get();
+
+    private final SearchIndexes indexes;
+
+    @Inject
+    public EdgeIndexQueryService(SearchIndexes indexes) {
+        this.indexes = indexes;
+    }
+
+    public Object searchDomain(Request request, Response response) {
+        if (indexes.getLexiconReader() == null) {
+            logger.warn("Dictionary reader not yet initialized");
+            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
+        }
+
+        String json = request.body();
+        EdgeDomainSearchSpecification specsSet = gson.fromJson(json, EdgeDomainSearchSpecification.class);
+
+        try {
+            return wmsa_edge_index_domain_query_time.time(() -> queryDomain(specsSet));
+        }
+        catch (HaltException ex) {
+            logger.warn("Halt", ex);
+            throw ex;
+        }
+        catch (Exception ex) {
+            logger.info("Error during domain search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
+            logger.info("Error", ex);
+            Spark.halt(500, "Error");
+            return null;
+        }
+    }
+
+    public Object search(Request request, Response response) {
+        if (indexes.getLexiconReader() == null) {
+            logger.warn("Dictionary reader not yet initialized");
+            halt(HttpStatus.SC_SERVICE_UNAVAILABLE, "Come back in a few minutes");
+        }
+
+        String json = request.body();
+        EdgeSearchSpecification specsSet = gson.fromJson(json, EdgeSearchSpecification.class);
+
+        try {
+            return wmsa_edge_index_query_time.time(() -> query(specsSet));
+        }
+        catch (HaltException ex) {
+            logger.warn("Halt", ex);
+            throw ex;
+        }
+        catch (Exception ex) {
+            logger.info("Error during search {}({}) (query: {})", ex.getClass().getSimpleName(), ex.getMessage(), json);
+            logger.info("Error", ex);
+            Spark.halt(500, "Error");
+            return null;
+        }
+    }
+
+
+    public EdgeSearchResultSet query(EdgeSearchSpecification specsSet) {
+        List<EdgeSearchResultItem> results = new SearchQuery(specsSet).execute();
+        return new EdgeSearchResultSet(results);
+    }
+
+    public EdgeDomainSearchResults queryDomain(EdgeDomainSearchSpecification specsSet) {
+
+        final OptionalInt wordId = lookUpWord(specsSet.keyword);
+        EdgeIdList<EdgeUrl> urlIds;
+
+        if (wordId.isEmpty()) {
+            urlIds = new EdgeIdList<>();
+        } else {
+            urlIds = indexes
+                    .getBucket(specsSet.bucket)
+                    .findHotDomainsForKeyword(specsSet.block, wordId.getAsInt(), specsSet.queryDepth, specsSet.minHitCount, specsSet.maxResults)
+                    .mapToInt(lv -> (int) (lv & 0xFFFF_FFFFL))
+                    .collect(EdgeIdList::new, EdgeIdList::add, EdgeIdList::addAll);
+        }
+
+        return new EdgeDomainSearchResults(specsSet.keyword, urlIds);
+    }
+
+    private class SearchQuery {
+        private final TIntHashSet seenResults = new TIntHashSet(QUERY_FETCH_SIZE, 0.5f);
+        private final EdgeSearchSpecification specsSet;
+        private final IndexSearchBudget budget = new IndexSearchBudget(SEARCH_BUDGET_TIMEOUT_MS);
+        private final IndexQueryCachePool cachePool = new IndexQueryCachePool();
+
+        public SearchQuery(EdgeSearchSpecification specsSet) {
+            this.specsSet = specsSet;
+        }
+
+        private List<EdgeSearchResultItem> execute() {
+            final Set<EdgeSearchResultItem> results = new HashSet<>(QUERY_FETCH_SIZE);
+
+            for (var sq : specsSet.subqueries) {
+                results.addAll(performSearch(sq));
+            }
+
+            for (var result : results) {
+                addResultScores(result);
+            }
+
+            if (!budget.hasTimeLeft()) {
+                wmsa_edge_index_query_timeouts.inc();
+            }
+
+            var domainCountFilter = new ResultDomainDeduplicator(specsSet.limitByDomain);
+
+            if (WmsaHome.isDebug()) {
+                cachePool.printSummary(logger);
+            }
+            cachePool.clear();
+
+            return results.stream()
+                    .sorted(
+                            comparing(EdgeSearchResultItem::getScore)
+                                    .thenComparing(EdgeSearchResultItem::getRanking)
+                                    .thenComparing(EdgeSearchResultItem::getUrlIdInt)
+                    )
+                    .filter(domainCountFilter::test)
+                    .limit(specsSet.getLimitTotal()).toList();
+        }
+
+
+        private List<EdgeSearchResultItem> performSearch(EdgeSearchSubquery sq)
+        {
+
+            final List<EdgeSearchResultItem> results = new ArrayList<>(QUERY_FETCH_SIZE);
+            final EdgeIndexSearchTerms searchTerms = getSearchTerms(sq);
+
+            if (searchTerms.isEmpty())
+                return Collections.emptyList();
+
+            for (int indexBucket : specsSet.buckets) {
+                final ResultDomainDeduplicator localFilter = new ResultDomainDeduplicator(QUERY_FIRST_PASS_DOMAIN_LIMIT);
+
+                if (!budget.hasTimeLeft()) {
+                    logger.info("Query timed out, omitting {}:{} for query {}", indexBucket, sq.block, sq.searchTermsInclude);
+                    continue;
+                }
+
+                if (QUERY_FETCH_SIZE <= results.size())
+                    break;
+
+                IndexQuery query = getQuery(cachePool, indexBucket, sq.block, localFilter::filterRawValue, searchTerms);
+                long[] buf = new long[8192];
+
+                while (query.hasMore() && results.size() < QUERY_FETCH_SIZE && budget.hasTimeLeft()) {
+                    int cnt = query.getMoreResults(buf, budget);
+
+                    for (int i = 0; i < cnt && results.size() < QUERY_FETCH_SIZE; i++) {
+                        final long id = buf[i];
+
+                        if (!seenResults.add((int)(id & 0xFFFF_FFFFL)) || !localFilter.test(id)) {
+                            continue;
+                        }
+
+                        results.add(new EdgeSearchResultItem(indexBucket, id));
+                    }
+                }
+
+            }
+
+            return results;
+        }
+
+        private IndexQuery getQuery(IndexQueryCachePool cachePool, int bucket, IndexBlock block,
+                                    LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
+
+            if (!indexes.isValidBucket(bucket)) {
+                logger.warn("Invalid bucket {}", bucket);
+                return new IndexQuery(Collections.emptyList());
+            }
+
+            return indexes.getBucket(bucket).getQuery(cachePool, block, filter, searchTerms);
+        }
+
+        private void addResultScores(EdgeSearchResultItem searchResult) {
+            final var reader = Objects.requireNonNull(indexes.getLexiconReader());
+
+            List<List<String>> searchTermVariants = specsSet.subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList();
+
+            // Memoize calls to getTermData, as they're somewhat expensive and highly redundant
+            Map<ResultTerm, ResultTermData> termMetadata = new HashMap<>(32);
+
+            double bestScore = 0;
+
+            for (int searchTermListIdx = 0; searchTermListIdx < searchTermVariants.size(); searchTermListIdx++) {
+                double setScore = 0;
+                int setSize = 0;
+                for (var searchTerm : searchTermVariants.get(searchTermListIdx)) {
+
+                    final int termId = reader.get(searchTerm);
+
+                    ResultTermData data = termMetadata.computeIfAbsent(
+                            new ResultTerm(searchResult.bucketId, termId, searchResult.getCombinedId()), this::getTermData);
+
+                    var score = data.asScore(searchTermListIdx, searchTerm);
+                    searchResult.scores.add(score);
+                    setScore += score.value();
+                    setSize++;
+                }
+                bestScore = Math.min(bestScore, setScore/setSize);
+            }
+
+            searchResult.setScore(bestScore);
+        }
+
+        private ResultTermData getTermData(ResultTerm resultTerm) {
+            final EdgeIndexBucket bucket = indexes.getBucket(resultTerm.bucket);
+            final int termId = resultTerm.termId;
+            final long combinedUrlId = resultTerm.combinedUrlId;
+
+            return new ResultTermData(bucket.getTermScore(cachePool, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Title, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Link, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Site, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Subjects, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.NamesWords, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Top, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Middle, termId, combinedUrlId),
+                    bucket.isTermInBucket(cachePool, IndexBlock.Tfidf_Lower, termId, combinedUrlId)
+            );
+        }
+
+        record ResultTerm (int bucket, int termId, long combinedUrlId) {}
+        record ResultTermData (IndexBlock index,
+                               boolean title,
+                               boolean link,
+                               boolean site,
+                               boolean subject,
+                               boolean name,
+                               boolean high,
+                               boolean mid,
+                               boolean low
+        ) {
+            public EdgeSearchResultKeywordScore asScore(int set, String searchTerm) {
+                return new EdgeSearchResultKeywordScore(set, searchTerm, index, title, link, site, subject, name, high, mid, low);
+            }
+        }
+    }
+
+
+    private EdgeIndexSearchTerms getSearchTerms(EdgeSearchSubquery request) {
+        final List<Integer> excludes = new ArrayList<>();
+        final List<Integer> includes = new ArrayList<>();
+
+        for (var include : request.searchTermsInclude) {
+            var word = lookUpWord(include);
+            if (word.isEmpty()) {
+                logger.debug("Unknown search term: " + include);
+                return new EdgeIndexSearchTerms(includes, excludes);
+            }
+            includes.add(word.getAsInt());
+        }
+
+        for (var exclude : request.searchTermsExclude) {
+            lookUpWord(exclude).ifPresent(excludes::add);
+        }
+
+        return new EdgeIndexSearchTerms(includes, excludes);
+    }
+
+
+    private OptionalInt lookUpWord(String s) {
+        int ret = indexes.getLexiconReader().get(s);
+        if (ret == DictionaryHashMap.NO_VALUE) {
+            return OptionalInt.empty();
+        }
+        return OptionalInt.of(ret);
+    }
+
+}
@ -0,0 +1,97 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;

import java.util.ArrayList;
import java.util.List;

import static java.lang.Math.min;

public class IndexQuery {
    private final List<EntrySource> sources;
    private final List<QueryFilterStepIf> inclusionFilter = new ArrayList<>(10);
    private final List<QueryFilterStepIf> priorityFilter = new ArrayList<>(10);

    public IndexQuery(List<EntrySource> sources) {
        this.sources = sources;
    }

    public void addInclusionFilter(QueryFilterStepIf filter) {
        inclusionFilter.add(filter);
    }

    public void addPriorityFilter(QueryFilterStepIf filter) {
        priorityFilter.add(filter);
    }

    private int si = 0;

    public boolean hasMore() {
        return si < sources.size();
    }

    public int getMoreResults(long[] dest, IndexSearchBudget budget) {
        final EntrySource source = sources.get(si);

        int bufferUtilizedLength = source.read(dest, dest.length);

        if (bufferUtilizedLength <= 0) {
            si++;
            return 0;
        }

        for (var filter : inclusionFilter) {
            bufferUtilizedLength = filter.retainDestructive(dest, bufferUtilizedLength);

            if (bufferUtilizedLength <= 0) {
                si++;
                return 0;
            }
        }

        if (budget.hasTimeLeft()) {
            prioritizeBuffer(dest, source, bufferUtilizedLength, budget);
        }

        int count = min(bufferUtilizedLength, dest.length);
        System.arraycopy(dest, 0, dest, 0, count);
        return count;
    }

    private void prioritizeBuffer(long[] dest, EntrySource source, int remainingBufferSize, IndexSearchBudget budget) {
        int prioStart = 0;

        for (var filter : priorityFilter) {
            if (!budget.hasTimeLeft())
                break;

            if (filter.getIndex() == source.getIndex())
                continue;

            prioStart += filter.retainReorder(dest, prioStart, remainingBufferSize);

            if (prioStart >= remainingBufferSize) {
                break;
            }
        }
    }

    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Sources:\n");

        for (var source: sources) {
            sb.append("\t").append(source.getIndex().name).append("\n");
        }
        sb.append("Includes:\n");
        for (var include : inclusionFilter) {
            sb.append("\t").append(include.describe()).append("\n");
        }

        return sb.toString();
    }
}
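A minimal usage sketch (not part of this changeset): a caller is presumably expected to drain the query into a reusable buffer until the sources are exhausted or the budget runs out. The buffer size, the 50 ms budget and the consumeResult() sink are assumptions.

// Illustrative only; assumes the classes added in this changeset are on the classpath.
import nu.marginalia.wmsa.edge.index.svc.query.IndexQuery;
import nu.marginalia.wmsa.edge.index.svc.query.IndexSearchBudget;

class IndexQueryUsageSketch {
    void drain(IndexQuery query) {
        long[] buffer = new long[512];                          // assumed buffer size
        IndexSearchBudget budget = new IndexSearchBudget(50);   // assumed 50 ms budget

        // getMoreResults() may return 0 while hasMore() is still true,
        // which simply advances to the next entry source.
        while (query.hasMore() && budget.hasTimeLeft()) {
            int n = query.getMoreResults(buffer, budget);
            for (int i = 0; i < n; i++) {
                consumeResult(buffer[i]);                        // hypothetical sink
            }
        }
    }

    void consumeResult(long combinedUrlId) { /* hypothetical */ }
}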
@ -0,0 +1,60 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import nu.marginalia.util.btree.CachingBTreeReader;
import nu.marginalia.wmsa.edge.index.reader.IndexWordsTable;
import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import org.slf4j.Logger;

import java.util.HashMap;
import java.util.Map;

public class IndexQueryCachePool {
    private final Map<PoolKey, CachingBTreeReader.BTreeCachedIndex> indexCaches = new HashMap<>();
    private final Map<RangeKey, SearchIndex.IndexBTreeRange> rangeCache = new HashMap<>();
    private final Map<PoolKey, Integer> savedCounts = new HashMap<>();

    public CachingBTreeReader.BTreeCachedIndex getIndexCache(SearchIndex index, SearchIndex.IndexBTreeRange range) {
        var key = new PoolKey(index, range.dataOffset);
        var entry = indexCaches.get(key);

        if (entry == null) {
            entry = range.createIndexCache();
            indexCaches.put(key, entry);
        }
        else {
            savedCounts.merge(key, 1, Integer::sum);
        }

        return entry;
    }

    public boolean isUrlPresent(SearchIndex index, int term, long url) {
        var range = index.rangeForWord(this, term);
        return range.isPresent() && range.hasUrl(this, url);
    }

    public void printSummary(Logger logger) {
        long loadedBytes = indexCaches.values().stream().mapToLong(CachingBTreeReader.BTreeCachedIndex::sizeBytes).sum();
        long savedBytes = savedCounts.entrySet().stream().mapToLong(e -> e.getValue() * indexCaches.get(e.getKey()).sizeBytes()).sum();

        long loaded = indexCaches.values().stream().filter(CachingBTreeReader.BTreeCachedIndex::isLoaded).count();

        logger.info("Index Cache Summary: {}/{} loaded/total, {} index blocks loaded, {} index blocks saved", loaded, indexCaches.size(), loadedBytes/4096., savedBytes/4096.);
    }

    public void clear() {
        indexCaches.clear();
    }

    public SearchIndex.IndexBTreeRange getRange(IndexWordsTable words, int wordId) {
        return rangeCache.get(new RangeKey(words, wordId));
    }

    public void cacheRange(IndexWordsTable words, int wordId, SearchIndex.IndexBTreeRange range) {
        rangeCache.put(new RangeKey(words, wordId), range);
    }

    private record RangeKey(IndexWordsTable table, int wordId) {}
    private record PoolKey(SearchIndex index, long dataOffset) {}
}
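A minimal sketch, assuming a pool is created per query and discarded afterwards (not part of this changeset); the index, term id, url id and logger are assumed inputs.

// Illustrative only: probe a single term/url pair through a fresh cache pool.
boolean probeSketch(SearchIndex searchIndex, int termId, long combinedUrlId, Logger logger) {
    var pool = new IndexQueryCachePool();

    boolean present = pool.isUrlPresent(searchIndex, termId, combinedUrlId);

    pool.printSummary(logger);   // reports loaded vs. re-used cached B-tree index blocks
    pool.clear();

    return present;
}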
@ -0,0 +1,103 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import nu.marginalia.wmsa.edge.index.reader.SearchIndex;
import nu.marginalia.wmsa.edge.index.svc.query.types.EntrySource;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterBTreeRange;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepFromPredicate;
import nu.marginalia.wmsa.edge.index.svc.query.types.filter.QueryFilterStepIf;

import java.util.*;
import java.util.function.LongPredicate;
import java.util.stream.Collectors;

public class IndexQueryFactory {
    private final List<SearchIndex> requiredIndices;
    private final List<SearchIndex> excludeIndex;
    private final List<SearchIndex> priortyIndices;

    public Collection<SearchIndex> getIndicies() {
        return requiredIndices;
    }

    public IndexQueryFactory(List<SearchIndex> requiredIndices, List<SearchIndex> excludeIndex, List<SearchIndex> priortyIndices) {
        this.requiredIndices = requiredIndices.stream().filter(Objects::nonNull).collect(Collectors.toList());
        this.excludeIndex = excludeIndex;
        this.priortyIndices = priortyIndices;
    }

    public IndexQueryBuilder buildQuery(IndexQueryCachePool cachePool, int firstWordId) {
        List<EntrySource> sources = new ArrayList<>(requiredIndices.size());

        for (var ri : requiredIndices) {
            var range = ri.rangeForWord(cachePool, firstWordId);
            if (range.isPresent()) {
                sources.add(range.asEntrySource());
            }
        }

        return new IndexQueryBuilder(new IndexQuery(sources), cachePool);
    }

    public class IndexQueryBuilder {
        private final IndexQuery query;
        private final IndexQueryCachePool cachePool;

        IndexQueryBuilder(IndexQuery query,
                          IndexQueryCachePool cachePool) {
            this.query = query;
            this.cachePool = cachePool;
        }

        public void filter(LongPredicate predicate) {
            query.addInclusionFilter(new QueryFilterStepFromPredicate(predicate));
        }

        public IndexQueryBuilder also(int termId) {
            List<QueryFilterStepIf> filters = new ArrayList<>(requiredIndices.size());

            for (var ri : requiredIndices) {
                var range = ri.rangeForWord(cachePool, termId);

                if (range.isPresent()) {
                    filters.add(new QueryFilterBTreeRange(ri, range, cachePool));
                }
                else {
                    filters.add(QueryFilterStepIf.noPass());
                }
            }

            filters.sort(Comparator.naturalOrder());
            query.addInclusionFilter(QueryFilterStepIf.anyOf(filters));

            return this;
        }

        public IndexQueryBuilder not(int termId) {
            for (var ri : excludeIndex) {
                var range = ri.rangeForWord(cachePool, termId);
                if (range.isPresent()) {
                    query.addInclusionFilter(range.asExcludeFilterStep(cachePool));
                }
            }

            return this;
        }

        public void prioritize(int termId) {
            for (var idx : priortyIndices) {
                var range = idx.rangeForWord(cachePool, termId);
                if (range.isPresent()) {
                    query.addPriorityFilter(new QueryFilterBTreeRange(idx, range, cachePool));
                }
            }
        }

        public IndexQuery build() {
            return query;
        }
    }
}
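A minimal sketch of how the builder is presumably chained (not part of this changeset); the factory instance and the already-resolved include/exclude/priority term ids are assumed inputs.

// Illustrative only: compose a query from word ids that have already been
// looked up in the lexicon.
IndexQuery buildSketch(IndexQueryFactory queryFactory,
                       List<Integer> includeTerms,
                       List<Integer> excludeTerms,
                       List<Integer> priorityTerms) {
    var pool = new IndexQueryCachePool();
    var builder = queryFactory.buildQuery(pool, includeTerms.get(0));

    for (int i = 1; i < includeTerms.size(); i++) {
        builder.also(includeTerms.get(i));   // require every remaining include term
    }
    for (int term : excludeTerms) {
        builder.not(term);                   // drop documents carrying excluded terms
    }
    for (int term : priorityTerms) {
        builder.prioritize(term);            // push priority matches toward the front
    }

    return builder.build();
}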
@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import java.util.stream.LongStream;

public interface IndexQueryIf {
    IndexQueryIf also(int wordId);
    IndexQueryIf alsoCached(int wordId);

    IndexQueryIf not(int wordId);

    LongStream stream();
}
@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.index.svc.query;


public class IndexSearchBudget {
    private final long timeout;

    public IndexSearchBudget(long limitTime) {
        this.timeout = System.currentTimeMillis() + limitTime;
    }

    public boolean hasTimeLeft() { return System.currentTimeMillis() < timeout; }
}
@ -0,0 +1,50 @@
package nu.marginalia.wmsa.edge.index.svc.query;

import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultItem;

public class ResultDomainDeduplicator {
    final TLongIntMap resultsByRankingId = new TLongIntHashMap(2048, 0.5f, -1, 0);
    final int limitByDomain;

    public ResultDomainDeduplicator(int limitByDomain) {
        this.limitByDomain = limitByDomain;
    }

    public boolean filterRawValue(long value) {
        int rankingId = (int) (value >>> 32);

        if (rankingId == Integer.MAX_VALUE) {
            return true;
        }

        return resultsByRankingId.get(getKey(rankingId)) <= limitByDomain;
    }

    long getKey(int rankingId) {
        return rankingId;
    }

    public boolean test(long value) {
        int ranking = (int) (value >>> 32);
        if (ranking == Integer.MAX_VALUE) {
            return true;
        }

        return resultsByRankingId.adjustOrPutValue(ranking, 1, 1) <= limitByDomain;
    }

    public boolean test(EdgeSearchResultItem item) {
        final int ranking = item.getRanking();
        if (ranking == Integer.MAX_VALUE) {
            return true;
        }

        // For ResultItems, consider bucketId as well as different buckets may use different
        // ranking algorithms
        final long key = ranking*32L + item.bucketId;

        return resultsByRankingId.adjustOrPutValue(key, 1, 1) <= limitByDomain;
    }
}
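A minimal sketch of the intended use (not part of this changeset): keep at most a fixed number of results per ranking group; the cap of 3 and the incoming result list are assumptions.

// Illustrative only: filter an assumed list of search results through the deduplicator.
List<EdgeSearchResultItem> limitPerDomain(List<EdgeSearchResultItem> results) {
    var deduplicator = new ResultDomainDeduplicator(3);   // assumed cap per domain group

    List<EdgeSearchResultItem> kept = new ArrayList<>();
    for (var item : results) {
        if (deduplicator.test(item)) {   // true while the item's group is still under the cap
            kept.add(item);
        }
    }
    return kept;
}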
@ -0,0 +1,9 @@
package nu.marginalia.wmsa.edge.index.svc.query.types;

import nu.marginalia.wmsa.edge.index.reader.SearchIndex;

public interface EntrySource {
    SearchIndex getIndex();
    int read(long[] buffer, int n);
}
Some files were not shown because too many files have changed in this diff.