mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00

commit 3200c36072 (parent db056be06a)

    Experimental changes for 22-08/09 update.
@@ -58,7 +58,7 @@ jmhJar {
 }
 
 dependencies {
     implementation project(':third_party')
+    implementation project(':protocol')
 
     implementation 'org.projectlombok:lombok:1.18.24'
     annotationProcessor 'org.projectlombok:lombok:1.18.24'
@@ -157,6 +157,9 @@ dependencies {
 
     jmh 'org.openjdk.jmh:jmh-core:1.35'
     jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
 
+    implementation 'com.dslplatform:dsl-json:1.9.9'
+    annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9'
 }
 
 configurations {
@@ -1,18 +1,18 @@
 package nu.marginalia.util.dict;
 
-import nu.marginalia.util.SeekDictionary;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.ByteBuffer;
 import java.nio.LongBuffer;
+import java.util.ArrayList;
 
 public class DictionaryData {
 
     private final int DICTIONARY_BANK_SIZE;
     private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
 
-    private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
+    private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);
 
     public DictionaryData(int bankSize) {
         DICTIONARY_BANK_SIZE = bankSize;
@@ -20,12 +20,8 @@ public class DictionaryData {
         banks.add(new DictionaryDataBank(0, bankSize));
     }
 
-    public int size() {
-        return banks.end();
-    }
-
     public int add(long key) {
-        var activeBank = banks.last();
+        var activeBank = banks.get(banks.size()-1);
         int rb = activeBank.add(key);
 
         if (rb == -1) {
@@ -42,10 +38,10 @@ public class DictionaryData {
 
 
     public long getKey(int offset) {
-        return banks.bankForOffset(offset).getKey(offset);
+        return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
     }
     public boolean keyEquals(int offset, long otherKey) {
-        return banks.bankForOffset(offset).keyEquals(offset, otherKey);
+        return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
     }
 
     private static class DictionaryDataBank {
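Note on the DictionaryData change above: dropping SeekDictionary in favor of a plain ArrayList works because every bank is created with the same DICTIONARY_BANK_SIZE capacity, so the bank owning a given offset can be found by integer division instead of a lookup structure. A minimal sketch of the invariant (DictionaryDataBank internals are assumed):

    // Sketch: with fixed-size banks, offset -> bank resolution is O(1).
    // Each DictionaryDataBank is assumed to know its own base offset and to
    // resolve `offset` relative to it, as the getKey/keyEquals calls suggest.
    int bankIndex = offset / DICTIONARY_BANK_SIZE;  // which bank holds the offset
    var bank = banks.get(bankIndex);                // constant-time, no search
    long key = bank.getKey(offset);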
@@ -19,7 +19,12 @@ public class WordPatterns {
     public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
     public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");
 
+    public static final Pattern singleWordAdditionalPattern =
+            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");
+
+    public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
     public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
 
     public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
     public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
     public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
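The new singleWordAdditionalPattern above admits short alphanumeric tokens with up to four tails separated by . - _ / or :, which keeps version strings and path-like tokens that the stricter word predicates would drop. A standalone illustration (not from the repo):

    import java.util.function.Predicate;
    import java.util.regex.Pattern;

    Pattern p = Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");
    Predicate<String> ok = p.asMatchPredicate();   // requires the whole string to match

    ok.test("v1.2.3");    // true  -- dotted version token
    ok.test("tcp:8080");  // true  -- name:port style token
    ok.test("hello");     // true  -- plain alphanumeric word
    ok.test("++--");      // false -- pure noise, caught by characterNoisePattern instead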
@@ -8,7 +8,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
-import org.jetbrains.annotations.NotNull;
 
 import javax.inject.Inject;
 import java.util.*;
@@ -45,7 +44,6 @@ public class DocumentKeywordExtractor {
         List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
         List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
         List<WordRep> subjects = subjectCounter.count(documentLanguageData);
-        List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);
 
         int totalSize = wordsTfIdf.size();
 
@@ -61,17 +59,6 @@ public class DocumentKeywordExtractor {
 
         var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
 
-        var words = getSimpleWords(documentLanguageData);
-
-        for (var w : wordsLongName)
-            words.add(w.word);
-        for (var w : lowKeywords)
-            words.remove(w.word);
-        for (var w : midKeywords)
-            words.remove(w.word);
-        for (var w : topKeywords)
-            words.remove(w.word);
-
         Collection<String> artifacts = getArtifacts(documentLanguageData);
 
         var wordSet = new EdgePageWordSet(
@@ -85,15 +72,81 @@ public class DocumentKeywordExtractor {
                 new EdgePageWords(IndexBlock.Artifacts, artifacts)
         );
 
-        wordSet.append(IndexBlock.Words, words);
+        getSimpleWords(wordSet, documentLanguageData,
+                IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
 
         return wordSet;
     }
 
+    private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
+
+        int start = 0;
+        int lengthGoal = 32;
+
+        for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
+            IndexBlock block = blocks[blockIdx];
+            Set<String> words = new HashSet<>(lengthGoal+100);
+
+            int pos;
+            int length = 0;
+            for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
+                var sent = documentLanguageData.sentences[pos];
+                length += sent.length();
+
+                for (var word : sent) {
+                    if (!word.isStopWord()) {
+                        String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
+                        if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
+                            words.add(w);
+                        }
+                    }
+                }
+            }
+            wordSet.append(block, words);
+            start = pos;
+            lengthGoal+=32;
+        }
+
+        if (start < documentLanguageData.sentences.length) {
+
+            Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
+            for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
+                var sent = documentLanguageData.sentences[pos];
+                for (var word : sent) {
+                    if (!word.isStopWord()) {
+                        String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
+                        if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
+                            counts.merge(w, 1, Integer::sum);
+                        }
+                    }
+                }
+            }
+
+            Set<String> lastSet;
+            if (counts.size() < 1024) {
+                lastSet = counts.keySet();
+            }
+            else {
+                lastSet = counts.entrySet().stream()
+                        .sorted(Comparator.comparing(e -> {
+                            double N = 11820118.; // Number of documents in term freq dictionary
+
+                            // Caveat: This is actually the *negated* term score, because the second logarithm has
+                            // its parameter inverted (log(a^b) = b log(a); here b = -1)
+                            return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
+                        }))
+                        .map(Map.Entry::getKey)
+                        .limit(1024)
+                        .collect(Collectors.toCollection(LinkedHashSet::new));
+            }
+
+            wordSet.append(blocks[blocks.length - 1], lastSet);
+        }
+    }
+
     private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
         Set<String> reps = new HashSet<>();
 
 
         for (var sent : documentLanguageData.sentences) {
             for (var word : sent) {
                 String lc = word.wordLowerCase();
@@ -138,33 +191,6 @@ public class DocumentKeywordExtractor {
         return ret;
     }
 
-    @NotNull
-    private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
-        Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
-
-        for (var sent : documentLanguageData.sentences) {
-            for (int i = 0; i < sent.length(); i++) {
-                if (!sent.isStopWord(i)) {
-                    String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
-                    if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
-                        counts.merge(w, 1, Integer::sum);
-                    }
-                }
-            }
-        }
-
-        return counts.entrySet().stream()
-                .sorted(Comparator.comparing(e -> {
-                    double N = 11820118.; // Number of documents in term freq dictionary
-
-                    // Caveat: This is actually the *negated* term score, because the second logarithm has
-                    // its parameter inverted (log(a^b) = b log(a); here b = -1)
-                    return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
-                }))
-                .map(Map.Entry::getKey)
-                .limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
-    }
-
 
     public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
         return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
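The overflow bucket above keeps the same ranking weight as the deleted getSimpleWords: a tf-idf-style score left negated so that an ascending sort surfaces words that are frequent in the document but rare in the corpus. Restated on its own (the document-frequency lookup is assumed):

    // score(w) = (1 + log tf(w)) * log((1 + df(w)) / N), with N ~ 11,820,118 documents.
    // Since df << N the second factor is negative, so smaller scores are better;
    // sorting ascending therefore ranks rare-in-corpus, frequent-in-document words first.
    static double termScore(int tf, long df) {
        double N = 11_820_118.;
        return (1 + Math.log(tf)) * Math.log((1. + df) / N);
    }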
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
 
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
+import com.google.protobuf.GeneratedMessageV3;
 import io.reactivex.rxjava3.core.Observable;
 import io.reactivex.rxjava3.core.ObservableSource;
 import io.reactivex.rxjava3.plugins.RxJavaPlugins;
@@ -17,8 +18,6 @@ import org.apache.http.HttpHost;
 import org.apache.logging.log4j.ThreadContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.slf4j.Marker;
-import org.slf4j.MarkerFactory;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -186,6 +185,31 @@ public abstract class AbstractClient implements AutoCloseable {
                 .doFinally(() -> ThreadContext.remove("outbound-request"));
     }
 
+    @SneakyThrows
+    protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {
+
+        ensureAlive();
+
+        RequestBody body = RequestBody.create(
+                MediaType.parse("application/protobuf"),
+                data.toByteArray());
+
+        var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
+        var call = client.newCall(req);
+
+        logInbound(call);
+        ThreadContext.put("outbound-request", url + endpoint);
+        try (var rsp = call.execute()) {
+            logOutbound(rsp);
+            int code = rsp.code();
+
+            return validateStatus(code, req).map(HttpStatusCode::new);
+        }
+        finally {
+            ThreadContext.remove("outbound-request");
+        }
+    }
+
 
     @SneakyThrows
     protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {
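The new post overload above accepts any generated protobuf message, sends it as application/protobuf, and surfaces the HTTP status through the usual Observable pipeline. A hedged sketch of a call site inside a client subclass (the ids and logging are illustrative, not from the diff):

    IndexPutKeywordsReq req = IndexPutKeywordsReq.newBuilder()
            .setDomain(14)      // illustrative ids, not real data
            .setUrl(92)
            .setIndex(0)
            .build();

    post(Context.internal(), "/words/", req)
            .blockingSubscribe(status -> logger.info("put words -> {}", status));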
@@ -76,7 +76,7 @@ public class LinkKeywordLoaderMain {
 
 //            System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);
 
-            indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
+            indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet(
                     new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
             ).blockingSubscribe();
         }
@@ -39,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
         while (!canceled) {
             var data = insertQueue.poll(1, TimeUnit.SECONDS);
             if (data != null) {
-                client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
+                client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe();
             }
         }
     }
@@ -147,7 +147,10 @@ public class DocumentProcessor {
             throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
         }
 
-        var dld = sentenceExtractor.extractSentences(doc.clone());
+        DomPruner domPruner = new DomPruner();
+        Document prunedDoc = doc.clone();
+        domPruner.prune(prunedDoc, 0.5);
+        var dld = sentenceExtractor.extractSentences(prunedDoc);
 
         checkDocumentLanguage(dld);
 
@@ -192,7 +195,7 @@ public class DocumentProcessor {
         ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
 
         words.append(IndexBlock.Meta, tagWords);
-        words.append(IndexBlock.Words, tagWords);
+        words.append(IndexBlock.Words_1, tagWords);
     }
 
     private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -0,0 +1,111 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.select.NodeVisitor;
+
+import java.util.HashMap;
+import java.util.Map;
+
+public class DomPruner {
+
+    public void prune(Document document, double pruneThreshold) {
+        PruningVisitor pruningVisitor = new PruningVisitor();
+        document.traverse(pruningVisitor);
+
+        pruningVisitor.data.forEach((node, data) -> {
+            if (data.depth <= 1) {
+                return;
+            }
+            if (data.signalNodeSize == 0) node.remove();
+            else if (data.noiseNodeSize > 0
+                    && data.signalRate() < pruneThreshold
+                    && data.treeSize > 3) {
+                node.remove();
+            }
+        });
+    }
+
+
+    private static class PruningVisitor implements NodeVisitor {
+
+        private final Map<Node, NodeData> data = new HashMap<>();
+        private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
+
+        @Override
+        public void head(Node node, int depth) {}
+
+        @Override
+        public void tail(Node node, int depth) {
+            final NodeData dataForNode;
+
+            if (node instanceof TextNode tn) {
+                dataForNode = new NodeData(depth, tn.text().length(), 0);
+            }
+            else if (isSignal(node)) {
+                dataForNode = new NodeData(depth, 0,0);
+                for (var childNode : node.childNodes()) {
+                    dataForNode.add(data.getOrDefault(childNode, dummy));
+                }
+            }
+            else {
+                dataForNode = new NodeData(depth, 0,0);
+                for (var childNode : node.childNodes()) {
+                    dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
+                }
+            }
+
+            data.put(node, dataForNode);
+        }
+
+        public boolean isSignal(Node node) {
+
+            if (node instanceof Element e) {
+                if ("a".equalsIgnoreCase(e.tagName()))
+                    return false;
+                if ("nav".equalsIgnoreCase(e.tagName()))
+                    return false;
+                if ("footer".equalsIgnoreCase(e.tagName()))
+                    return false;
+                if ("header".equalsIgnoreCase(e.tagName()))
+                    return false;
+            }
+
+            return true;
+        }
+    }
+
+    private static class NodeData {
+        int signalNodeSize;
+        int noiseNodeSize;
+        int treeSize = 1;
+        int depth;
+
+        private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
+            this.depth = depth;
+            this.signalNodeSize = signalNodeSize;
+            this.noiseNodeSize = noiseNodeSize;
+        }
+
+        public void add(NodeData other) {
+            signalNodeSize += other.signalNodeSize;
+            noiseNodeSize += other.noiseNodeSize;
+            treeSize += other.treeSize;
+        }
+
+        public void addAsNoise(NodeData other) {
+            noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
+            treeSize += other.treeSize;
+        }
+
+
+        public double signalRate() {
+            return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
+        }
+    }
+}
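DomPruner, added above, visits the DOM bottom-up: text length under ordinary elements counts as signal, text under a/nav/footer/header counts as noise, and any subtree below the root whose signal rate drops under the threshold (or that contains no signal at all) is removed. A usage sketch:

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;

    Document doc = Jsoup.parse("""
            <body>
              <nav><a href="/">Home</a> <a href="/about">About</a></nav>
              <p>The actual article text, long enough to dominate the signal count.</p>
            </body>""");

    new DomPruner().prune(doc, 0.5);  // remove subtrees where under half the text is signal
    String cleaned = doc.text();      // navigation chrome pruned, article text kept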
@@ -4,11 +4,11 @@ import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 import com.google.inject.Inject;
 import com.google.inject.name.Named;
+import com.google.protobuf.InvalidProtocolBufferException;
 import gnu.trove.map.TLongIntMap;
 import gnu.trove.map.hash.TIntIntHashMap;
 import gnu.trove.map.hash.TLongIntHashMap;
 import gnu.trove.set.hash.TIntHashSet;
-import io.prometheus.client.Counter;
 import io.prometheus.client.Histogram;
 import io.reactivex.rxjava3.schedulers.Schedulers;
 import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
@@ -22,18 +22,16 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
 import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
 import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
 import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
-import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
 import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeId;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
 import nu.marginalia.wmsa.edge.model.search.*;
 import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
 import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
+import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
 import org.apache.http.HttpStatus;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
@@ -52,7 +50,7 @@ import static spark.Spark.get;
 import static spark.Spark.halt;
 
 public class EdgeIndexService extends Service {
-    private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
+    private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
@@ -66,11 +64,9 @@ public class EdgeIndexService extends Service {
             .create();
 
     private static final Histogram wmsa_edge_index_query_time
-            = Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
-    private static final Counter wmsa_edge_index_query_count
-            = Counter.build().name("wmsa_edge_index_query_count").help("-").register();
-    private static final Histogram wmsa_edge_index_put_words_time
-            = Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
+            = Histogram.build().name("wmsa_edge_index_query_time")
+              .linearBuckets(50, 50, 15)
+              .help("-").register();
 
     public static final int DYNAMIC_BUCKET_LENGTH = 7;
 
@@ -162,12 +158,15 @@ public class EdgeIndexService extends Service {
         indexes.initialize(init);
     }
 
-    private Object putWords(Request request, Response response) {
-        var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
-
-        synchronized (this) {
-            putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
-                    putWordsRequest.wordSet, putWordsRequest.getIndex());
+    private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
+        var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
+
+        EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
+        EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
+        int idx = req.getIndex();
+
+        for (int ws = 0; ws < req.getWordSetCount(); ws++) {
+            putWords(domainId, urlId, req.getWordSet(ws), idx);
         }
 
         response.status(HttpStatus.SC_ACCEPTED);
@@ -175,26 +174,16 @@ public class EdgeIndexService extends Service {
     }
 
     public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
-                         EdgePageWordSet wordSet, int idx
-    ) {
-        wmsa_edge_index_put_words_time.time(() -> {
-            for (EdgePageWords words : wordSet.values()) {
-                putWords(domainId, urlId, words, idx);
-            }
-        });
-
-    }
-
-    public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
-                         EdgePageWords words, int idx
+                         IndexPutKeywordsReq.WordSet words, int idx
     ) {
         SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
 
-        for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
+        IndexBlock block = IndexBlock.values()[words.getIndex()];
+
+        for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
+
             var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
-            var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
+            var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
 
             indexWriter.put(header, entry);
         };
@@ -257,7 +246,6 @@ public class EdgeIndexService extends Service {
         }
         finally {
             wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
-            wmsa_edge_index_query_count.inc();
         }
     }
 
@@ -410,16 +398,6 @@ public class EdgeIndexService extends Service {
 
     }
 
-    public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
-                                         int queryDepth, int minHitCount, int maxResults) {
-        if (!indexes.isValidBucket(bucket)) {
-            logger.warn("Invalid bucket {}", bucket);
-            return LongStream.empty();
-        }
-
-        return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
-    }
-
     private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
                                 LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
         if (!indexes.isValidBucket(bucket)) {
@@ -9,7 +9,6 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient;
 import nu.marginalia.wmsa.client.HttpStatusCode;
 import nu.marginalia.wmsa.configuration.ServiceDescriptor;
 import nu.marginalia.wmsa.configuration.server.Context;
-import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeId;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
@@ -18,6 +17,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
 import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
 import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
+import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -37,13 +37,27 @@ public class EdgeIndexClient extends AbstractDynamicClient {
     }
 
     @CheckReturnValue
-    public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
+    public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
                                                EdgePageWordSet wordSet, int writer
                                                )
     {
-        EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);
-
-        return this.post(ctx, "/words/", request);
+        var keywordBuilder =
+                IndexPutKeywordsReq.newBuilder()
+                        .setDomain(domain.id())
+                        .setUrl(url.id())
+                        .setIndex(writer);
+
+        for (var set : wordSet.wordSets.values()) {
+            var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
+            wordSetBuilder.setIndex(set.block.ordinal());
+            wordSetBuilder.addAllWords(set.words);
+            keywordBuilder.addWordSet(wordSetBuilder.build());
+        }
+
+        var req = keywordBuilder.build();
+
+        return this.post(ctx, "/words/", req);
     }
 
 
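The .proto definition behind IndexPutKeywordsReq is not part of this diff, but the generated API it implies can be exercised end to end. A hedged round-trip sketch using only accessors that appear in this commit (checked exception handling elided):

    // Build the request as EdgeIndexClient.putWords does, then parse it back
    // as EdgeIndexService.putWords does on the receiving side.
    IndexPutKeywordsReq req = IndexPutKeywordsReq.newBuilder()
            .setDomain(1).setUrl(2).setIndex(0)
            .addWordSet(IndexPutKeywordsReq.WordSet.newBuilder()
                    .setIndex(IndexBlock.Words_1.ordinal())
                    .addAllWords(List.of("hello", "world"))
                    .build())
            .build();

    IndexPutKeywordsReq parsed = IndexPutKeywordsReq.parseFrom(req.toByteArray());
    assert parsed.getWordSetCount() == 1;
    assert parsed.getWordSet(0).getWordsList().equals(List.of("hello", "world"));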
@@ -15,6 +15,7 @@ import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;
+import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeUnit;
 
 public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
@@ -36,6 +37,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
 
         byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);
 
+        new Thread(this::journalWriterThread, "Journal Writer").start();
+
         writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
         Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
     }
@@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
         }
     }
 
+    private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
+    private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);
+
     @Override
     @SneakyThrows
-    public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
-
-        byteBuffer.clear();
-
-        byteBuffer.putInt(entryData.size());
-        byteBuffer.putInt(header.block().id);
-        byteBuffer.putLong(header.documentId());
-
-        entryData.write(byteBuffer);
-
-        byteBuffer.limit(byteBuffer.position());
-        byteBuffer.rewind();
-
-        while (byteBuffer.position() < byteBuffer.limit())
-            channel.write(byteBuffer);
-
-        writePositionMarker();
+    public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
+        writeQueue.put(new WriteJob(header, entryData));
+    }
+
+    @SneakyThrows
+    public void journalWriterThread() {
+
+        while (true) {
+            var job = writeQueue.take();
+
+            writeEntry(job.header, job.entryData);
+        }
+    }
+
+    private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
+        try {
+            byteBuffer.clear();
+
+            byteBuffer.putInt(entryData.size());
+            byteBuffer.putInt(header.block().id);
+            byteBuffer.putLong(header.documentId());
+
+            entryData.write(byteBuffer);
+
+            byteBuffer.limit(byteBuffer.position());
+            byteBuffer.rewind();
+
+            while (byteBuffer.position() < byteBuffer.limit())
+                channel.write(byteBuffer);
+
+            writePositionMarker();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
     }
 
     @Override
@@ -94,13 +117,11 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
     }
 
     private void writePositionMarker() throws IOException {
-        var lock = channel.lock(0, 16, false);
         pos = channel.size();
         raf.seek(0);
         raf.writeLong(pos);
         raf.writeLong(dictionaryWriter.size());
         raf.seek(pos);
-        lock.release();
     }
 
     public synchronized void close() throws IOException {
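The journal writer change above moves file I/O off the calling thread: put() now only enqueues, and the dedicated "Journal Writer" thread drains the bounded queue and performs the synchronized buffer work. The shape of that pattern in isolation (a generic sketch, not repo code):

    import java.util.concurrent.LinkedBlockingQueue;

    class SingleWriterSketch {
        record Job(long documentId) {}

        // Bounded queue: producers block once 512 jobs are pending, giving backpressure.
        private final LinkedBlockingQueue<Job> queue = new LinkedBlockingQueue<>(512);

        SingleWriterSketch() {
            new Thread(this::drain, "Journal Writer").start();
        }

        void put(Job job) throws InterruptedException {
            queue.put(job);              // cheap for callers; no I/O on their thread
        }

        private void drain() {
            try {
                while (true) {
                    Job job = queue.take();  // blocks until work arrives
                    // ... the synchronized write to the channel would happen here ...
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }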
@@ -5,16 +5,16 @@ import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeId;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
 
 @AllArgsConstructor @Getter
 @ToString
 public class EdgePutWordsRequest {
-    public final EdgeId<EdgeDomain> domainId;
-    public final EdgeId<EdgeUrl> urlId;
-    public final double quality;
+    public EdgeId<EdgeDomain> domainId;
+    public EdgeId<EdgeUrl> urlId;
+    public double quality;
 
-    public final EdgePageWordSet wordSet;
+    public EdgePageWordSet wordSet;
     private int index = 0;
 }
|
@ -5,14 +5,18 @@ public enum IndexBlock {
|
|||||||
Title(1, 1),
|
Title(1, 1),
|
||||||
Link(2, 1.25),
|
Link(2, 1.25),
|
||||||
Top(3, 2),
|
Top(3, 2),
|
||||||
Middle(4, 3),
|
Middle(4, 2.5),
|
||||||
Low(5, 4),
|
Low(5, 3.0),
|
||||||
Words(6, 6),
|
Words_1(6, 3.0),
|
||||||
Meta(7, 7),
|
Meta(7, 7),
|
||||||
PositionWords(8, 4.5),
|
Words_2(8, 3.5),
|
||||||
NamesWords(9, 5),
|
NamesWords(9, 5),
|
||||||
Artifacts(10, 10),
|
Artifacts(10, 10),
|
||||||
Topic(11, 0.5);
|
Topic(11, 0.5),
|
||||||
|
Words_4(12, 4.0),
|
||||||
|
Words_8(13, 4.5),
|
||||||
|
Words_16Plus(14, 7.0),
|
||||||
|
;
|
||||||
|
|
||||||
public final int id;
|
public final int id;
|
||||||
public final double sortOrder;
|
public final double sortOrder;
|
||||||
|
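With this commit the enum's order becomes part of the wire protocol: EdgeIndexClient sends set.block.ordinal() and EdgeIndexService decodes it with IndexBlock.values()[...]. Appending Words_4/Words_8/Words_16Plus at the tail keeps existing ordinals stable, and Words_1/Words_2 reuse the slots of the removed Words/PositionWords. A hedged sketch of a defensive decode:

    // Mirrors the decode in EdgeIndexService.putWords; a raw values()[i] would throw
    // ArrayIndexOutOfBoundsException if a newer client sends an unknown ordinal.
    static IndexBlock decode(int wireOrdinal) {
        IndexBlock[] blocks = IndexBlock.values();
        if (wireOrdinal < 0 || wireOrdinal >= blocks.length)
            throw new IllegalArgumentException("Unknown IndexBlock ordinal: " + wireOrdinal);
        return blocks[wireOrdinal];
    }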
@@ -29,8 +29,12 @@ public class SearchIndexReader implements AutoCloseable {
             IndexBlock.Top,
             IndexBlock.Middle,
             IndexBlock.Low,
-            IndexBlock.Words,
             IndexBlock.NamesWords,
+            IndexBlock.Words_1,
+            IndexBlock.Words_2,
+            IndexBlock.Words_4,
+            IndexBlock.Words_8,
+            IndexBlock.Words_16Plus,
     };
 
     @Inject
@@ -44,24 +48,29 @@ public class SearchIndexReader implements AutoCloseable {
         var linkIndex = indices.get(IndexBlock.Link);
         var titleIndex = indices.get(IndexBlock.Title);
         var namesIndex = indices.get(IndexBlock.NamesWords);
-        var positionIndex = indices.get(IndexBlock.PositionWords);
         var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
-        var wordsIndex = indices.get(IndexBlock.Words);
         var metaIndex = indices.get(IndexBlock.Meta);
         var topicIndex = indices.get(IndexBlock.Topic);
 
+        var words1 = indices.get(IndexBlock.Words_1);
+        var words2 = indices.get(IndexBlock.Words_2);
+        var words4 = indices.get(IndexBlock.Words_4);
+        var words8 = indices.get(IndexBlock.Words_8);
+        var words16 = indices.get(IndexBlock.Words_16Plus);
+        var artifacts = indices.get(IndexBlock.Artifacts);
+
         queryBuilders = new EnumMap<>(IndexBlock.class);
         underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
 
-        queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
-        queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
+        queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1));
+        queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1));
+        queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1));
+        queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1));
+        queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));
 
-        underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
-        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
-        underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
+        underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
+        underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
     }
 
     @SafeVarargs
@@ -157,7 +166,7 @@ public class SearchIndexReader implements AutoCloseable {
                 return block;
             }
         }
-        return IndexBlock.Words;
+        return IndexBlock.Words_1;
     }
 
     public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {
@@ -27,7 +27,8 @@ public class IndexQueryBuilder {
 
     public Query build(IndexSearchBudget budget,
                        LongPredicate filter,
-                       int wordId) {
+                       int wordId)
+    {
         return new QueryForIndices(budget, filter, wordId);
     }
 
@@ -1,13 +1,13 @@
 package nu.marginalia.wmsa.edge.integration.stackoverflow;
 
 import com.google.inject.Inject;
-import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
 import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
-import nu.marginalia.wmsa.edge.model.*;
+import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.Jsoup;
@@ -46,8 +46,8 @@ public class StackOverflowPostProcessor {
         var keywords = documentKeywordExtractor.extractKeywords(dld);
 
         keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
-        keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
-        keywords.get(IndexBlock.Words).addJust("special:wikipedia");
+        keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
+        keywords.get(IndexBlock.Words_1).addJust("special:wikipedia");
         keywords.get(IndexBlock.Meta).addJust("special:wikipedia");
         keywords.get(IndexBlock.Meta).addJust("js:true");
 
@@ -1,13 +1,13 @@
 package nu.marginalia.wmsa.edge.integration.wikipedia;
 
-import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.SentenceExtractor;
+import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
 import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
-import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
+import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
 import org.apache.commons.lang3.StringUtils;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@@ -42,8 +42,8 @@ public class WikipediaProcessor {
         var keywords = documentKeywordExtractor.extractKeywords(dld);
 
         keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
-        keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
-        keywords.get(IndexBlock.Words).addJust("special:stackoverflow");
+        keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
+        keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow");
        keywords.get(IndexBlock.Meta).addJust("special:stackoverflow");
        keywords.get(IndexBlock.Meta).addJust("js:true");
 
@@ -1,13 +1,15 @@
 package nu.marginalia.wmsa.edge.model.crawl;
 
+import com.dslplatform.json.JsonObject;
+import com.dslplatform.json.JsonWriter;
 import lombok.Data;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 
 import java.util.*;
 
 @Data
-public class EdgePageWordSet {
-    public final Map<IndexBlock, EdgePageWords> wordSets;
+public class EdgePageWordSet implements JsonObject {
+    public Map<IndexBlock, EdgePageWords> wordSets;
 
     public EdgePageWordSet(EdgePageWords... words) {
         wordSets = new EnumMap<>(IndexBlock.class);
@@ -45,4 +47,18 @@ public class EdgePageWordSet {
         });
         return sj.toString();
     }
+
+    @Override
+    public void serialize(JsonWriter writer, boolean minimal) {
+        writer.writeAscii("[");
+        boolean first = false;
+        for (var w : wordSets.values()) {
+            if (!first) first = true;
+            else writer.writeAscii(", ");
+
+            w.serialize(writer, minimal);
+        }
+        writer.writeAscii("]}");
+
+    }
 }
@@ -1,4 +1,7 @@
 package nu.marginalia.wmsa.edge.model.crawl;
+import com.dslplatform.json.JsonObject;
+import com.dslplatform.json.JsonWriter;
+import com.dslplatform.json.NumberConverter;
 import lombok.Getter;
 import lombok.ToString;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
@@ -8,7 +11,7 @@ import java.util.Collection;
 import java.util.List;
 
 @ToString @Getter
-public class EdgePageWords {
+public class EdgePageWords implements JsonObject {
     public final IndexBlock block;
     public final List<String> words = new ArrayList<>();
 
@@ -31,4 +34,19 @@ public class EdgePageWords {
         return words.size();
     }
     public void addJust(String word) { words.add(word); }
+
+    @Override
+    public void serialize(JsonWriter writer, boolean minimal) {
+        writer.writeAscii("{\"b\":");
+        NumberConverter.serialize(block.ordinal(), writer);
+        writer.writeAscii(", \"w\": [");
+        boolean first = false;
+        for (var word : words) {
+            if (!first) first = true;
+            else { writer.writeAscii(","); }
+
+            writer.writeString(word);
+        }
+        writer.writeAscii("]}");
+    }
 }
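EdgePageWords and EdgePageWordSet now implement dsl-json's JsonObject, so each instance writes its own JSON: a words object comes out as {"b": <block ordinal>, "w": [...]}. (Note the set-level serialize opens with "[" but closes with "]}", which looks like an unbalanced pair worth checking against the consumer.) A hedged sketch of driving the serializer (the EdgePageWords constructor shape is assumed, and the checked IOException is elided):

    import com.dslplatform.json.DslJson;
    import java.io.ByteArrayOutputStream;

    DslJson<Object> dslJson = new DslJson<>();
    ByteArrayOutputStream out = new ByteArrayOutputStream();

    EdgePageWords words = new EdgePageWords(IndexBlock.Words_1);  // constructor assumed
    words.addJust("example");

    dslJson.serialize(words, out);   // JsonObject instances serialize themselves
    // out now contains: {"b":6, "w": ["example"]}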
|
@@ -10,25 +10,31 @@ import java.util.stream.Collectors;

 public enum EdgeSearchProfile {
     DEFAULT("default",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
+                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
+            ),
             0, 1),
     MODERN("modern",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
+                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
+            ),
             2),
     CORPO("corpo",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
-            4, 5, 6, 7),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
+                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
+            4, 5, 7),
     YOLO("yolo",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
+                    IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
             0, 2, 1, 3, 4, 6),
     CORPO_CLEAN("corpo-clean",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
             4, 5),
     ACADEMIA("academia",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
             3),
     FOOD("food",
-            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
+            List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
             2, 0),
     ;
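Across the profiles, the catch-all IndexBlock.Words is replaced by five tiers, Words_1 through Words_16Plus. The bucketing rule itself is not part of this diff; the power-of-two naming suggests keywords are grouped by count, so a selector might look like the following purely hypothetical sketch:

// Illustrative assumption only: this commit does not show how the new
// Words_1..Words_16Plus tiers are populated, just that they exist.
static IndexBlock wordsBlockForCount(int nWords) {
    if (nWords < 2)  return IndexBlock.Words_1;
    if (nWords < 4)  return IndexBlock.Words_2;
    if (nWords < 8)  return IndexBlock.Words_4;
    if (nWords < 16) return IndexBlock.Words_8;
    return IndexBlock.Words_16Plus;
}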
@@ -62,7 +62,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
         DecoratedSearchResultSet resultSet;
         Path screenshotPath = null;
         if (null != domain) {
-            resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain);
+            resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);

             screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
         }
@@ -30,7 +30,7 @@ public class SearchResultValuator {
         EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);

         if (scores.length == 0) {
-            return IndexBlock.Words.sortOrder;
+            return IndexBlock.Words_1.sortOrder;
         }

         final double[] weights = getTermWeights(scores);
@@ -51,7 +51,7 @@ public class FeaturesLoaderTool {
                 throw new RuntimeException(ex);
             }

-            client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
+            client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0)
                     .blockingSubscribe();
         });
@@ -141,7 +141,7 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
            EC_PAGE_DATA.FEATURES AS FEATURES,

            EC_DOMAIN.IP AS IP,
-           EC_DOMAIN.STATE AS STATE,
+           EC_URL.STATE AS STATE,
            EC_DOMAIN.RANK AS RANK,
            EC_DOMAIN.STATE AS DOMAIN_STATE
        FROM EC_URL
@@ -0,0 +1,12 @@
+package nu.marginalia.wmsa.edge.converting.processor.logic;
+
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+
+class DomPrunerTest {
+    @Test
+    public void test() throws IOException {
+
+    }
+}
@@ -1,10 +1,8 @@
 package nu.marginalia.wmsa.edge.crawling;

-import com.zaxxer.hikari.HikariConfig;
-import com.zaxxer.hikari.HikariDataSource;
 import lombok.SneakyThrows;
 import nu.marginalia.util.TestLanguageModels;
-import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
+import nu.marginalia.util.language.WordPatterns;
 import nu.marginalia.util.language.conf.LanguageModels;
 import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
 import nu.marginalia.util.language.processing.KeywordExtractor;
@@ -12,11 +10,9 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
 import nu.marginalia.util.language.processing.model.WordRep;
 import nu.marginalia.util.language.processing.model.WordSpan;
 import nu.marginalia.util.language.processing.model.tag.WordSeparator;
-import nu.marginalia.util.ranking.BuggyReversePageRank;
-import nu.marginalia.util.ranking.BuggyStandardPageRank;
+import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
 import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
-import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Disabled;
@@ -103,6 +99,11 @@ class SentenceExtractorTest {
         });
         reader.join();
     }

+    @Test
+    public void testPattern() {
+        System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
+    }
     @Test
     void extractSentences() throws IOException {
         var data = Path.of("/home/vlofgren/Code/tmp-data/");
@@ -11,12 +11,15 @@ import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
 import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
 import nu.marginalia.wmsa.edge.index.model.IndexBlock;
 import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
-import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
 import nu.marginalia.wmsa.edge.model.EdgeId;
-import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
-import org.junit.jupiter.api.*;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
+import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
+import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.parallel.Execution;
 import org.junit.jupiter.api.parallel.ExecutionMode;
 import org.junit.jupiter.api.parallel.ResourceAccessMode;
@@ -141,7 +144,7 @@ public class EdgeIndexClientTest {
     void putWords(int didx, int idx, double quality, String... words) {
         EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
         epw.addAll(Arrays.asList(words));
-        client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), quality,
+        client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
                 new EdgePageWordSet(epw), 0).blockingSubscribe();
     }
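This hunk and the FeaturesLoaderTool hunk above reflect the same API change: EdgeIndexClient.putWords no longer takes a quality argument. A sketch of the new call shape, using only what the two call sites show (client, the id values, and the word list are assumed to be in scope):

// Post-change putWords call shape; ids and words are sample values.
EdgePageWords title = new EdgePageWords(IndexBlock.Title);
title.addAll(Arrays.asList("example", "keywords"));

client.putWords(Context.internal(),
                new EdgeId<>(domainId),       // domain id
                new EdgeId<>(urlId),          // url id
                new EdgePageWordSet(title),   // words grouped by index block
                0)                            // trailing index argument, per the call sites
        .blockingSubscribe();                 // Rx: block until the write completes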
@@ -61,7 +61,7 @@ class SearchIndexJournalWriterTest {
     void put() throws IOException {
         writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link),
                 new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 }));
-        writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words),
+        writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1),
                 new SearchIndexJournalEntry(new long[] { 5, 6, 7 }));
         writer.forceWrite();
protocol/build.gradle (new file, 27 lines)
@@ -0,0 +1,27 @@
+plugins {
+    id "com.google.protobuf" version "0.8.19"
+    id "java"
+}
+repositories {
+    gradlePluginPortal()
+}
+protobuf {
+    protoc {
+        artifact = 'com.google.protobuf:protoc:3.0.0'
+    }
+}
+
+sourceSets {
+    main {
+        java {
+            srcDirs 'build/generated/source/proto/main/grpc'
+            srcDirs 'build/generated/source/proto/main/java'
+        }
+    }
+}
+
+dependencies {
+    protobuf files ("def/")
+
+    implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.0.0'
+}
protocol/def/index.proto (new file, 21 lines)
@@ -0,0 +1,21 @@
+syntax = "proto3";
+
+option java_package = "nu.wmsa.wmsa.edge.index.proto";
+option java_outer_classname = "IndexProto";
+option java_multiple_files = true;
+
+message IndexPutKeywordsReq {
+    int32 domain = 1;
+    int32 url = 2;
+    int32 index = 3;
+    repeated WordSet wordSet = 4;
+
+    message WordSet {
+        int32 index = 1;
+        repeated string words = 2;
+    }
+}
+
+message IndexSearchQueryRsp {
+
+}
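Once the protocol module above generates the Java classes, the new message would be assembled with the standard protobuf-java builder API. A hedged sketch (field values are invented; the builder method names follow from the .proto definition and protobuf's codegen conventions):

import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;

// Hypothetical usage of the generated builders for IndexPutKeywordsReq.
class IndexProtoDemo {
    public static void main(String[] args) {
        IndexPutKeywordsReq req = IndexPutKeywordsReq.newBuilder()
                .setDomain(101)          // domain id
                .setUrl(202)             // url id
                .setIndex(0)             // target index
                .addWordSet(IndexPutKeywordsReq.WordSet.newBuilder()
                        .setIndex(0)     // presumably an IndexBlock ordinal
                        .addWords("example")
                        .addWords("keywords"))
                .build();

        System.out.println(req.getSerializedSize() + " bytes on the wire");
    }
}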
@@ -2,3 +2,4 @@ rootProject.name = 'wmsa'

 include 'marginalia_nu'
 include 'third_party'
+include 'protocol'