Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-22 20:48:59 +00:00)

Experimental changes for 22-08/09 update.

Commit 3200c36072 (parent db056be06a)
@@ -58,7 +58,7 @@ jmhJar {
}
dependencies {
implementation project(':third_party')

implementation project(':protocol')

implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
@@ -157,6 +157,9 @@ dependencies {

jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'

implementation 'com.dslplatform:dsl-json:1.9.9'
annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9'
}

configurations {
@@ -1,18 +1,18 @@
package nu.marginalia.util.dict;

import nu.marginalia.util.SeekDictionary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;

public class DictionaryData {

private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);

private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
private final ArrayList<DictionaryDataBank> banks = new ArrayList(100);

public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
@@ -20,12 +20,8 @@ public class DictionaryData {
banks.add(new DictionaryDataBank(0, bankSize));
}

public int size() {
return banks.end();
}

public int add(long key) {
var activeBank = banks.last();
var activeBank = banks.get(banks.size()-1);
int rb = activeBank.add(key);

if (rb == -1) {
@@ -42,10 +38,10 @@ public class DictionaryData {


public long getKey(int offset) {
return banks.bankForOffset(offset).getKey(offset);
return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
}
public boolean keyEquals(int offset, long otherKey) {
return banks.bankForOffset(offset).keyEquals(offset, otherKey);
return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
}

private static class DictionaryDataBank {
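The change above swaps the SeekDictionary wrapper for a plain ArrayList of banks, so an entry's bank is now found by integer division on its offset. A minimal sketch of that arithmetic, with an assumed bank size (the real value is whatever is passed to the DictionaryData constructor):

```java
public class BankOffsetSketch {
    public static void main(String[] args) {
        final int bankSize = 8192; // assumed for illustration only

        int offset = 20_000;
        int bankIndex = offset / bankSize; // 2 -- replaces the SeekDictionary.bankForOffset(offset) lookup

        System.out.println("offset " + offset + " -> bank " + bankIndex);
    }
}
```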
@@ -19,7 +19,12 @@ public class WordPatterns {
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");

public static final Pattern singleWordAdditionalPattern =
Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");

public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();

public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();
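For reference, a small self-contained check of the new singleWordAdditionalPattern; the first input is the same token exercised by the testPattern case added further down in this commit, the second is a hypothetical counter-example:

```java
import java.util.regex.Pattern;

public class SingleWordPatternCheck {
    // Same expression as WordPatterns.singleWordAdditionalPattern above
    private static final Pattern SINGLE_WORD =
            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");

    public static void main(String[] args) {
        // Up to four separator-delimited alphanumeric segments are accepted...
        System.out.println(SINGLE_WORD.matcher("2.6.18164.el5pae").matches()); // true
        // ...but a token with more than four separator groups is rejected.
        System.out.println(SINGLE_WORD.matcher("a.b.c.d.e.f").matches());      // false
    }
}
```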
@@ -8,7 +8,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import org.jetbrains.annotations.NotNull;

import javax.inject.Inject;
import java.util.*;
@@ -45,7 +44,6 @@ public class DocumentKeywordExtractor {
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);

int totalSize = wordsTfIdf.size();

@@ -61,17 +59,6 @@ public class DocumentKeywordExtractor {

var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);

var words = getSimpleWords(documentLanguageData);

for (var w : wordsLongName)
words.add(w.word);
for (var w : lowKeywords)
words.remove(w.word);
for (var w : midKeywords)
words.remove(w.word);
for (var w : topKeywords)
words.remove(w.word);

Collection<String> artifacts = getArtifacts(documentLanguageData);

var wordSet = new EdgePageWordSet(
@@ -85,15 +72,81 @@ public class DocumentKeywordExtractor {
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);

wordSet.append(IndexBlock.Words, words);
getSimpleWords(wordSet, documentLanguageData,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);

return wordSet;
}

private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {

int start = 0;
int lengthGoal = 32;

for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
IndexBlock block = blocks[blockIdx];
Set<String> words = new HashSet<>(lengthGoal+100);

int pos;
int length = 0;
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
length += sent.length();

for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
words.add(w);
}
}
}
}
wordSet.append(block, words);
start = pos;
lengthGoal+=32;
}

if (start < documentLanguageData.sentences.length) {

Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}

Set<String> lastSet;
if (counts.size() < 1024) {
lastSet = counts.keySet();
}
else {
lastSet = counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary

// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
}))
.map(Map.Entry::getKey)
.limit(1024)
.collect(Collectors.toCollection(LinkedHashSet::new));
}

wordSet.append(blocks[blocks.length - 1], lastSet);
}
}

private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();


for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
@@ -138,33 +191,6 @@ public class DocumentKeywordExtractor {
return ret;
}

@NotNull
private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());

for (var sent : documentLanguageData.sentences) {
for (int i = 0; i < sent.length(); i++) {
if (!sent.isStopWord(i)) {
String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}

return counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary

// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
}))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
}


public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));
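To make the new partitioning easier to follow: the rewritten getSimpleWords walks the sentence array once, filling Words_1, Words_2, Words_4 and Words_8 with roughly 32, 64, 96 and 128 words' worth of sentences respectively, after which everything remaining is ranked by the (negated) TF-IDF score and capped at 1024 terms for Words_16Plus. A standalone sketch of just those bucket boundaries, using made-up sentence lengths:

```java
public class WordBlockPartitionSketch {
    public static void main(String[] args) {
        // Hypothetical sentence lengths (in words); the real extractor gets these
        // from DocumentLanguageData via sentence.length().
        int[] sentenceLengths = { 12, 20, 9, 35, 41, 28, 60, 77, 15, 33 };

        int start = 0;
        int lengthGoal = 32;

        // Four fixed blocks (Words_1 .. Words_8), mirroring the loop above.
        for (int block = 1; block <= 4 && start < sentenceLengths.length; block++) {
            int pos = start;
            int length = 0;
            while (pos < sentenceLengths.length && length < lengthGoal) {
                length += sentenceLengths[pos++];
            }
            System.out.printf("Words_%d: sentences [%d, %d), ~%d words%n",
                    block == 4 ? 8 : 1 << (block - 1), start, pos, length);
            start = pos;
            lengthGoal += 32;
        }

        // Anything left over would be TF-IDF ranked and land in Words_16Plus.
        System.out.printf("Words_16Plus: sentences [%d, %d)%n", start, sentenceLengths.length);
    }
}
```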
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.protobuf.GeneratedMessageV3;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
@@ -17,8 +18,6 @@ import org.apache.http.HttpHost;
import org.apache.logging.log4j.ThreadContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
@@ -186,6 +185,31 @@ public abstract class AbstractClient implements AutoCloseable {
.doFinally(() -> ThreadContext.remove("outbound-request"));
}

@SneakyThrows
protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {

ensureAlive();

RequestBody body = RequestBody.create(
MediaType.parse("application/protobuf"),
data.toByteArray());

var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
var call = client.newCall(req);

logInbound(call);
ThreadContext.put("outbound-request", url + endpoint);
try (var rsp = call.execute()) {
logOutbound(rsp);
int code = rsp.code();

return validateStatus(code, req).map(HttpStatusCode::new);
}
finally {
ThreadContext.remove("outbound-request");
}
}


@SneakyThrows
protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {
@@ -76,7 +76,7 @@ public class LinkKeywordLoaderMain {

// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);

indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet(
new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
).blockingSubscribe();
}
@@ -39,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe();
}
}
}
@@ -147,7 +147,10 @@ public class DocumentProcessor {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}

var dld = sentenceExtractor.extractSentences(doc.clone());
DomPruner domPruner = new DomPruner();
Document prunedDoc = doc.clone();
domPruner.prune(prunedDoc, 0.5);
var dld = sentenceExtractor.extractSentences(prunedDoc);

checkDocumentLanguage(dld);

@@ -192,7 +195,7 @@ public class DocumentProcessor {
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);

words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
words.append(IndexBlock.Words_1, tagWords);
}

private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {
@@ -0,0 +1,111 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;

import java.util.HashMap;
import java.util.Map;

public class DomPruner {

public void prune(Document document, double pruneThreshold) {
PruningVisitor pruningVisitor = new PruningVisitor();
document.traverse(pruningVisitor);

pruningVisitor.data.forEach((node, data) -> {
if (data.depth <= 1) {
return;
}
if (data.signalNodeSize == 0) node.remove();
else if (data.noiseNodeSize > 0
&& data.signalRate() < pruneThreshold
&& data.treeSize > 3) {
node.remove();
}
});
}



private static class PruningVisitor implements NodeVisitor {

private final Map<Node, NodeData> data = new HashMap<>();
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);

@Override
public void head(Node node, int depth) {}

@Override
public void tail(Node node, int depth) {
final NodeData dataForNode;

if (node instanceof TextNode tn) {
dataForNode = new NodeData(depth, tn.text().length(), 0);
}
else if (isSignal(node)) {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.add(data.getOrDefault(childNode, dummy));
}
}
else {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
}
}



data.put(node, dataForNode);
}

public boolean isSignal(Node node) {

if (node instanceof Element e) {
if ("a".equalsIgnoreCase(e.tagName()))
return false;
if ("nav".equalsIgnoreCase(e.tagName()))
return false;
if ("footer".equalsIgnoreCase(e.tagName()))
return false;
if ("header".equalsIgnoreCase(e.tagName()))
return false;
}

return true;
}
}

private static class NodeData {
int signalNodeSize;
int noiseNodeSize;
int treeSize = 1;
int depth;

private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
this.depth = depth;
this.signalNodeSize = signalNodeSize;
this.noiseNodeSize = noiseNodeSize;
}

public void add(NodeData other) {
signalNodeSize += other.signalNodeSize;
noiseNodeSize += other.noiseNodeSize;
treeSize += other.treeSize;
}

public void addAsNoise(NodeData other) {
noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
treeSize += other.treeSize;
}


public double signalRate() {
return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
}
}
}
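A minimal usage sketch for the new DomPruner, following the same clone-then-prune pattern the DocumentProcessor change above adopts; the HTML snippet and the 0.5 threshold are illustrative only:

```java
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import nu.marginalia.wmsa.edge.converting.processor.logic.DomPruner;

public class DomPrunerSketch {
    public static void main(String[] args) {
        // Hypothetical page: a link-heavy nav block next to a paragraph of real text.
        String html = """
                <html><body>
                  <nav><a href="/a">a</a> <a href="/b">b</a> <a href="/c">c</a></nav>
                  <article><p>A paragraph of actual page text that should survive pruning.</p></article>
                </body></html>
                """;

        Document doc = Jsoup.parse(html);

        // Same pattern as DocumentProcessor: prune a clone, keep the original intact.
        Document pruned = doc.clone();
        new DomPruner().prune(pruned, 0.5);

        System.out.println(pruned.body().text());
    }
}
```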
@@ -4,11 +4,11 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.google.protobuf.InvalidProtocolBufferException;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
@@ -22,18 +22,16 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.apache.http.HttpStatus;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@@ -52,7 +50,7 @@ import static spark.Spark.get;
import static spark.Spark.halt;

public class EdgeIndexService extends Service {
private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;

private final Logger logger = LoggerFactory.getLogger(getClass());

@@ -66,11 +64,9 @@ public class EdgeIndexService extends Service {
.create();

private static final Histogram wmsa_edge_index_query_time
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
private static final Counter wmsa_edge_index_query_count
= Counter.build().name("wmsa_edge_index_query_count").help("-").register();
private static final Histogram wmsa_edge_index_put_words_time
= Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
= Histogram.build().name("wmsa_edge_index_query_time")
.linearBuckets(50, 50, 15)
.help("-").register();

public static final int DYNAMIC_BUCKET_LENGTH = 7;

@@ -162,12 +158,15 @@ public class EdgeIndexService extends Service {
indexes.initialize(init);
}

private Object putWords(Request request, Response response) {
var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());

synchronized (this) {
putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
putWordsRequest.wordSet, putWordsRequest.getIndex());
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
int idx = req.getIndex();

for (int ws = 0; ws < req.getWordSetCount(); ws++) {
putWords(domainId, urlId, req.getWordSet(ws), idx);
}

response.status(HttpStatus.SC_ACCEPTED);
@@ -175,26 +174,16 @@ public class EdgeIndexService extends Service {
}

public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWordSet wordSet, int idx
) {

wmsa_edge_index_put_words_time.time(() -> {
for (EdgePageWords words : wordSet.values()) {
putWords(domainId, urlId, words, idx);
}
});

}

public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWords words, int idx
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);

for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
IndexBlock block = IndexBlock.values()[words.getIndex()];

for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {

var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);

indexWriter.put(header, entry);
};
@@ -257,7 +246,6 @@ public class EdgeIndexService extends Service {
}
finally {
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
wmsa_edge_index_query_count.inc();
}
}

@@ -410,16 +398,6 @@ public class EdgeIndexService extends Service {

}

public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
int queryDepth, int minHitCount, int maxResults) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return LongStream.empty();
}

return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
}

private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {
@@ -9,7 +9,6 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
@@ -18,6 +17,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -37,13 +37,27 @@ public class EdgeIndexClient extends AbstractDynamicClient {
}

@CheckReturnValue
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
EdgePageWordSet wordSet, int writer
)
{
EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);

return this.post(ctx, "/words/", request);
var keywordBuilder =
IndexPutKeywordsReq.newBuilder()
.setDomain(domain.id())
.setUrl(url.id())
.setIndex(writer);

for (var set : wordSet.wordSets.values()) {
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
wordSetBuilder.setIndex(set.block.ordinal());
wordSetBuilder.addAllWords(set.words);
keywordBuilder.addWordSet(wordSetBuilder.build());
}

var req = keywordBuilder.build();

return this.post(ctx, "/words/", req);
}

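A brief sketch of what a call against the new client signature looks like (compare the updated test further down); it is a fragment that assumes an already-constructed EdgeIndexClient and placeholder domain/url ids:

```java
// Assumes: EdgeIndexClient client, int domainId, int urlId are available in scope.
EdgePageWords titleWords = new EdgePageWords(IndexBlock.Title);
titleWords.addAll(Arrays.asList("marginalia", "search"));

client.putWords(Context.internal(),
                new EdgeId<>(domainId),
                new EdgeId<>(urlId),
                new EdgePageWordSet(titleWords),
                0 /* writer index */)
      .blockingSubscribe();
```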
@@ -15,6 +15,7 @@ import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
@@ -36,6 +37,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {

byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);

new Thread(this::journalWriterThread, "Journal Writer").start();

writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
}
@@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}
}

private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);

@Override
@SneakyThrows
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
writeQueue.put(new WriteJob(header, entryData));
}

byteBuffer.clear();
@SneakyThrows
public void journalWriterThread() {

byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
while (true) {
var job = writeQueue.take();

entryData.write(byteBuffer);
writeEntry(job.header, job.entryData);
}
}
private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {

byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
try {
byteBuffer.clear();

while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());

writePositionMarker();
entryData.write(byteBuffer);

byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();

while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);

writePositionMarker();
} catch (IOException e) {
throw new RuntimeException(e);
}
}

@Override
@@ -94,13 +117,11 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}

private void writePositionMarker() throws IOException {
var lock = channel.lock(0, 16, false);
pos = channel.size();
raf.seek(0);
raf.writeLong(pos);
raf.writeLong(dictionaryWriter.size());
raf.seek(pos);
lock.release();
}

public synchronized void close() throws IOException {
@@ -5,16 +5,16 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;

@AllArgsConstructor @Getter
@ToString
public class EdgePutWordsRequest {
public final EdgeId<EdgeDomain> domainId;
public final EdgeId<EdgeUrl> urlId;
public final double quality;
public EdgeId<EdgeDomain> domainId;
public EdgeId<EdgeUrl> urlId;
public double quality;

public final EdgePageWordSet wordSet;
public EdgePageWordSet wordSet;
private int index = 0;
}
@@ -5,14 +5,18 @@ public enum IndexBlock {
Title(1, 1),
Link(2, 1.25),
Top(3, 2),
Middle(4, 3),
Low(5, 4),
Words(6, 6),
Middle(4, 2.5),
Low(5, 3.0),
Words_1(6, 3.0),
Meta(7, 7),
PositionWords(8, 4.5),
Words_2(8, 3.5),
NamesWords(9, 5),
Artifacts(10, 10),
Topic(11, 0.5);
Topic(11, 0.5),
Words_4(12, 4.0),
Words_8(13, 4.5),
Words_16Plus(14, 7.0),
;

public final int id;
public final double sortOrder;
@@ -29,8 +29,12 @@ public class SearchIndexReader implements AutoCloseable {
IndexBlock.Top,
IndexBlock.Middle,
IndexBlock.Low,
IndexBlock.Words,
IndexBlock.NamesWords,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
IndexBlock.Words_16Plus,
};

@Inject
@@ -44,24 +48,29 @@ public class SearchIndexReader implements AutoCloseable {
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var namesIndex = indices.get(IndexBlock.NamesWords);
var positionIndex = indices.get(IndexBlock.PositionWords);
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
var wordsIndex = indices.get(IndexBlock.Words);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Topic);

var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
var words4 = indices.get(IndexBlock.Words_4);
var words8 = indices.get(IndexBlock.Words_8);
var words16 = indices.get(IndexBlock.Words_16Plus);
var artifacts = indices.get(IndexBlock.Artifacts);

queryBuilders = new EnumMap<>(IndexBlock.class);
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);

queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1));
queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1));
queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1));
queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));

underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
}

@SafeVarargs
@@ -157,7 +166,7 @@ public class SearchIndexReader implements AutoCloseable {
return block;
}
}
return IndexBlock.Words;
return IndexBlock.Words_1;
}

public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {
@@ -27,7 +27,8 @@ public class IndexQueryBuilder {

public Query build(IndexSearchBudget budget,
LongPredicate filter,
int wordId) {
int wordId)
{
return new QueryForIndices(budget, filter, wordId);
}

@@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.stackoverflow;

import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
@@ -46,8 +46,8 @@ public class StackOverflowPostProcessor {
var keywords = documentKeywordExtractor.extractKeywords(dld);

keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("special:wikipedia");
keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words_1).addJust("special:wikipedia");
keywords.get(IndexBlock.Meta).addJust("special:wikipedia");
keywords.get(IndexBlock.Meta).addJust("js:true");

@@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;

import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@@ -42,8 +42,8 @@ public class WikipediaProcessor {
var keywords = documentKeywordExtractor.extractKeywords(dld);

keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("special:stackoverflow");
keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow");
keywords.get(IndexBlock.Meta).addJust("special:stackoverflow");
keywords.get(IndexBlock.Meta).addJust("js:true");

@@ -1,13 +1,15 @@
package nu.marginalia.wmsa.edge.model.crawl;

import com.dslplatform.json.JsonObject;
import com.dslplatform.json.JsonWriter;
import lombok.Data;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;

import java.util.*;

@Data
public class EdgePageWordSet {
public final Map<IndexBlock, EdgePageWords> wordSets;
public class EdgePageWordSet implements JsonObject {
public Map<IndexBlock, EdgePageWords> wordSets;

public EdgePageWordSet(EdgePageWords... words) {
wordSets = new EnumMap<>(IndexBlock.class);
@@ -45,4 +47,18 @@ public class EdgePageWordSet {
});
return sj.toString();
}

@Override
public void serialize(JsonWriter writer, boolean minimal) {
writer.writeAscii("[");
boolean first = false;
for (var w : wordSets.values()) {
if (!first) first = true;
else writer.writeAscii(", ");

w.serialize(writer, minimal);
}
writer.writeAscii("]}");

}
}
@@ -1,4 +1,7 @@
package nu.marginalia.wmsa.edge.model.crawl;
import com.dslplatform.json.JsonObject;
import com.dslplatform.json.JsonWriter;
import com.dslplatform.json.NumberConverter;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
@@ -8,7 +11,7 @@ import java.util.Collection;
import java.util.List;

@ToString @Getter
public class EdgePageWords {
public class EdgePageWords implements JsonObject {
public final IndexBlock block;
public final List<String> words = new ArrayList<>();

@@ -31,4 +34,19 @@ public class EdgePageWords {
return words.size();
}
public void addJust(String word) { words.add(word); }

@Override
public void serialize(JsonWriter writer, boolean minimal) {
writer.writeAscii("{\"b\":");
NumberConverter.serialize(block.ordinal(), writer);
writer.writeAscii(", \"w\": [");
boolean first = false;
for (var word : words) {
if (!first) first = true;
else { writer.writeAscii(","); }

writer.writeString(word);
}
writer.writeAscii("]}");
}
}
@@ -10,25 +10,31 @@ import java.util.stream.Collectors;

public enum EdgeSearchProfile {
DEFAULT("default",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
0, 1),
MODERN("modern",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
2),
CORPO("corpo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5, 6, 7),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
4, 5, 7),
YOLO("yolo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
4, 5),
ACADEMIA("academia",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
3),
FOOD("food",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
2, 0),
;

@@ -62,7 +62,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
DecoratedSearchResultSet resultSet;
Path screenshotPath = null;
if (null != domain) {
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain);
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);

screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
}
@@ -30,7 +30,7 @@ public class SearchResultValuator {
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);

if (scores.length == 0) {
return IndexBlock.Words.sortOrder;
return IndexBlock.Words_1.sortOrder;
}

final double[] weights = getTermWeights(scores);
@@ -51,7 +51,7 @@ public class FeaturesLoaderTool {
throw new RuntimeException(ex);
}

client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0)
.blockingSubscribe();
});

@@ -141,7 +141,7 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
EC_PAGE_DATA.FEATURES AS FEATURES,

EC_DOMAIN.IP AS IP,
EC_DOMAIN.STATE AS STATE,
EC_URL.STATE AS STATE,
EC_DOMAIN.RANK AS RANK,
EC_DOMAIN.STATE AS DOMAIN_STATE
FROM EC_URL
@@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;

import org.junit.jupiter.api.Test;

import java.io.IOException;

class DomPrunerTest {
@Test
public void test() throws IOException {

}
}
@@ -1,10 +1,8 @@
package nu.marginalia.wmsa.edge.crawling;

import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.KeywordExtractor;
@@ -12,11 +10,9 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.ranking.BuggyReversePageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@@ -103,6 +99,11 @@ class SentenceExtractorTest {
});
reader.join();
}

@Test
public void testPattern() {
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
}
@Test
void extractSentences() throws IOException {
var data = Path.of("/home/vlofgren/Code/tmp-data/");
@@ -11,12 +11,15 @@ import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
@@ -141,7 +144,7 @@ public class EdgeIndexClientTest {
void putWords(int didx, int idx, double quality, String... words) {
EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
epw.addAll(Arrays.asList(words));
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), quality,
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
new EdgePageWordSet(epw), 0).blockingSubscribe();
}

@@ -61,7 +61,7 @@ class SearchIndexJournalWriterTest {
void put() throws IOException {
writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link),
new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 }));
writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words),
writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1),
new SearchIndexJournalEntry(new long[] { 5, 6, 7 }));
writer.forceWrite();

protocol/build.gradle (new file, 27 lines)
@@ -0,0 +1,27 @@
plugins {
id "com.google.protobuf" version "0.8.19"
id "java"
}
repositories {
gradlePluginPortal()
}
protobuf {
protoc {
artifact = 'com.google.protobuf:protoc:3.0.0'
}
}

sourceSets {
main {
java {
srcDirs 'build/generated/source/proto/main/grpc'
srcDirs 'build/generated/source/proto/main/java'
}
}
}

dependencies {
protobuf files ("def/")

implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.0.0'
}
protocol/def/index.proto (new file, 21 lines)
@@ -0,0 +1,21 @@
syntax = "proto3";

option java_package = "nu.wmsa.wmsa.edge.index.proto";
option java_outer_classname = "IndexProto";
option java_multiple_files = true;

message IndexPutKeywordsReq {
int32 domain = 1;
int32 url = 2;
int32 index = 3;
repeated WordSet wordSet = 4;

message WordSet {
int32 index = 1;
repeated string words = 2;
}
}

message IndexSearchQueryRsp {

}
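A minimal sketch of how the generated message is assembled on the client side, mirroring the builder code in EdgeIndexClient above; it assumes the classes generated from this proto are on the classpath, and the domain/url/index values are placeholders:

```java
import java.util.List;

import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;

public class IndexPutKeywordsReqSketch {
    public static void main(String[] args) {
        var wordSet = IndexPutKeywordsReq.WordSet.newBuilder()
                .setIndex(6)                      // IndexBlock ordinal of the word block (placeholder value)
                .addAllWords(List.of("marginalia", "search"))
                .build();

        var req = IndexPutKeywordsReq.newBuilder()
                .setDomain(1)                     // placeholder EdgeId<EdgeDomain> value
                .setUrl(2)                        // placeholder EdgeId<EdgeUrl> value
                .setIndex(0)                      // which index writer to target
                .addWordSet(wordSet)
                .build();

        // The server walks the word sets much like EdgeIndexService.putWords does.
        System.out.println(req.getWordSetCount() + " word set(s), block " + req.getWordSet(0).getIndex());
    }
}
```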
@@ -1,4 +1,5 @@
rootProject.name = 'wmsa'

include 'marginalia_nu'
include 'third_party'
include 'third_party'
include 'protocol'