Experimental changes for 22-08/09 update.

vlofgren 2022-08-26 16:08:46 +02:00
parent db056be06a
commit 3200c36072
32 changed files with 475 additions and 175 deletions

View File

@ -58,7 +58,7 @@ jmhJar {
}
dependencies {
implementation project(':third_party')
implementation project(':protocol')
implementation 'org.projectlombok:lombok:1.18.24'
annotationProcessor 'org.projectlombok:lombok:1.18.24'
@ -157,6 +157,9 @@ dependencies {
jmh 'org.openjdk.jmh:jmh-core:1.35'
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
implementation 'com.dslplatform:dsl-json:1.9.9'
annotationProcessor 'com.dslplatform:dsl-json-processor:1.9.9'
}
configurations {

View File

@ -1,18 +1,18 @@
package nu.marginalia.util.dict;
import nu.marginalia.util.SeekDictionary;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer;
import java.nio.LongBuffer;
import java.util.ArrayList;
public class DictionaryData {
private final int DICTIONARY_BANK_SIZE;
private static final Logger logger = LoggerFactory.getLogger(DictionaryData.class);
private final SeekDictionary<DictionaryDataBank> banks = SeekDictionary.of(DictionaryDataBank::getSize);
private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);
public DictionaryData(int bankSize) {
DICTIONARY_BANK_SIZE = bankSize;
@ -20,12 +20,8 @@ public class DictionaryData {
banks.add(new DictionaryDataBank(0, bankSize));
}
public int size() {
return banks.end();
}
public int add(long key) {
var activeBank = banks.last();
var activeBank = banks.get(banks.size()-1);
int rb = activeBank.add(key);
if (rb == -1) {
@ -42,10 +38,10 @@ public class DictionaryData {
public long getKey(int offset) {
return banks.bankForOffset(offset).getKey(offset);
return banks.get(offset/DICTIONARY_BANK_SIZE).getKey(offset);
}
public boolean keyEquals(int offset, long otherKey) {
return banks.bankForOffset(offset).keyEquals(offset, otherKey);
return banks.get(offset/DICTIONARY_BANK_SIZE).keyEquals(offset, otherKey);
}
private static class DictionaryDataBank {

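For context, a minimal sketch of the offset-to-bank arithmetic the rewritten accessors rely on (the bank size and offset here are hypothetical values, not taken from the commit):

public class BankOffsetDemo {
    public static void main(String[] args) {
        final int bankSize = 8192;     // stands in for DICTIONARY_BANK_SIZE
        int offset = 20000;            // a global offset into the dictionary
        int bank = offset / bankSize;  // index into the banks ArrayList
        System.out.println("offset " + offset + " resolves to bank " + bank);
    }
}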
View File

@ -19,7 +19,12 @@ public class WordPatterns {
public static final Pattern wordBreakPattern = Pattern.compile("([^_#@.a-zA-Z'+\\-0-9\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+)|[|]|(\\.(\\s+|$))");
public static final Pattern characterNoisePattern = Pattern.compile("^[/+\\-]+$");
public static final Pattern singleWordAdditionalPattern =
Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");
public static final Predicate<String> singleWordQualitiesPredicate = singleWordAdditionalPattern.asMatchPredicate();
public static final Predicate<String> wordQualitiesPredicate = wordPattern.asMatchPredicate();
public static final Predicate<String> wordAppendixPredicate = wordAppendixPattern.asMatchPredicate();
public static final Predicate<String> wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate);
public static final Predicate<String> characterNoisePredicate = characterNoisePattern.asMatchPredicate();

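To make the new single-word rule concrete, a minimal standalone check; the pattern is copied from singleWordAdditionalPattern above, and the sample tokens are illustrative (the version-string case also appears in the test added later in this commit):

import java.util.regex.Pattern;

public class SingleWordPatternDemo {
    static final Pattern singleWord =
            Pattern.compile("[\\da-zA-Z]{1,15}([.\\-_/:][\\da-zA-Z]{1,10}){0,4}");

    public static void main(String[] args) {
        System.out.println(singleWord.matcher("2.6.18164.el5pae").matches()); // true
        System.out.println(singleWord.matcher("hello").matches());            // true
        System.out.println(singleWord.matcher("!!!").matches());              // false
    }
}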
View File

@ -8,7 +8,6 @@ import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import org.jetbrains.annotations.NotNull;
import javax.inject.Inject;
import java.util.*;
@ -45,7 +44,6 @@ public class DocumentKeywordExtractor {
List<WordRep> wordsNamesRepeated = nameCounter.count(documentLanguageData, 2);
List<WordRep> wordsNamesAll = nameCounter.count(documentLanguageData, 1);
List<WordRep> subjects = subjectCounter.count(documentLanguageData);
List<WordRep> wordsLongName = longNameCounter.count(documentLanguageData);
int totalSize = wordsTfIdf.size();
@ -61,17 +59,6 @@ public class DocumentKeywordExtractor {
var wordsToMatchWithTitle = joinWordLists(topKeywords, midKeywords, wordsNamesRepeated, subjects);
var words = getSimpleWords(documentLanguageData);
for (var w : wordsLongName)
words.add(w.word);
for (var w : lowKeywords)
words.remove(w.word);
for (var w : midKeywords)
words.remove(w.word);
for (var w : topKeywords)
words.remove(w.word);
Collection<String> artifacts = getArtifacts(documentLanguageData);
var wordSet = new EdgePageWordSet(
@ -85,15 +72,81 @@ public class DocumentKeywordExtractor {
new EdgePageWords(IndexBlock.Artifacts, artifacts)
);
wordSet.append(IndexBlock.Words, words);
getSimpleWords(wordSet, documentLanguageData,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus);
return wordSet;
}
private void getSimpleWords(EdgePageWordSet wordSet, DocumentLanguageData documentLanguageData, IndexBlock... blocks) {
int start = 0;
int lengthGoal = 32;
for (int blockIdx = 0; blockIdx < blocks.length-1 && start < documentLanguageData.sentences.length; blockIdx++) {
IndexBlock block = blocks[blockIdx];
Set<String> words = new HashSet<>(lengthGoal+100);
int pos;
int length = 0;
for (pos = start; pos < documentLanguageData.sentences.length && length < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
length += sent.length();
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (WordPatterns.singleWordQualitiesPredicate.test(w)) {
words.add(w);
}
}
}
}
wordSet.append(block, words);
start = pos;
lengthGoal+=32;
}
if (start < documentLanguageData.sentences.length) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (int pos = start; pos < documentLanguageData.sentences.length && counts.size() < lengthGoal; pos++) {
var sent = documentLanguageData.sentences[pos];
for (var word : sent) {
if (!word.isStopWord()) {
String w = AsciiFlattener.flattenUnicode(word.wordLowerCase());
if (counts.containsKey(w) || (WordPatterns.singleWordQualitiesPredicate.test(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}
Set<String> lastSet;
if (counts.size() < 1024) {
lastSet = counts.keySet();
}
else {
lastSet = counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary
// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1 + Math.log(e.getValue())) * Math.log((1. + dict.getTermFreq(e.getKey())) / N);
}))
.map(Map.Entry::getKey)
.limit(1024)
.collect(Collectors.toCollection(LinkedHashSet::new));
}
wordSet.append(blocks[blocks.length - 1], lastSet);
}
}
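The overflow branch above ranks words by a log-tf times log-idf product; as the in-code comment notes, the score is negated, so sorting ascending keeps the highest-weighted 1024 terms. A standalone sketch with hypothetical counts:

public class TermScoreDemo {
    // Negated tf-idf-style score, mirroring the comparator lambda above.
    static double negatedScore(int countInDocument, long termFreqInDictionary) {
        double N = 11820118.; // documents in the term frequency dictionary
        return (1 + Math.log(countInDocument))
                * Math.log((1. + termFreqInDictionary) / N);
    }

    public static void main(String[] args) {
        // A rare term scores more negative than a common one, so it sorts
        // first under the ascending sort and survives the limit(1024) cut.
        System.out.println(negatedScore(3, 100));       // rare: strongly negative
        System.out.println(negatedScore(3, 5_000_000)); // common: closer to zero
    }
}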
private Collection<String> getArtifacts(DocumentLanguageData documentLanguageData) {
Set<String> reps = new HashSet<>();
for (var sent : documentLanguageData.sentences) {
for (var word : sent) {
String lc = word.wordLowerCase();
@ -138,33 +191,6 @@ public class DocumentKeywordExtractor {
return ret;
}
@NotNull
private Set<String> getSimpleWords(DocumentLanguageData documentLanguageData) {
Map<String, Integer> counts = new HashMap<>(documentLanguageData.totalNumWords());
for (var sent : documentLanguageData.sentences) {
for (int i = 0; i < sent.length(); i++) {
if (!sent.isStopWord(i)) {
String w = AsciiFlattener.flattenUnicode(sent.wordsLowerCase[i]);
if (counts.containsKey(w) || (WordPatterns.wordQualitiesPredicate.test(w) && WordPatterns.filter(w))) {
counts.merge(w, 1, Integer::sum);
}
}
}
}
return counts.entrySet().stream()
.sorted(Comparator.comparing(e -> {
double N = 11820118.; // Number of documents in term freq dictionary
// Caveat: This is actually the *negated* term score, because the second logarithm has
// its parameter inverted (log(a^b) = b log(a); here b = -1)
return (1+Math.log(e.getValue())) * Math.log((1.+dict.getTermFreq(e.getKey()))/N);
}))
.map(Map.Entry::getKey)
.limit(512).collect(Collectors.toCollection(LinkedHashSet::new));
}
public EdgePageWords createWords(IndexBlock block, Collection<WordRep> words) {
return new EdgePageWords(block, words.stream().map(w -> w.word).map(AsciiFlattener::flattenUnicode).filter(WordPatterns.wordQualitiesPredicate).collect(Collectors.toSet()));

View File

@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.protobuf.GeneratedMessageV3;
import io.reactivex.rxjava3.core.Observable;
import io.reactivex.rxjava3.core.ObservableSource;
import io.reactivex.rxjava3.plugins.RxJavaPlugins;
@ -17,8 +18,6 @@ import org.apache.http.HttpHost;
import org.apache.logging.log4j.ThreadContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
@ -186,6 +185,31 @@ public abstract class AbstractClient implements AutoCloseable {
.doFinally(() -> ThreadContext.remove("outbound-request"));
}
@SneakyThrows
protected synchronized Observable<HttpStatusCode> post(Context ctx, String endpoint, GeneratedMessageV3 data) {
ensureAlive();
RequestBody body = RequestBody.create(
MediaType.parse("application/protobuf"),
data.toByteArray());
var req = ctx.paint(new Request.Builder()).url(url + endpoint).post(body).build();
var call = client.newCall(req);
logInbound(call);
ThreadContext.put("outbound-request", url + endpoint);
try (var rsp = call.execute()) {
logOutbound(rsp);
int code = rsp.code();
return validateStatus(code, req).map(HttpStatusCode::new);
}
finally {
ThreadContext.remove("outbound-request");
}
}
@SneakyThrows
protected synchronized <T> Observable<T> postGet(Context ctx, String endpoint, Object data, Class<T> returnType) {

View File

@ -76,7 +76,7 @@ public class LinkKeywordLoaderMain {
// System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords);
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, new EdgePageWordSet(
indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet(
new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0
).blockingSubscribe();
}

View File

@ -39,7 +39,7 @@ public class IndexLoadKeywords implements Runnable {
while (!canceled) {
var data = insertQueue.poll(1, TimeUnit.SECONDS);
if (data != null) {
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), -5., data.wordSet, index).blockingSubscribe();
client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe();
}
}
}

View File

@ -147,7 +147,10 @@ public class DocumentProcessor {
throw new DisqualifiedException(DisqualificationReason.ACCEPTABLE_ADS);
}
var dld = sentenceExtractor.extractSentences(doc.clone());
DomPruner domPruner = new DomPruner();
Document prunedDoc = doc.clone();
domPruner.prune(prunedDoc, 0.5);
var dld = sentenceExtractor.extractSentences(prunedDoc);
checkDocumentLanguage(dld);
@ -192,7 +195,7 @@ public class DocumentProcessor {
ret.features.stream().map(HtmlFeature::getKeyword).forEach(tagWords::add);
words.append(IndexBlock.Meta, tagWords);
words.append(IndexBlock.Words, tagWords);
words.append(IndexBlock.Words_1, tagWords);
}
private void getLinks(EdgeUrl baseUrl, ProcessedDocumentDetails ret, Document doc, EdgePageWordSet words) {

View File

@ -0,0 +1,111 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.NodeVisitor;
import java.util.HashMap;
import java.util.Map;
public class DomPruner {
public void prune(Document document, double pruneThreshold) {
PruningVisitor pruningVisitor = new PruningVisitor();
document.traverse(pruningVisitor);
pruningVisitor.data.forEach((node, data) -> {
if (data.depth <= 1) {
return;
}
if (data.signalNodeSize == 0) node.remove();
else if (data.noiseNodeSize > 0
&& data.signalRate() < pruneThreshold
&& data.treeSize > 3) {
node.remove();
}
});
}
private static class PruningVisitor implements NodeVisitor {
private final Map<Node, NodeData> data = new HashMap<>();
private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0);
@Override
public void head(Node node, int depth) {}
@Override
public void tail(Node node, int depth) {
final NodeData dataForNode;
if (node instanceof TextNode tn) {
dataForNode = new NodeData(depth, tn.text().length(), 0);
}
else if (isSignal(node)) {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.add(data.getOrDefault(childNode, dummy));
}
}
else {
dataForNode = new NodeData(depth, 0,0);
for (var childNode : node.childNodes()) {
dataForNode.addAsNoise(data.getOrDefault(childNode, dummy));
}
}
data.put(node, dataForNode);
}
public boolean isSignal(Node node) {
if (node instanceof Element e) {
if ("a".equalsIgnoreCase(e.tagName()))
return false;
if ("nav".equalsIgnoreCase(e.tagName()))
return false;
if ("footer".equalsIgnoreCase(e.tagName()))
return false;
if ("header".equalsIgnoreCase(e.tagName()))
return false;
}
return true;
}
}
private static class NodeData {
int signalNodeSize;
int noiseNodeSize;
int treeSize = 1;
int depth;
private NodeData(int depth, int signalNodeSize, int noiseNodeSize) {
this.depth = depth;
this.signalNodeSize = signalNodeSize;
this.noiseNodeSize = noiseNodeSize;
}
public void add(NodeData other) {
signalNodeSize += other.signalNodeSize;
noiseNodeSize += other.noiseNodeSize;
treeSize += other.treeSize;
}
public void addAsNoise(NodeData other) {
noiseNodeSize += other.noiseNodeSize + other.signalNodeSize;
treeSize += other.treeSize;
}
public double signalRate() {
return signalNodeSize / (double)(signalNodeSize + noiseNodeSize);
}
}
}

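A minimal usage sketch of the new pruner; the HTML is made up, and the 0.5 threshold matches the DocumentProcessor call earlier in this commit:

import org.jsoup.Jsoup;

public class DomPrunerDemo {
    public static void main(String[] args) {
        var doc = Jsoup.parse(
                "<html><body><nav><a href='#'>one</a> <a href='#'>two</a></nav>" +
                "<p>Genuine article text that should survive pruning.</p></body></html>");
        // The nav subtree carries no signal text, so it is removed;
        // the paragraph is pure signal and is kept.
        new DomPruner().prune(doc, 0.5);
        System.out.println(doc.body().html());
    }
}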
View File

@ -4,11 +4,11 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.google.protobuf.InvalidProtocolBufferException;
import gnu.trove.map.TLongIntMap;
import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.map.hash.TLongIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import io.reactivex.rxjava3.schedulers.Schedulers;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
@ -22,18 +22,16 @@ import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry;
import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader;
import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon;
import nu.marginalia.wmsa.edge.index.model.EdgeIndexSearchTerms;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.index.reader.query.IndexSearchBudget;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.*;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.apache.http.HttpStatus;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
@ -52,7 +50,7 @@ import static spark.Spark.get;
import static spark.Spark.halt;
public class EdgeIndexService extends Service {
private static final int SEARCH_BUDGET_TIMEOUT_MS = 100;
private static final int SEARCH_BUDGET_TIMEOUT_MS = 3000;
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -66,11 +64,9 @@ public class EdgeIndexService extends Service {
.create();
private static final Histogram wmsa_edge_index_query_time
= Histogram.build().name("wmsa_edge_index_query_time").help("-").register();
private static final Counter wmsa_edge_index_query_count
= Counter.build().name("wmsa_edge_index_query_count").help("-").register();
private static final Histogram wmsa_edge_index_put_words_time
= Histogram.build().name("wmsa_edge_index_put_words_time").help("-").register();
= Histogram.build().name("wmsa_edge_index_query_time")
.linearBuckets(50, 50, 15)
.help("-").register();
public static final int DYNAMIC_BUCKET_LENGTH = 7;
@ -162,12 +158,15 @@ public class EdgeIndexService extends Service {
indexes.initialize(init);
}
private Object putWords(Request request, Response response) {
var putWordsRequest = gson.fromJson(request.body(), EdgePutWordsRequest.class);
private Object putWords(Request request, Response response) throws InvalidProtocolBufferException {
var req = IndexPutKeywordsReq.parseFrom(request.bodyAsBytes());
synchronized (this) {
putWords(putWordsRequest.getDomainId(), putWordsRequest.getUrlId(),
putWordsRequest.wordSet, putWordsRequest.getIndex());
EdgeId<EdgeDomain> domainId = new EdgeId<>(req.getDomain());
EdgeId<EdgeUrl> urlId = new EdgeId<>(req.getUrl());
int idx = req.getIndex();
for (int ws = 0; ws < req.getWordSetCount(); ws++) {
putWords(domainId, urlId, req.getWordSet(ws), idx);
}
response.status(HttpStatus.SC_ACCEPTED);
@ -175,26 +174,16 @@ public class EdgeIndexService extends Service {
}
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWordSet wordSet, int idx
) {
wmsa_edge_index_put_words_time.time(() -> {
for (EdgePageWords words : wordSet.values()) {
putWords(domainId, urlId, words, idx);
}
});
}
public void putWords(EdgeId<EdgeDomain> domainId, EdgeId<EdgeUrl> urlId,
EdgePageWords words, int idx
IndexPutKeywordsReq.WordSet words, int idx
) {
SearchIndexJournalWriterImpl indexWriter = indexes.getIndexWriter(idx);
for (var chunk : ListChunker.chopList(words.words, SearchIndexJournalEntry.MAX_LENGTH)) {
IndexBlock block = IndexBlock.values()[words.getIndex()];
for (var chunk : ListChunker.chopList(words.getWordsList(), SearchIndexJournalEntry.MAX_LENGTH)) {
var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk));
var header = new SearchIndexJournalEntryHeader(domainId, urlId, words.block);
var header = new SearchIndexJournalEntryHeader(domainId, urlId, block);
indexWriter.put(header, entry);
};
@ -257,7 +246,6 @@ public class EdgeIndexService extends Service {
}
finally {
wmsa_edge_index_query_time.observe(System.currentTimeMillis() - start);
wmsa_edge_index_query_count.inc();
}
}
@ -410,16 +398,6 @@ public class EdgeIndexService extends Service {
}
public LongStream getHotDomainsQuery(int bucket, IndexBlock block, int wordId,
int queryDepth, int minHitCount, int maxResults) {
if (!indexes.isValidBucket(bucket)) {
logger.warn("Invalid bucket {}", bucket);
return LongStream.empty();
}
return indexes.getBucket(bucket).findHotDomainsForKeyword(block, wordId, queryDepth, minHitCount, maxResults);
}
private LongStream getQuery(int bucket, IndexSearchBudget budget, IndexBlock block,
LongPredicate filter, EdgeIndexSearchTerms searchTerms) {
if (!indexes.isValidBucket(bucket)) {

View File

@ -9,7 +9,6 @@ import nu.marginalia.wmsa.client.AbstractDynamicClient;
import nu.marginalia.wmsa.client.HttpStatusCode;
import nu.marginalia.wmsa.configuration.ServiceDescriptor;
import nu.marginalia.wmsa.configuration.server.Context;
import nu.marginalia.wmsa.edge.index.model.EdgePutWordsRequest;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
@ -18,6 +17,7 @@ import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults;
import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchSpecification;
import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -37,13 +37,27 @@ public class EdgeIndexClient extends AbstractDynamicClient {
}
@CheckReturnValue
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url, double quality,
public Observable<HttpStatusCode> putWords(Context ctx, EdgeId<EdgeDomain> domain, EdgeId<EdgeUrl> url,
EdgePageWordSet wordSet, int writer
)
{
EdgePutWordsRequest request = new EdgePutWordsRequest(domain, url, quality, wordSet, writer);
return this.post(ctx, "/words/", request);
var keywordBuilder =
IndexPutKeywordsReq.newBuilder()
.setDomain(domain.id())
.setUrl(url.id())
.setIndex(writer);
for (var set : wordSet.wordSets.values()) {
var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder();
wordSetBuilder.setIndex(set.block.ordinal());
wordSetBuilder.addAllWords(set.words);
keywordBuilder.addWordSet(wordSetBuilder.build());
}
var req = keywordBuilder.build();
return this.post(ctx, "/words/", req);
}

View File

@ -15,6 +15,7 @@ import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
@ -36,6 +37,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE);
new Thread(this::journalWriterThread, "Journal Writer").start();
writerTask = Schedulers.io().schedulePeriodicallyDirect(this::forceWrite, 1, 1, TimeUnit.SECONDS);
Runtime.getRuntime().addShutdownHook(new Thread(this::forceWrite));
}
@ -56,25 +59,45 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}
}
private record WriteJob(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {}
private final LinkedBlockingQueue<WriteJob> writeQueue = new LinkedBlockingQueue<>(512);
@Override
@SneakyThrows
public synchronized void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
public void put(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
writeQueue.put(new WriteJob(header, entryData));
}
byteBuffer.clear();
@SneakyThrows
public void journalWriterThread() {
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
while (true) {
var job = writeQueue.take();
entryData.write(byteBuffer);
writeEntry(job.header, job.entryData);
}
}
private synchronized void writeEntry(SearchIndexJournalEntryHeader header, SearchIndexJournalEntry entryData) {
byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
try {
byteBuffer.clear();
while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);
byteBuffer.putInt(entryData.size());
byteBuffer.putInt(header.block().id);
byteBuffer.putLong(header.documentId());
writePositionMarker();
entryData.write(byteBuffer);
byteBuffer.limit(byteBuffer.position());
byteBuffer.rewind();
while (byteBuffer.position() < byteBuffer.limit())
channel.write(byteBuffer);
writePositionMarker();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
@Override
@ -94,13 +117,11 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter {
}
private void writePositionMarker() throws IOException {
var lock = channel.lock(0, 16, false);
pos = channel.size();
raf.seek(0);
raf.writeLong(pos);
raf.writeLong(dictionaryWriter.size());
raf.seek(pos);
lock.release();
}
public synchronized void close() throws IOException {

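The writer now decouples callers from disk I/O: put() enqueues onto a bounded queue, a dedicated thread drains it, and the new shutdown hook forces a final flush. A minimal sketch of the same pattern (names and payload type are hypothetical):

import java.util.concurrent.LinkedBlockingQueue;

public class QueuedWriterSketch {
    record WriteJob(String payload) {}

    // Bounded at 512 like the journal writer above, so producers block
    // (backpressure) instead of buffering without limit.
    private final LinkedBlockingQueue<WriteJob> queue = new LinkedBlockingQueue<>(512);

    public void put(WriteJob job) throws InterruptedException {
        queue.put(job);                  // caller side: cheap, blocks only when full
    }

    public void writerLoop() throws InterruptedException {
        while (true) {
            WriteJob job = queue.take(); // single consumer keeps writes ordered
            // ... write job.payload() to the journal channel here ...
        }
    }
}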
View File

@ -5,16 +5,16 @@ import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
@AllArgsConstructor @Getter
@ToString
public class EdgePutWordsRequest {
public final EdgeId<EdgeDomain> domainId;
public final EdgeId<EdgeUrl> urlId;
public final double quality;
public EdgeId<EdgeDomain> domainId;
public EdgeId<EdgeUrl> urlId;
public double quality;
public final EdgePageWordSet wordSet;
public EdgePageWordSet wordSet;
private int index = 0;
}

View File

@ -5,14 +5,18 @@ public enum IndexBlock {
Title(1, 1),
Link(2, 1.25),
Top(3, 2),
Middle(4, 3),
Low(5, 4),
Words(6, 6),
Middle(4, 2.5),
Low(5, 3.0),
Words_1(6, 3.0),
Meta(7, 7),
PositionWords(8, 4.5),
Words_2(8, 3.5),
NamesWords(9, 5),
Artifacts(10, 10),
Topic(11, 0.5);
Topic(11, 0.5),
Words_4(12, 4.0),
Words_8(13, 4.5),
Words_16Plus(14, 7.0),
;
public final int id;
public final double sortOrder;

View File

@ -29,8 +29,12 @@ public class SearchIndexReader implements AutoCloseable {
IndexBlock.Top,
IndexBlock.Middle,
IndexBlock.Low,
IndexBlock.Words,
IndexBlock.NamesWords,
IndexBlock.Words_1,
IndexBlock.Words_2,
IndexBlock.Words_4,
IndexBlock.Words_8,
IndexBlock.Words_16Plus,
};
@Inject
@ -44,24 +48,29 @@ public class SearchIndexReader implements AutoCloseable {
var linkIndex = indices.get(IndexBlock.Link);
var titleIndex = indices.get(IndexBlock.Title);
var namesIndex = indices.get(IndexBlock.NamesWords);
var positionIndex = indices.get(IndexBlock.PositionWords);
var titleKeywordsIndex = indices.get(IndexBlock.TitleKeywords);
var wordsIndex = indices.get(IndexBlock.Words);
var metaIndex = indices.get(IndexBlock.Meta);
var topicIndex = indices.get(IndexBlock.Topic);
var words1 = indices.get(IndexBlock.Words_1);
var words2 = indices.get(IndexBlock.Words_2);
var words4 = indices.get(IndexBlock.Words_4);
var words8 = indices.get(IndexBlock.Words_8);
var words16 = indices.get(IndexBlock.Words_16Plus);
var artifacts = indices.get(IndexBlock.Artifacts);
queryBuilders = new EnumMap<>(IndexBlock.class);
underspecifiedQueryBuilders = new EnumMap<>(IndexBlock.class);
queryBuilders.put(IndexBlock.Words, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex, namesIndex, wordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Low, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex, midIndex, lowIndex, linkIndex), wordsIndex));
queryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex, topIndex), wordsIndex));
queryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, topicIndex, titleIndex), wordsIndex));
queryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex), wordsIndex));
queryBuilders.put(IndexBlock.Words_1, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1), words1));
queryBuilders.put(IndexBlock.Words_2, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words2), words1));
queryBuilders.put(IndexBlock.Words_4, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words4), words1));
queryBuilders.put(IndexBlock.Words_8, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words8), words1));
queryBuilders.put(IndexBlock.Words_16Plus, new IndexQueryBuilder(listOfNonNulls(metaIndex, titleKeywordsIndex, titleIndex, topIndex, words1, words2, words4, words8, words16, artifacts), words1));
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, positionIndex, metaIndex), wordsIndex));
underspecifiedQueryBuilders.put(IndexBlock.TitleKeywords, new IndexQueryBuilder(listOfNonNulls(titleKeywordsIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Title, new IndexQueryBuilder(listOfNonNulls(titleIndex, topicIndex, linkIndex, topicIndex, topIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
underspecifiedQueryBuilders.put(IndexBlock.Top, new IndexQueryBuilder(listOfNonNulls(topIndex, linkIndex, midIndex, lowIndex, namesIndex, metaIndex), words1));
}
@SafeVarargs
@ -157,7 +166,7 @@ public class SearchIndexReader implements AutoCloseable {
return block;
}
}
return IndexBlock.Words;
return IndexBlock.Words_1;
}
public boolean isTermInBucket(IndexBlock block, int searchTerm, long urlId) {

View File

@ -27,7 +27,8 @@ public class IndexQueryBuilder {
public Query build(IndexSearchBudget budget,
LongPredicate filter,
int wordId) {
int wordId)
{
return new QueryForIndices(budget, filter, wordId);
}

View File

@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.stackoverflow;
import com.google.inject.Inject;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.stackoverflow.model.StackOverflowPost;
import nu.marginalia.wmsa.edge.model.*;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
@ -46,8 +46,8 @@ public class StackOverflowPostProcessor {
var keywords = documentKeywordExtractor.extractKeywords(dld);
keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("special:wikipedia");
keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words_1).addJust("special:wikipedia");
keywords.get(IndexBlock.Meta).addJust("special:wikipedia");
keywords.get(IndexBlock.Meta).addJust("js:true");

View File

@ -1,13 +1,13 @@
package nu.marginalia.wmsa.edge.integration.wikipedia;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.wmsa.edge.converting.processor.logic.LinkParser;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.integration.model.BasicDocumentData;
import nu.marginalia.wmsa.edge.integration.wikipedia.model.WikipediaArticle;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.crawl.EdgeDomainLink;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@ -42,8 +42,8 @@ public class WikipediaProcessor {
var keywords = documentKeywordExtractor.extractKeywords(dld);
keywords.get(IndexBlock.Meta).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words).addJust("special:stackoverflow");
keywords.get(IndexBlock.Words_1).addJust("site:"+post.getUrl().domain);
keywords.get(IndexBlock.Words_1).addJust("special:stackoverflow");
keywords.get(IndexBlock.Meta).addJust("special:stackoverflow");
keywords.get(IndexBlock.Meta).addJust("js:true");

View File

@ -1,13 +1,15 @@
package nu.marginalia.wmsa.edge.model.crawl;
import com.dslplatform.json.JsonObject;
import com.dslplatform.json.JsonWriter;
import lombok.Data;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import java.util.*;
@Data
public class EdgePageWordSet {
public final Map<IndexBlock, EdgePageWords> wordSets;
public class EdgePageWordSet implements JsonObject {
public Map<IndexBlock, EdgePageWords> wordSets;
public EdgePageWordSet(EdgePageWords... words) {
wordSets = new EnumMap<>(IndexBlock.class);
@ -45,4 +47,18 @@ public class EdgePageWordSet {
});
return sj.toString();
}
@Override
public void serialize(JsonWriter writer, boolean minimal) {
writer.writeAscii("[");
boolean first = false;
for (var w : wordSets.values()) {
if (!first) first = true;
else writer.writeAscii(", ");
w.serialize(writer, minimal);
}
writer.writeAscii("]");
}
}

View File

@ -1,4 +1,7 @@
package nu.marginalia.wmsa.edge.model.crawl;
import com.dslplatform.json.JsonObject;
import com.dslplatform.json.JsonWriter;
import com.dslplatform.json.NumberConverter;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
@ -8,7 +11,7 @@ import java.util.Collection;
import java.util.List;
@ToString @Getter
public class EdgePageWords {
public class EdgePageWords implements JsonObject {
public final IndexBlock block;
public final List<String> words = new ArrayList<>();
@ -31,4 +34,19 @@ public class EdgePageWords {
return words.size();
}
public void addJust(String word) { words.add(word); }
@Override
public void serialize(JsonWriter writer, boolean minimal) {
writer.writeAscii("{\"b\":");
NumberConverter.serialize(block.ordinal(), writer);
writer.writeAscii(", \"w\": [");
boolean first = false;
for (var word : words) {
if (!first) first = true;
else { writer.writeAscii(","); }
writer.writeString(word);
}
writer.writeAscii("]}");
}
}

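A sketch of what the hand-rolled serializer emits, with hypothetical words; this assumes dsl-json's JsonWriter can be obtained via DslJson.newWriter and rendered with toString, and the "b" value is the block's ordinal:

import com.dslplatform.json.DslJson;

public class EdgePageWordsJsonDemo {
    public static void main(String[] args) {
        var words = new EdgePageWords(IndexBlock.Title);
        words.addJust("hello");
        words.addJust("world");

        var writer = new DslJson<>().newWriter();
        words.serialize(writer, false);
        System.out.println(writer); // e.g. {"b":1, "w": ["hello","world"]}
    }
}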
View File

@ -10,25 +10,31 @@ import java.util.stream.Collectors;
public enum EdgeSearchProfile {
DEFAULT("default",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Link,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
0, 1),
MODERN("modern",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus
),
2),
CORPO("corpo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
4, 5, 6, 7),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
4, 5, 7),
YOLO("yolo",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords,
IndexBlock.Words_1, IndexBlock.Words_2, IndexBlock.Words_4, IndexBlock.Words_8, IndexBlock.Words_16Plus),
0, 2, 1, 3, 4, 6),
CORPO_CLEAN("corpo-clean",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
4, 5),
ACADEMIA("academia",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.NamesWords),
3),
FOOD("food",
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words, IndexBlock.NamesWords),
List.of(IndexBlock.TitleKeywords, IndexBlock.Title, IndexBlock.Top, IndexBlock.Middle, IndexBlock.Low, IndexBlock.Link, IndexBlock.Words_1, IndexBlock.NamesWords),
2, 0),
;

View File

@ -62,7 +62,7 @@ public class SiteSearchCommand implements SearchCommandInterface {
DecoratedSearchResultSet resultSet;
Path screenshotPath = null;
if (null != domain) {
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words, 100, 100, "site:"+domain);
resultSet = searchOperator.performDumbQuery(ctx, EdgeSearchProfile.CORPO, IndexBlock.Words_1, 100, 100, "site:"+domain);
screenshotPath = Path.of("/screenshot/" + dataStoreDao.getDomainId(domain).id());
}

View File

@ -30,7 +30,7 @@ public class SearchResultValuator {
EdgeSearchResultKeywordScore[] scores = rawScores.stream().filter(w -> !w.keyword.contains(":")).toArray(EdgeSearchResultKeywordScore[]::new);
if (scores.length == 0) {
return IndexBlock.Words.sortOrder;
return IndexBlock.Words_1.sortOrder;
}
final double[] weights = getTermWeights(scores);

View File

@ -51,7 +51,7 @@ public class FeaturesLoaderTool {
throw new RuntimeException(ex);
}
client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), -5, wordSet, 0)
client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0)
.blockingSubscribe();
});

View File

@ -141,7 +141,7 @@ CREATE OR REPLACE VIEW EC_URL_VIEW AS
EC_PAGE_DATA.FEATURES AS FEATURES,
EC_DOMAIN.IP AS IP,
EC_DOMAIN.STATE AS STATE,
EC_URL.STATE AS STATE,
EC_DOMAIN.RANK AS RANK,
EC_DOMAIN.STATE AS DOMAIN_STATE
FROM EC_URL

View File

@ -0,0 +1,12 @@
package nu.marginalia.wmsa.edge.converting.processor.logic;
import org.junit.jupiter.api.Test;
import java.io.IOException;
class DomPrunerTest {
@Test
public void test() throws IOException {
}
}

View File

@ -1,10 +1,8 @@
package nu.marginalia.wmsa.edge.crawling;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.util.TestLanguageModels;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.util.language.WordPatterns;
import nu.marginalia.util.language.conf.LanguageModels;
import nu.marginalia.util.language.processing.DocumentKeywordExtractor;
import nu.marginalia.util.language.processing.KeywordExtractor;
@ -12,11 +10,9 @@ import nu.marginalia.util.language.processing.SentenceExtractor;
import nu.marginalia.util.language.processing.model.WordRep;
import nu.marginalia.util.language.processing.model.WordSpan;
import nu.marginalia.util.language.processing.model.tag.WordSeparator;
import nu.marginalia.util.ranking.BuggyReversePageRank;
import nu.marginalia.util.ranking.BuggyStandardPageRank;
import nu.marginalia.wmsa.edge.assistant.dict.NGramDict;
import nu.marginalia.wmsa.edge.integration.wikipedia.WikipediaReader;
import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.jsoup.Jsoup;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
@ -103,6 +99,11 @@ class SentenceExtractorTest {
});
reader.join();
}
@Test
public void testPattern() {
System.out.println(WordPatterns.singleWordAdditionalPattern.matcher("2.6.18164.el5pae").matches());
}
@Test
void extractSentences() throws IOException {
var data = Path.of("/home/vlofgren/Code/tmp-data/");

View File

@ -11,12 +11,15 @@ import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient;
import nu.marginalia.wmsa.edge.index.conversion.SearchIndexPartitioner;
import nu.marginalia.wmsa.edge.index.model.IndexBlock;
import nu.marginalia.wmsa.edge.index.reader.SearchIndexes;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import nu.marginalia.wmsa.edge.model.EdgeId;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.EdgeUrl;
import org.junit.jupiter.api.*;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet;
import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords;
import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import org.junit.jupiter.api.parallel.ExecutionMode;
import org.junit.jupiter.api.parallel.ResourceAccessMode;
@ -141,7 +144,7 @@ public class EdgeIndexClientTest {
void putWords(int didx, int idx, double quality, String... words) {
EdgePageWords epw = new EdgePageWords(IndexBlock.Title);
epw.addAll(Arrays.asList(words));
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx), quality,
client.putWords(Context.internal(), new EdgeId<>(didx), new EdgeId<>(idx),
new EdgePageWordSet(epw), 0).blockingSubscribe();
}

View File

@ -61,7 +61,7 @@ class SearchIndexJournalWriterTest {
void put() throws IOException {
writer.put(new SearchIndexJournalEntryHeader(4, (1234L << 32) | 5678, IndexBlock.Link),
new SearchIndexJournalEntry(new long[] { 1, 2, 3, 4 }));
writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words),
writer.put(new SearchIndexJournalEntryHeader(4, (2345L << 32) | 2244, IndexBlock.Words_1),
new SearchIndexJournalEntry(new long[] { 5, 6, 7 }));
writer.forceWrite();

protocol/build.gradle (new file, +27)
View File

@ -0,0 +1,27 @@
plugins {
id "com.google.protobuf" version "0.8.19"
id "java"
}
repositories {
gradlePluginPortal()
}
protobuf {
protoc {
artifact = 'com.google.protobuf:protoc:3.0.0'
}
}
sourceSets {
main {
java {
srcDirs 'build/generated/source/proto/main/grpc'
srcDirs 'build/generated/source/proto/main/java'
}
}
}
dependencies {
protobuf files ("def/")
implementation group: 'com.google.protobuf', name: 'protobuf-java', version: '3.0.0'
}

protocol/def/index.proto (new file, +21)
View File

@ -0,0 +1,21 @@
syntax = "proto3";
option java_package = "nu.wmsa.wmsa.edge.index.proto";
option java_outer_classname = "IndexProto";
option java_multiple_files = true;
message IndexPutKeywordsReq {
int32 domain = 1;
int32 url = 2;
int32 index = 3;
repeated WordSet wordSet = 4;
message WordSet {
int32 index = 1;
repeated string words = 2;
}
}
message IndexSearchQueryRsp {
}

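For reference, a sketch of round-tripping the new message; the generated class names follow the java_package and java_multiple_files options above, and the field values are hypothetical:

import nu.wmsa.wmsa.edge.index.proto.IndexPutKeywordsReq;

public class IndexProtoDemo {
    public static void main(String[] args) throws Exception {
        var req = IndexPutKeywordsReq.newBuilder()
                .setDomain(1)
                .setUrl(2)
                .setIndex(0)
                .addWordSet(IndexPutKeywordsReq.WordSet.newBuilder()
                        .setIndex(6) // e.g. IndexBlock.Words_1's ordinal
                        .addAllWords(java.util.List.of("hello", "world")))
                .build();

        byte[] wire = req.toByteArray();                  // client: HTTP request body
        var parsed = IndexPutKeywordsReq.parseFrom(wire); // server: putWords handler
        System.out.println(parsed.getWordSet(0).getWordsList());
    }
}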
View File

@ -1,4 +1,5 @@
rootProject.name = 'wmsa'
include 'marginalia_nu'
include 'third_party'
include 'third_party'
include 'protocol'