diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java index 1731f610..b94df67c 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/ConverterModule.java @@ -6,11 +6,15 @@ import com.google.inject.name.Names; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; import nu.marginalia.util.language.conf.LanguageModels; import nu.marginalia.wmsa.configuration.WmsaHome; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexLocalService; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient; import nu.marginalia.wmsa.edge.model.EdgeCrawlPlan; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import java.net.URISyntaxException; +import java.nio.file.Path; public class ConverterModule extends AbstractModule { @@ -31,6 +35,15 @@ public class ConverterModule extends AbstractModule { bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128); bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255); + if (null != System.getProperty("local-index-path")) { + bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path"))); + bind(EdgeIndexWriterClient.class).to(EdgeIndexLocalService.class); + } + else { + bind(EdgeIndexWriterClient.class).to(EdgeIndexClient.class); + } + + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java index 02f502d2..fd95b6a2 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordLoaderMain.java @@ -2,11 +2,10 @@ package nu.marginalia.wmsa.edge.converting; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import java.io.IOException; import java.nio.file.Files; @@ -76,9 +75,8 @@ public class LinkKeywordLoaderMain { // System.out.println(lastLine + " -/- " + domainId + ":" + urlId + " : " + keywords); - indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), new EdgePageWordSet( - new EdgePageWords(IndexBlock.Link, new HashSet<>(keywords))), 0 - ).blockingSubscribe(); + indexClient.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), + new DocumentKeywords(IndexBlock.Link, keywords.toArray(String[]::new)), 0); } lastLine = urlKeyword.url; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java index 13678fd0..d385fabc 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LoaderMain.java @@ -27,7 +27,6 @@ public class LoaderMain { private static final Logger logger = LoggerFactory.getLogger(LoaderMain.class); - private final Path processDir; private final EdgeCrawlPlan plan; private final ConvertedDomainReader instructionsReader; private final LoaderFactory loaderFactory; @@ -59,7 +58,6 @@ public class LoaderMain { LoaderFactory loaderFactory, EdgeIndexClient indexClient) { - this.processDir = plan.process.getDir(); this.plan = plan; this.instructionsReader = instructionsReader; this.loaderFactory = loaderFactory; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java index 486eb343..cd2f000f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/IndexLoadKeywords.java @@ -4,23 +4,21 @@ import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.wmsa.configuration.server.Context; import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; -import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; +import nu.marginalia.wmsa.edge.index.client.EdgeIndexWriterClient; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Arrays; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; public class IndexLoadKeywords implements Runnable { - private final EdgeIndexClient client; private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class); private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); + private EdgeIndexWriterClient client; - private record InsertTask(int urlId, int domainId, EdgePageWordSet wordSet) {} + private record InsertTask(int urlId, int domainId, DocumentKeywords wordSet) {} private final Thread runThread; private volatile boolean canceled = false; @@ -28,7 +26,7 @@ public class IndexLoadKeywords implements Runnable { private static final int index = Integer.getInteger("keyword-index", 1); @Inject - public IndexLoadKeywords(EdgeIndexClient client) { + public IndexLoadKeywords(EdgeIndexWriterClient client) { this.client = client; runThread = new Thread(this, getClass().getSimpleName()); runThread.start(); @@ -39,7 +37,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index).blockingSubscribe(); + client.putWords(Context.internal(), new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), data.wordSet, index); } } } @@ -57,11 +55,8 @@ public class IndexLoadKeywords implements Runnable { logger.warn("Failed to get IDs for {} -- d={},u={}", url, domainId, urlId); } - var ws = new EdgePageWordSet(); - for (var doc : words) { - ws.append(doc.block(), Arrays.asList(doc.keywords())); + for (var ws : words) { + insertQueue.put(new InsertTask(urlId, domainId, ws)); } - - insertQueue.put(new InsertTask(urlId, domainId, ws)); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java index 76a839c9..d476e473 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadDomains.java @@ -67,16 +67,34 @@ public class SqlLoadDomains { try (var insertCall = connection.prepareCall("CALL INSERT_DOMAIN(?,?)")) { + + int cnt = 0; int batchOffset = 0; for (var domain : domains) { insertCall.setString(1, domain.toString()); insertCall.setString(2, domain.domain); insertCall.addBatch(); - } - var ret = insertCall.executeBatch(); - for (int rv = 0; rv < domains.length; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", domains[rv], ret[rv]); + if (++cnt == 1000) { + var ret = insertCall.executeBatch(); + connection.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]); + } + } + + cnt = 0; + batchOffset += 1000; + } + } + if (cnt > 0) { + var ret = insertCall.executeBatch(); + connection.commit(); + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", domains[batchOffset + rv], ret[rv]); + } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java index fb8a6303..bce60825 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadProcessedDocument.java @@ -64,6 +64,7 @@ public class SqlLoadProcessedDocument { var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?)")) { conn.setAutoCommit(false); + int cnt = 0; int batchOffset = 0; for (var doc : documents) { int urlId = data.getUrlId(doc.url()); if (urlId < 0) { @@ -81,16 +82,31 @@ public class SqlLoadProcessedDocument { stmt.setDouble(8, doc.quality()); stmt.setInt(9, (int) doc.hash()); stmt.addBatch(); - } - var ret = stmt.executeBatch(); - for (int rv = 0; rv < documents.size(); rv++) { - if (ret[rv] < 1 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]); + if (++cnt == 100) { + var ret = stmt.executeBatch(); + conn.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); + } + } + + cnt = 0; + batchOffset += 100; + } + } + if (cnt > 0) { + var ret = stmt.executeBatch(); + conn.commit(); + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); + } } } - conn.commit(); } catch (SQLException ex) { logger.warn("SQL error inserting document", ex); } @@ -100,6 +116,7 @@ public class SqlLoadProcessedDocument { try (var conn = dataSource.getConnection(); var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT_BAD(?, ?)")) { + int cnt = 0; int batchOffset = 0; for (var doc : documents) { int urlId = data.getUrlId(doc.url()); if (urlId < 0) { @@ -110,13 +127,31 @@ public class SqlLoadProcessedDocument { stmt.setInt(1, urlId); stmt.setString(2, doc.state().name()); stmt.addBatch(); - } - var ret = stmt.executeBatch(); - for (int rv = 0; rv < documents.size(); rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(rv), ret[rv]); + + if (++cnt == 100) { + var ret = stmt.executeBatch(); + conn.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); + } + } + + cnt = 0; + batchOffset += 100; } } + if (cnt > 0) { + var ret = stmt.executeBatch(); + conn.commit(); + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); + } + } + } + } catch (SQLException ex) { logger.warn("SQL error inserting failed document", ex); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java index f25bd0b9..7dc67cd3 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/loader/SqlLoadUrls.java @@ -52,6 +52,8 @@ public class SqlLoadUrls { ) { conn.setAutoCommit(false); + + int cnt = 0; int batchOffset = 0; for (var url : urls) { if (url.path.length() >= 255) { logger.warn("Skipping bad URL {}", url); @@ -70,11 +72,29 @@ public class SqlLoadUrls { insertCall.setString(5, url.param); insertCall.setLong(6, hashPath(url.path, url.param)); insertCall.addBatch(); + + if (++cnt == 250) { + var ret = insertCall.executeBatch(); + conn.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); + } + } + + cnt = 0; + batchOffset += 250; + } } - var ret = insertCall.executeBatch(); - for (int rv = 0; rv < ret.length; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[rv], ret[rv]); + if (cnt > 0) { + var ret = insertCall.executeBatch(); + conn.commit(); + + for (int rv = 0; rv < cnt; rv++) { + if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { + logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); + } } } @@ -86,6 +106,7 @@ public class SqlLoadUrls { queryCall.setInt(1, data.getDomainId(targetDomain)); var rsp = queryCall.executeQuery(); + rsp.setFetchSize(urls.length); while (rsp.next()) { int urlId = rsp.getInt(1); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index eb73534b..ba4e543b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -190,10 +190,20 @@ public class EdgeIndexService extends Service { } private long[] getOrInsertWordIds(List words) { - return words.stream() - .filter(w -> w.getBytes().length < Byte.MAX_VALUE) - .mapToLong(keywordLexicon::getOrInsert) - .toArray(); + long[] ids = new long[words.size()]; + int putId = 0; + + for (String word : words) { + long id = keywordLexicon.getOrInsert(word); + if (id != DictionaryHashMap.NO_VALUE) { + ids[putId++] = id; + } + } + + if (putId != words.size()) { + ids = Arrays.copyOf(ids, putId); + } + return ids; } private Object searchDomain(Request request, Response response) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java index 48b1b3ee..1866d1f6 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexClient.java @@ -6,13 +6,12 @@ import com.google.inject.Singleton; import io.reactivex.rxjava3.core.Observable; import io.reactivex.rxjava3.schedulers.Schedulers; import nu.marginalia.wmsa.client.AbstractDynamicClient; -import nu.marginalia.wmsa.client.HttpStatusCode; import nu.marginalia.wmsa.configuration.ServiceDescriptor; import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeId; import nu.marginalia.wmsa.edge.model.EdgeUrl; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchResultSet; import nu.marginalia.wmsa.edge.model.search.EdgeSearchSpecification; import nu.marginalia.wmsa.edge.model.search.domain.EdgeDomainSearchResults; @@ -26,7 +25,7 @@ import java.util.List; import java.util.concurrent.TimeUnit; @Singleton -public class EdgeIndexClient extends AbstractDynamicClient { +public class EdgeIndexClient extends AbstractDynamicClient implements EdgeIndexWriterClient { private final Gson gson = new GsonBuilder() .create(); private final Logger logger = LoggerFactory.getLogger(getClass()); @@ -36,10 +35,10 @@ public class EdgeIndexClient extends AbstractDynamicClient { setTimeout(30); } - @CheckReturnValue - public Observable putWords(Context ctx, EdgeId domain, EdgeId url, - EdgePageWordSet wordSet, int writer - ) + @Override + public void putWords(Context ctx, EdgeId domain, EdgeId url, + DocumentKeywords wordSet, int writer + ) { var keywordBuilder = @@ -48,16 +47,16 @@ public class EdgeIndexClient extends AbstractDynamicClient { .setUrl(url.id()) .setIndex(writer); - for (var set : wordSet.wordSets.values()) { + for (var set : wordSet.keywords()) { var wordSetBuilder = IndexPutKeywordsReq.WordSet.newBuilder(); - wordSetBuilder.setIndex(set.block.ordinal()); - wordSetBuilder.addAllWords(set.words); + wordSetBuilder.setIndex(wordSet.block().ordinal()); + wordSetBuilder.addAllWords(List.of(wordSet.keywords())); keywordBuilder.addWordSet(wordSetBuilder.build()); } var req = keywordBuilder.build(); - return this.post(ctx, "/words/", req); + this.post(ctx, "/words/", req).blockingSubscribe(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java new file mode 100644 index 00000000..e36b9752 --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexLocalService.java @@ -0,0 +1,79 @@ +package nu.marginalia.wmsa.edge.index.client; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.google.inject.name.Named; +import nu.marginalia.util.ListChunker; +import nu.marginalia.util.dict.DictionaryHashMap; +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalWriterImpl; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntry; +import nu.marginalia.wmsa.edge.index.journal.model.SearchIndexJournalEntryHeader; +import nu.marginalia.wmsa.edge.index.lexicon.KeywordLexicon; +import nu.marginalia.wmsa.edge.index.lexicon.journal.KeywordLexiconJournal; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; + +@Singleton +public class EdgeIndexLocalService implements EdgeIndexWriterClient { + + private final KeywordLexicon lexicon; + private final SearchIndexJournalWriterImpl indexWriter; + + @Inject + public EdgeIndexLocalService(@Named("local-index-path") Path path) throws IOException { + long hashMapSize = 1L << 31; + + if (Boolean.getBoolean("small-ram")) { + hashMapSize = 1L << 27; + } + + var lexiconJournal = new KeywordLexiconJournal(path.resolve("dictionary.dat").toFile()); + lexicon = new KeywordLexicon(lexiconJournal, new DictionaryHashMap(hashMapSize)); + indexWriter = new SearchIndexJournalWriterImpl(lexicon, path.resolve("index.dat").toFile()); + } + + public void putWords(Context ctx, EdgeId domain, EdgeId url, + DocumentKeywords wordSet, int writer) { + if (wordSet.keywords().length == 0) return; + + for (var chunk : ListChunker.chopList(List.of(wordSet.keywords()), SearchIndexJournalEntry.MAX_LENGTH)) { + + var entry = new SearchIndexJournalEntry(getOrInsertWordIds(chunk)); + var header = new SearchIndexJournalEntryHeader(domain.id(), url.id(), wordSet.block()); + + indexWriter.put(header, entry); + } + + } + + private long[] getOrInsertWordIds(List words) { + long[] ids = new long[words.size()]; + int putId = 0; + + for (String word : words) { + long id = lexicon.getOrInsert(word); + if (id != DictionaryHashMap.NO_VALUE) { + ids[putId++] = id; + } + } + + if (putId != words.size()) { + ids = Arrays.copyOf(ids, putId); + } + return ids; + } + + @Override + public void close() throws Exception { + indexWriter.close(); + lexicon.close(); + } +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java new file mode 100644 index 00000000..57a6f17d --- /dev/null +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/client/EdgeIndexWriterClient.java @@ -0,0 +1,13 @@ +package nu.marginalia.wmsa.edge.index.client; + +import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; +import nu.marginalia.wmsa.edge.model.EdgeDomain; +import nu.marginalia.wmsa.edge.model.EdgeId; +import nu.marginalia.wmsa.edge.model.EdgeUrl; + +public interface EdgeIndexWriterClient extends AutoCloseable { + + void putWords(Context ctx, EdgeId domain, EdgeId url, + DocumentKeywords wordSets, int writer); +} diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java index a11ee5d0..7e0c361b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/journal/SearchIndexJournalWriterImpl.java @@ -19,7 +19,7 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { - private final KeywordLexicon dictionaryWriter; + private final KeywordLexicon lexicon; private final Logger logger = LoggerFactory.getLogger(getClass()); private final Disposable writerTask; @@ -31,8 +31,8 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { private long pos; @SneakyThrows - public SearchIndexJournalWriterImpl(KeywordLexicon dictionaryWriter, File indexFile) { - this.dictionaryWriter = dictionaryWriter; + public SearchIndexJournalWriterImpl(KeywordLexicon lexicon, File indexFile) { + this.lexicon = lexicon; initializeIndexFile(indexFile); byteBuffer = ByteBuffer.allocate(MAX_BLOCK_SIZE); @@ -113,14 +113,14 @@ public class SearchIndexJournalWriterImpl implements SearchIndexJournalWriter { @Override public void flushWords() { - dictionaryWriter.commitToDisk(); + lexicon.commitToDisk(); } private void writePositionMarker() throws IOException { pos = channel.size(); raf.seek(0); raf.writeLong(pos); - raf.writeLong(dictionaryWriter.size()); + raf.writeLong(lexicon.size()); raf.seek(pos); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java index 667ea6b1..f7b95ec4 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/lexicon/KeywordLexicon.java @@ -53,7 +53,7 @@ public class KeywordLexicon implements AutoCloseable { @SneakyThrows private int getOrInsert(byte[] bytes) { if (bytes.length >= Byte.MAX_VALUE) { - logger.warn("getOrInsert({}), illegal length {}", bytes, bytes.length); + logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length); return DictionaryHashMap.NO_VALUE; } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java index 957769bf..fbb7a5e8 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/model/EdgeUrl.java @@ -96,11 +96,23 @@ public class EdgeUrl implements WideHashable { } public EdgeUrl(URI URI) { - this.domain = new EdgeDomain(URI.getHost()); - this.path = URI.getPath().isEmpty() ? "/" : URI.getPath(); - this.proto = URI.getScheme().toLowerCase(); - this.port = port(URI.getPort(), proto); - this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery()); + try { + String host = URI.getHost(); + + if (host == null) { // deal with a rare serialization error + host = "parse-error.invalid.example.com"; + } + + this.domain = new EdgeDomain(host); + this.path = URI.getPath().isEmpty() ? "/" : URI.getPath(); + this.proto = URI.getScheme().toLowerCase(); + this.port = port(URI.getPort(), proto); + this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery()); + } + catch (Exception ex) { + System.err.println("Failed to parse " + URI); + throw ex; + } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java index d3ed948e..64a1b1c1 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/FeaturesLoaderTool.java @@ -3,12 +3,11 @@ package nu.marginalia.wmsa.edge.tools; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.wmsa.configuration.module.DatabaseModule; import nu.marginalia.wmsa.configuration.server.Context; +import nu.marginalia.wmsa.edge.converting.interpreter.instruction.DocumentKeywords; import nu.marginalia.wmsa.edge.converting.processor.logic.HtmlFeature; import nu.marginalia.wmsa.edge.index.client.EdgeIndexClient; import nu.marginalia.wmsa.edge.index.model.IndexBlock; import nu.marginalia.wmsa.edge.model.EdgeId; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWordSet; -import nu.marginalia.wmsa.edge.model.crawl.EdgePageWords; import java.io.IOException; import java.nio.file.Files; @@ -17,7 +16,6 @@ import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Objects; @@ -34,7 +32,6 @@ public class FeaturesLoaderTool { var linesStream = Files.lines(file)) { var urls = getUrls(ds); - var wordSet = new EdgePageWordSet(new EdgePageWords(IndexBlock.Meta, List.of(feature.getKeyword()))); linesStream .map(urls::get) .filter(Objects::nonNull) @@ -51,8 +48,9 @@ public class FeaturesLoaderTool { throw new RuntimeException(ex); } - client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), wordSet, 0) - .blockingSubscribe(); + client.putWords(Context.internal(), new EdgeId<>(domainId), new EdgeId<>(urlId), + new DocumentKeywords(IndexBlock.Meta, feature.getKeyword()) + , 0); }); } catch (IOException e) {