diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbStatusWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbStatusWriter.java
new file mode 100644
index 00000000..ed3c49bd
--- /dev/null
+++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbStatusWriter.java
@@ -0,0 +1,90 @@
+package nu.marginalia.linkdb;
+
+import nu.marginalia.linkdb.model.UrlStatus;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.sql.Types;
+import java.util.List;
+
+/** Writes URL status records into a SQLite database with a STATUS table. */
+public class LinkdbStatusWriter {
+
+    /** Number of rows queued up before the pending batch is sent to the database. */
+    private static final int BATCH_SIZE = 100;
+
+    private final Connection connection;
+
+    /**
+     * Opens a SQLite connection to {@code outputFile} and runs the schema script
+     * found at the classpath resource {@code db/linkdb-status.sql}.
+     *
+     * @param outputFile path of the SQLite database file
+     * @throws SQLException if the connection cannot be opened or the schema script fails
+     */
+    public LinkdbStatusWriter(Path outputFile) throws SQLException {
+        String connStr = "jdbc:sqlite:" + outputFile.toString();
+        connection = DriverManager.getConnection(connStr);
+
+        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-status.sql");
+             var stmt = connection.createStatement()
+        ) {
+            var sql = new String(stream.readAllBytes());
+            stmt.executeUpdate(sql);
+
+            // Disable synchronous writing as this is a one-off operation with no recovery
+            stmt.execute("PRAGMA synchronous = OFF");
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Inserts the given statuses into the STATUS table, sending them to the
+     * database in batches of at most {@link #BATCH_SIZE} rows.
+     *
+     * @param statuses the records to insert; a null description is stored as SQL NULL
+     * @throws SQLException if an insert fails
+     */
+    public void add(List<UrlStatus> statuses) throws SQLException {
+        try (var stmt = connection.prepareStatement("""
+                INSERT INTO STATUS(ID, URL, STATUS, DESCRIPTION)
+                VALUES (?, ?, ?, ?)
+                """)) {
+            int count = 0;
+            for (var status : statuses) {
+                stmt.setLong(1, status.id());
+                stmt.setString(2, status.url().toString());
+                stmt.setString(3, status.status());
+                if (status.description() == null) {
+                    stmt.setNull(4, Types.VARCHAR);
+                } else {
+                    stmt.setString(4, status.description());
+                }
+                stmt.addBatch();
+
+                // Reset the counter after each flush; otherwise every row past
+                // the threshold would trigger its own executeBatch() round trip
+                if (++count >= BATCH_SIZE) {
+                    stmt.executeBatch();
+                    count = 0;
+                }
+            }
+            if (count != 0) {
+                stmt.executeBatch();
+            }
+        }
+    }
+
+    /**
+     * Closes the underlying database connection.
+     *
+     * @throws SQLException if closing the connection fails
+     */
+    public void close() throws SQLException {
+        connection.close();
+    }
+}
diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java
index f97796de..a819471f 100644
--- a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java
+++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java
@@ -1,7 +1,6 @@
 package nu.marginalia.linkdb;
 
 import nu.marginalia.linkdb.model.UrlDetail;
-import nu.marginalia.linkdb.model.UrlProtocol;
 
 import java.io.IOException;
 import java.nio.file.Path;
@@ -19,7 +18,7 @@ public class LinkdbWriter {
         String connStr = "jdbc:sqlite:" + outputFile.toString();
         connection = DriverManager.getConnection(connStr);
 
-        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb.sql");
+        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb-document.sql");
              var stmt = connection.createStatement()
         ) {
             var sql = new String(stream.readAllBytes());
diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlStatus.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlStatus.java
new file mode 100644
index 00000000..dbb5334a
--- /dev/null
+++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlStatus.java
@@ -0,0 +1,8 @@
+package nu.marginalia.linkdb.model;
+
+import nu.marginalia.model.EdgeUrl;
+
+import javax.annotation.Nullable;
+
+public record UrlStatus(long id, EdgeUrl url, String status, @Nullable String description) {
+}
diff --git
a/code/common/linkdb/src/main/resources/db/linkdb.sql b/code/common/linkdb/src/main/resources/db/linkdb-document.sql similarity index 92% rename from code/common/linkdb/src/main/resources/db/linkdb.sql rename to code/common/linkdb/src/main/resources/db/linkdb-document.sql index 2e4e95b6..a15aeb5e 100644 --- a/code/common/linkdb/src/main/resources/db/linkdb.sql +++ b/code/common/linkdb/src/main/resources/db/linkdb-document.sql @@ -1,5 +1,5 @@ CREATE TABLE DOCUMENT ( - ID LONG PRIMARY KEY, + ID INT8 PRIMARY KEY, URL TEXT, diff --git a/code/common/linkdb/src/main/resources/db/linkdb-status.sql b/code/common/linkdb/src/main/resources/db/linkdb-status.sql new file mode 100644 index 00000000..93fb9a40 --- /dev/null +++ b/code/common/linkdb/src/main/resources/db/linkdb-status.sql @@ -0,0 +1,6 @@ +CREATE TABLE STATUS ( + ID INT8 PRIMARY KEY, + URL TEXT, + STATUS TEXT NOT NULL, + DESCRIPTION TEXT +); diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbStatusWriterTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbStatusWriterTest.java new file mode 100644 index 00000000..e6c4b4b6 --- /dev/null +++ b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbStatusWriterTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.linkdb; + +import nu.marginalia.linkdb.model.UrlStatus; +import nu.marginalia.model.EdgeUrl; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.List; + +public class LinkdbStatusWriterTest { + @Test + public void testCreate() throws IOException { + Path tempPath = Files.createTempFile("linkdb-status", ".db"); + try { + var writer = new LinkdbStatusWriter(tempPath); + writer.add(List.of( + new UrlStatus(5, new EdgeUrl("https://www.marginalia.nu/x"), "y", null), + new UrlStatus(6, new EdgeUrl("https://www.marginalia.nu/y"), "y", "z") + )); + writer.close(); + } 
catch (SQLException e) { + throw new RuntimeException(e); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } finally { + Files.deleteIfExists(tempPath); + } + } +} diff --git a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 1c6fdf1c..a801021d 100644 --- a/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/features-index/index-forward/src/test/java/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -5,6 +5,7 @@ import nu.marginalia.index.journal.model.IndexJournalEntry; import nu.marginalia.index.journal.writer.IndexJournalWriterImpl; import nu.marginalia.index.journal.writer.IndexJournalWriter; import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.ranking.DomainRankings; import nu.marginalia.lexicon.KeywordLexicon; import nu.marginalia.lexicon.journal.KeywordLexiconJournal; @@ -84,8 +85,9 @@ class ForwardIndexConverterTest { } long createId(long url, long domain) { - return (domain << 32) | url; + return UrlIdCodec.encodeId((int) domain, (int) url); } + public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) { int[] factors = getFactorsI(id); diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java index c3602266..d58b93b6 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntry.java @@ -3,6 +3,7 @@ package nu.marginalia.index.journal.model; import 
nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.EdgeId; +import nu.marginalia.model.id.UrlIdCodec; public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { @@ -15,18 +16,7 @@ public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntr long documentMeta) { - return builder(new EdgeId<>(domainId), - new EdgeId<>(urlId), - documentMeta); + return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta); } - public static IndexJournalEntryBuilder builder(EdgeId domainId, - EdgeId urlId, - long documentMeta) { - - - return new IndexJournalEntryBuilder(0, - IndexJournalEntryHeader.combineIds(domainId, urlId), - documentMeta); - } } diff --git a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java index 657b1685..e19fc229 100644 --- a/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java +++ b/code/features-index/index-journal/src/main/java/nu.marginalia.index/journal/model/IndexJournalEntryHeader.java @@ -1,29 +1,17 @@ package nu.marginalia.index.journal.model; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeId; - public record IndexJournalEntryHeader(int entrySize, int documentFeatures, long combinedId, long documentMeta) { - public IndexJournalEntryHeader(EdgeId domainId, + public IndexJournalEntryHeader(long combinedId, int documentFeatures, - EdgeId urlId, long documentMeta) { this(-1, documentFeatures, - combineIds(domainId, urlId), + combinedId, documentMeta); } - static long combineIds(EdgeId domainId, EdgeId urlId) { - long did = domainId.id(); - long uid = urlId.id(); - - return (did << 32L) | uid; - } - } diff --git 
a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java index 7527229c..23584925 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/InstructionTag.java @@ -5,7 +5,6 @@ import nu.marginalia.converting.instruction.instructions.*; public enum InstructionTag { DOMAIN(LoadDomain.class), - URL(LoadUrl.class), LINK(LoadDomainLink.class), REDIRECT(LoadDomainRedirect.class), WORDS(LoadKeywords.class), diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java index a7089b9f..624081c9 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/Interpreter.java @@ -10,7 +10,6 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; public interface Interpreter { - default void loadUrl(EdgeUrl[] url) {} default void loadDomain(EdgeDomain[] domain) {} default void loadRssFeed(EdgeUrl[] rssFeed) {} default void loadDomainLink(DomainLink[] links) {} @@ -19,7 +18,7 @@ public interface Interpreter { default void loadProcessedDocument(LoadProcessedDocument loadProcessedDocument) {} default void loadProcessedDocumentWithError(LoadProcessedDocumentWithError loadProcessedDocumentWithError) {} - default void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) {} + default void loadKeywords(EdgeUrl url, 
int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) {} default void loadDomainRedirect(DomainLink link) {} diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java index 779ec79c..96c78611 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadKeywords.java @@ -7,11 +7,11 @@ import nu.marginalia.converting.instruction.InstructionTag; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.model.EdgeUrl; -public record LoadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { +public record LoadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) implements Instruction { @Override public void apply(Interpreter interpreter) { - interpreter.loadKeywords(url, features, metadata, words); + interpreter.loadKeywords(url, ordinal, features, metadata, words); } @Override diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java index 11885f18..2a43494c 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocument.java @@ -9,7 +9,7 @@ import org.jetbrains.annotations.Nullable; public record LoadProcessedDocument(EdgeUrl url, - 
UrlIndexingState state, + int ordinal, UrlIndexingState state, String title, String description, int htmlFeatures, @@ -17,7 +17,8 @@ public record LoadProcessedDocument(EdgeUrl url, int length, long hash, double quality, - @Nullable Integer pubYear) implements Instruction + @Nullable Integer pubYear +) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java index 28e42f5d..a1a42a90 100644 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java +++ b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadProcessedDocumentWithError.java @@ -9,7 +9,8 @@ import nu.marginalia.model.EdgeUrl; public record LoadProcessedDocumentWithError(EdgeUrl url, UrlIndexingState state, - String reason) implements Instruction + String reason, + int ordinal) implements Instruction { @Override public void apply(Interpreter interpreter) { diff --git a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java b/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java deleted file mode 100644 index d126a515..00000000 --- a/code/process-models/converting-model/src/main/java/nu/marginalia/converting/instruction/instructions/LoadUrl.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.converting.instruction.instructions; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.InstructionTag; -import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.model.EdgeUrl; - 
-import java.util.Arrays; - -public record LoadUrl(EdgeUrl... url) implements Instruction { - - @Override - public void apply(Interpreter interpreter) { - interpreter.loadUrl(url); - } - - @Override - public String toString() { - return getClass().getSimpleName()+"["+ Arrays.toString(url)+"]"; - } - - @Override - public InstructionTag tag() { - return InstructionTag.URL; - } - - @Override - public boolean isNoOp() { - return url.length == 0; - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java index c3683cd0..e3b68629 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/InstructionWriterFactory.java @@ -7,9 +7,7 @@ import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.Interpreter; import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.converting.instruction.instructions.DomainLink; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; @@ -130,7 +128,7 @@ public class InstructionWriterFactory { } @Override - public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { keywords++; } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java index 21a610fb..b3cb2a9f 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.compiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.LoadKeywords; import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.crawl.HtmlFeature; @@ -11,32 +12,43 @@ import java.util.function.Consumer; public class DocumentsCompiler { - public void compile(Consumer instructionConsumer, List documents) { - - for (var doc : documents) { - compileDocumentDetails(instructionConsumer, doc); - } - - for (var doc : documents) { - compileWords(instructionConsumer, doc); - } - - } - - public void compileDocumentDetails(Consumer instructionConsumer, ProcessedDocument doc) { + public void compileDocumentDetails(Consumer instructionConsumer, + ProcessedDocument doc, + int ordinal) { var details = doc.details; if (details != null) { - instructionConsumer.accept(new LoadProcessedDocument(doc.url, doc.state, details.title, details.description, HtmlFeature.encode(details.features), details.standard.name(), details.length, details.hashCode, details.quality, details.pubYear)); + instructionConsumer.accept(new LoadProcessedDocument(doc.url, + ordinal, + doc.state, + details.title, + details.description, + HtmlFeature.encode(details.features), + details.standard.name(), + details.length, + details.hashCode, + details.quality, + details.pubYear + )); + } + else { + instructionConsumer.accept(new 
LoadProcessedDocumentWithError( + doc.url, + doc.state, + doc.stateReason, + ordinal + )); } } public void compileWords(Consumer instructionConsumer, - ProcessedDocument doc) { + ProcessedDocument doc, + int ordinal) { var words = doc.words; if (words != null) { instructionConsumer.accept(new LoadKeywords(doc.url, + ordinal, HtmlFeature.encode(doc.details.features), doc.details.metadata, words.build()) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java index 87f28e3c..65d2e989 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/InstructionsCompiler.java @@ -6,7 +6,6 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDomain; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; -import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,7 +16,6 @@ import java.util.function.Consumer; import static java.util.Objects.requireNonNullElse; public class InstructionsCompiler { - private final UrlsCompiler urlsCompiler; private final DocumentsCompiler documentsCompiler; private final DomainMetadataCompiler domainMetadataCompiler; private final FeedsCompiler feedsCompiler; @@ -27,14 +25,12 @@ public class InstructionsCompiler { private final Logger logger = LoggerFactory.getLogger(InstructionsCompiler.class); @Inject - public InstructionsCompiler(UrlsCompiler urlsCompiler, - DocumentsCompiler documentsCompiler, + public InstructionsCompiler(DocumentsCompiler documentsCompiler, DomainMetadataCompiler domainMetadataCompiler, FeedsCompiler feedsCompiler, LinksCompiler 
linksCompiler, RedirectCompiler redirectCompiler) { - this.urlsCompiler = urlsCompiler; this.documentsCompiler = documentsCompiler; this.domainMetadataCompiler = domainMetadataCompiler; this.feedsCompiler = feedsCompiler; @@ -47,8 +43,13 @@ public class InstructionsCompiler { instructionConsumer.accept(new LoadProcessedDomain(domain.domain, domain.state, domain.ip)); if (domain.documents != null) { - urlsCompiler.compile(instructionConsumer, domain.documents); - documentsCompiler.compile(instructionConsumer, domain.documents); + + int ordinal = 0; + for (var doc : domain.documents) { + documentsCompiler.compileDocumentDetails(instructionConsumer, doc, ordinal); + documentsCompiler.compileWords(instructionConsumer, doc, ordinal); + ordinal++; + } feedsCompiler.compile(instructionConsumer, domain.documents); linksCompiler.compile(instructionConsumer, domain.domain, domain.documents); @@ -63,7 +64,6 @@ public class InstructionsCompiler { public void compileStreaming(SideloadSource sideloadSource, Consumer instructionConsumer) { ProcessedDomain domain = sideloadSource.getDomain(); - Iterator urlsIterator = sideloadSource.getUrlsIterator(); Iterator documentsIterator = sideloadSource.getDocumentsStream(); // Guaranteed to always be first @@ -72,11 +72,6 @@ public class InstructionsCompiler { int countAll = 0; int countGood = 0; - logger.info("Writing domains"); - urlsCompiler.compileJustDomain(instructionConsumer, domain.domain); - logger.info("Writing urls"); - urlsCompiler.compileJustUrls(instructionConsumer, urlsIterator); - logger.info("Writing docs"); while (documentsIterator.hasNext()) { @@ -84,8 +79,8 @@ public class InstructionsCompiler { countAll++; if (doc.isOk()) countGood++; - documentsCompiler.compileDocumentDetails(instructionConsumer, doc); - documentsCompiler.compileWords(instructionConsumer, doc); + documentsCompiler.compileDocumentDetails(instructionConsumer, doc, countAll); + documentsCompiler.compileWords(instructionConsumer, doc, countAll); } 
domainMetadataCompiler.compileFake(instructionConsumer, domain.domain, countAll, countGood); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java index e100cb86..e84a7c54 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/LinksCompiler.java @@ -2,26 +2,34 @@ package nu.marginalia.converting.compiler; import nu.marginalia.converting.instruction.Instruction; import nu.marginalia.converting.instruction.instructions.DomainLink; +import nu.marginalia.converting.instruction.instructions.LoadDomain; import nu.marginalia.converting.instruction.instructions.LoadDomainLink; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeDomain; +import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.function.Consumer; public class LinksCompiler { - public void compile(Consumer instructionConsumer, EdgeDomain from, List documents) { + public void compile(Consumer instructionConsumer, + EdgeDomain from, + List documents) { - DomainLink[] links = documents.stream().map(doc -> doc.details) + EdgeDomain[] domains = documents.stream() + .map(doc -> doc.details) .filter(Objects::nonNull) - .flatMap(dets -> dets.linksExternal.stream()) + .flatMap(details -> details.linksExternal.stream()) .map(link -> link.domain) .distinct() - .map(domain -> new DomainLink(from, domain)) - .toArray(DomainLink[]::new); + .toArray(EdgeDomain[]::new); + DomainLink[] links = new DomainLink[domains.length]; + Arrays.setAll(links, i -> new DomainLink(from, domains[i])); + + instructionConsumer.accept(new LoadDomain(domains)); instructionConsumer.accept(new LoadDomainLink(links)); } } diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java deleted file mode 100644 index ee4f3cbe..00000000 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java +++ /dev/null @@ -1,77 +0,0 @@ -package nu.marginalia.converting.compiler; - -import nu.marginalia.converting.instruction.Instruction; -import nu.marginalia.converting.instruction.instructions.LoadDomain; -import nu.marginalia.converting.instruction.instructions.LoadUrl; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.*; -import java.util.function.Consumer; - -public class UrlsCompiler { - - private static final int MAX_INTERNAL_LINKS = 25; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public void compile(Consumer instructionConsumer, List documents) { - Set seenUrls = new HashSet<>(documents.size()*4); - Set seenDomains = new HashSet<>(documents.size()); - - for (var doc : documents) { - if (doc.url == null) { - logger.warn("Discovered document with null URL"); - continue; - } - - seenUrls.add(doc.url); - - if (doc.details == null) { - continue; - } - - // Add *some* external links; to avoid loading too many and gunking up the database with nonsense, - // only permit this once per external domain per crawled domain - for (var url : doc.details.linksExternal) { - if (seenDomains.add(url.domain)) { - seenUrls.add(url); - } - } - - if (doc.isOk()) { - // Don't load more than a few from linksInternal, grows too big for no reason - var linksToAdd = new ArrayList<>(doc.details.linksInternal); - if (linksToAdd.size() > MAX_INTERNAL_LINKS) { - linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear(); - } - 
seenUrls.addAll(linksToAdd); - } - } - - instructionConsumer.accept(new LoadDomain(seenDomains.toArray(EdgeDomain[]::new))); - instructionConsumer.accept(new LoadUrl(seenUrls.toArray(EdgeUrl[]::new))); - } - - public void compileJustUrls(Consumer instructionConsumer, Iterator urlsIterator) { - var urls = new ArrayList(1000); - - while (urlsIterator.hasNext()) { - if (urls.size() >= 1000) { - instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); - urls.clear(); - } - - urls.add(urlsIterator.next()); - } - if (!urls.isEmpty()) { - instructionConsumer.accept(new LoadUrl(urls.toArray(EdgeUrl[]::new))); - } - } - - public void compileJustDomain(Consumer instructionConsumer, EdgeDomain domain) { - instructionConsumer.accept(new LoadDomain(domain)); - } -} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java index ae07b6c3..5a5a6855 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/EncyclopediaMarginaliaNuSideloader.java @@ -64,25 +64,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC return ret; } - @Override - @SneakyThrows - public Iterator getUrlsIterator() { - EdgeUrl base = new EdgeUrl("https://encyclopedia.marginalia.nu/"); - - return new SqlQueryIterator<>(connection.prepareStatement(""" - SELECT url, html FROM articles - """)) - { - @Override - public EdgeUrl convert(ResultSet rs) throws Exception { - var path = URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8); - - return base.withPathAndParam("/article/"+path, null); - } - }; - } - - @SneakyThrows @Override public Iterator getDocumentsStream() { diff --git 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java index d23a81ae..3d07d56a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSource.java @@ -8,7 +8,6 @@ import java.util.Iterator; public interface SideloadSource { ProcessedDomain getDomain(); - Iterator getUrlsIterator(); Iterator getDocumentsStream(); String getId(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java index 97a37ac9..0f40639b 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/StackexchangeSideloader.java @@ -22,7 +22,6 @@ import java.nio.file.Path; import java.util.EnumSet; import java.util.Iterator; import java.util.List; -import java.util.Optional; /** This code is broken */ @Deprecated() @@ -55,17 +54,6 @@ public class StackexchangeSideloader implements SideloadSource { return ret; } - @SneakyThrows - @Override - public Iterator getUrlsIterator() { - var ids = reader.getIds(); - return ids.stream() - .map(id -> EdgeUrl.parse("https://" + domainName + "/questions/" + id)) - .filter(Optional::isPresent) - .map(Optional::get) - .iterator(); - } - @Override public Iterator getDocumentsStream() { try { diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index eb705bbf..df07e5f7 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -27,6 +27,7 
@@ dependencies { implementation project(':code:common:service') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') + implementation project(':code:common:linkdb') implementation project(':code:features-index:lexicon') implementation project(':code:features-index:index-journal') implementation project(':code:libraries:message-queue') diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java index 99266e13..1fc5e49d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderMain.java @@ -7,9 +7,9 @@ import com.google.inject.Inject; import com.google.inject.Injector; import lombok.SneakyThrows; import nu.marginalia.converting.instruction.Interpreter; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; import nu.marginalia.db.storage.FileStorageService; import nu.marginalia.keyword.model.DocumentKeywords; +import nu.marginalia.linkdb.LinkdbWriter; import nu.marginalia.loading.loader.IndexLoadKeywords; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.idx.DocumentMetadata; @@ -45,6 +45,7 @@ public class LoaderMain { private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final IndexLoadKeywords indexLoadKeywords; + private final LinkdbWriter writer; private final Gson gson; public static void main(String... 
args) throws Exception { @@ -73,6 +74,7 @@ public class LoaderMain { MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, IndexLoadKeywords indexLoadKeywords, + LinkdbWriter writer, Gson gson ) { @@ -82,6 +84,7 @@ public class LoaderMain { this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; this.indexLoadKeywords = indexLoadKeywords; + this.writer = writer; this.gson = gson; heartbeat.start(); @@ -136,6 +139,7 @@ public class LoaderMain { // This needs to be done in order to have a readable index journal indexLoadKeywords.close(); + writer.close(); logger.info("Loading finished"); } catch (Exception ex) { @@ -215,7 +219,7 @@ public class LoaderMain { public class InstructionCounter implements Interpreter { private int count = 0; - public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { count++; } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java index a2df0ea9..920abf19 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/LoaderModule.java @@ -2,21 +2,29 @@ package nu.marginalia.loading; import com.google.gson.Gson; import com.google.inject.AbstractModule; +import com.google.inject.Inject; +import com.google.inject.Provides; +import com.google.inject.Singleton; import com.google.inject.name.Names; import nu.marginalia.LanguageModels; import nu.marginalia.ProcessConfiguration; import nu.marginalia.WmsaHome; -import plan.CrawlPlan; +import nu.marginalia.db.storage.FileStorageService; +import nu.marginalia.db.storage.model.FileStorageType; +import 
nu.marginalia.linkdb.LinkdbStatusWriter; +import nu.marginalia.linkdb.LinkdbWriter; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.service.SearchServiceDescriptors; import nu.marginalia.service.descriptor.ServiceDescriptors; +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.sql.SQLException; import java.util.UUID; public class LoaderModule extends AbstractModule { - public LoaderModule() { } @@ -25,11 +33,32 @@ public class LoaderModule extends AbstractModule { bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("loader", 0, UUID.randomUUID())); bind(Gson.class).toProvider(this::createGson); - bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(Path.of(System.getProperty("local-index-path", "/vol"))); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } + /* Provides the writer for links.db in the LINKDB_STAGING storage area, replacing any previous file. */ + @Inject @Provides @Singleton + private LinkdbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException { + var storage = service.getStorageByType(FileStorageType.LINKDB_STAGING); + Path dbPath = storage.asPath().resolve("links.db"); + + /* deleteIfExists removes a stale database without the exists()/delete() check-then-act race */ + Files.deleteIfExists(dbPath); + return new LinkdbWriter(dbPath); + } + + /* Provides the writer for urlstatus.db in the LINKDB_STAGING storage area, replacing any previous file. */ + @Inject @Provides @Singleton + private LinkdbStatusWriter createLinkdbStatusWriter(FileStorageService service) throws SQLException, IOException { + var storage = service.getStorageByType(FileStorageType.LINKDB_STAGING); + Path dbPath = storage.asPath().resolve("urlstatus.db"); + + /* deleteIfExists removes a stale database without the exists()/delete() check-then-act race */ + Files.deleteIfExists(dbPath); + return new LinkdbStatusWriter(dbPath); + } + private Gson createGson() { return GsonFactory.get(); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java index ae914d3d..c1d502e5 100644 --- 
a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/IndexLoadKeywords.java @@ -3,9 +3,9 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.keyword.model.DocumentKeywords; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.id.EdgeId; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,8 +18,7 @@ public class IndexLoadKeywords implements Runnable { private final LinkedBlockingQueue insertQueue = new LinkedBlockingQueue<>(32); private final LoaderIndexJournalWriter journalWriter; - private record InsertTask(int urlId, - int domainId, + private record InsertTask(long combinedId, int features, DocumentMetadata metadata, DocumentKeywords wordSet) {} @@ -40,7 +39,7 @@ public class IndexLoadKeywords implements Runnable { while (!canceled) { var data = insertQueue.poll(1, TimeUnit.SECONDS); if (data != null) { - journalWriter.putWords(new EdgeId<>(data.domainId), new EdgeId<>(data.urlId), + journalWriter.putWords(data.combinedId, data.features, data.metadata(), data.wordSet); @@ -57,18 +56,18 @@ public class IndexLoadKeywords implements Runnable { } public void load(LoaderData loaderData, + int ordinal, EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) throws InterruptedException { - int domainId = loaderData.getDomainId(url.domain); - int urlId = loaderData.getUrlId(url); + long combinedId = UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal); - if (urlId <= 0 || domainId <= 0) { - logger.warn("Failed to get IDs for {} -- d={},u={}", url, domainId, urlId); + if (combinedId <= 0) { + logger.warn("Failed to get IDs for {} -- c={}", url, combinedId); return; } - insertQueue.put(new InsertTask(urlId, domainId, features, metadata, 
words)); + insertQueue.put(new InsertTask(combinedId, features, metadata, words)); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java new file mode 100644 index 00000000..7c460d63 --- /dev/null +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LdbLoadProcessedDocument.java @@ -0,0 +1,83 @@ +package nu.marginalia.loading.loader; + +import com.google.inject.Inject; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; +import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; +import nu.marginalia.linkdb.LinkdbStatusWriter; +import nu.marginalia.linkdb.LinkdbWriter; +import nu.marginalia.linkdb.model.UrlDetail; +import nu.marginalia.linkdb.model.UrlStatus; +import nu.marginalia.model.id.UrlIdCodec; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; + +public class LdbLoadProcessedDocument { + private static final Logger logger = LoggerFactory.getLogger(LdbLoadProcessedDocument.class); + private final LinkdbWriter linkdbWriter; + private final LinkdbStatusWriter linkdbStatusWriter; + + @Inject + public LdbLoadProcessedDocument(LinkdbWriter linkdbWriter, + LinkdbStatusWriter linkdbStatusWriter + ) { + this.linkdbWriter = linkdbWriter; + this.linkdbStatusWriter = linkdbStatusWriter; + } + + public void load(LoaderData data, List documents) { + var details = new ArrayList(); + + int domainId = data.getTargetDomainId(); + var statusList = new ArrayList(); + + for (var document : documents) { + long id = UrlIdCodec.encodeId(domainId, document.ordinal()); + details.add(new UrlDetail( + id, + document.url(), + document.title(), + document.description(), + document.quality(), + document.standard(), + document.htmlFeatures(), 
+ document.pubYear(), + document.hash(), + document.length() + )); + statusList.add(new UrlStatus(id, document.url(), document.state().toString(), null)); + } + + try { + linkdbWriter.add(details); + } + catch (SQLException ex) { + logger.warn("Failed to add processed documents to linkdb", ex); + } + + /* statusList was built above but never persisted; write it in its own try so a failure in one database does not skip the other */ + try { + linkdbStatusWriter.add(statusList); + } + catch (SQLException ex) { + logger.warn("Failed to add document statuses to linkdb", ex); + } + } + + public void loadWithError(LoaderData data, List documents) { + var statusList = new ArrayList(); + int domainId = data.getTargetDomainId(); + + for (var document : documents) { + statusList.add(new UrlStatus( + UrlIdCodec.encodeId(domainId, document.ordinal()), + document.url(), + document.state().toString(), + document.reason() + )); + } + + try { + linkdbStatusWriter.add(statusList); + } + catch (SQLException ex) { + logger.warn("Failed to add processed documents to linkdb", ex); + } + } + +} diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java index 80b6afec..3e9c2073 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/Loader.java @@ -16,11 +16,10 @@ import java.util.ArrayList; import java.util.List; public class Loader implements Interpreter, AutoCloseable { - private final SqlLoadUrls sqlLoadUrls; private final SqlLoadDomains sqlLoadDomains; private final SqlLoadDomainLinks sqlLoadDomainLinks; private final SqlLoadProcessedDomain sqlLoadProcessedDomain; - private final SqlLoadProcessedDocument sqlLoadProcessedDocument; + private final LdbLoadProcessedDocument loadProcessedDocument; private final SqlLoadDomainMetadata sqlLoadDomainMetadata; private final IndexLoadKeywords indexLoadKeywords; @@ -34,21 +33,19 @@ public class Loader implements Interpreter, AutoCloseable { public final LoaderData data; public Loader(int sizeHint, - SqlLoadUrls sqlLoadUrls, SqlLoadDomains sqlLoadDomains, SqlLoadDomainLinks 
sqlLoadDomainLinks, SqlLoadProcessedDomain sqlLoadProcessedDomain, - SqlLoadProcessedDocument sqlLoadProcessedDocument, + LdbLoadProcessedDocument loadProcessedDocument, SqlLoadDomainMetadata sqlLoadDomainMetadata, IndexLoadKeywords indexLoadKeywords) { data = new LoaderData(sizeHint); - this.sqlLoadUrls = sqlLoadUrls; this.sqlLoadDomains = sqlLoadDomains; this.sqlLoadDomainLinks = sqlLoadDomainLinks; this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; - this.sqlLoadProcessedDocument = sqlLoadProcessedDocument; + this.loadProcessedDocument = loadProcessedDocument; this.sqlLoadDomainMetadata = sqlLoadDomainMetadata; this.indexLoadKeywords = indexLoadKeywords; @@ -56,12 +53,6 @@ public class Loader implements Interpreter, AutoCloseable { processedDocumentWithErrorList = new ArrayList<>(sizeHint); } - - @Override - public void loadUrl(EdgeUrl[] urls) { - sqlLoadUrls.load(data, urls); - } - @Override public void loadDomain(EdgeDomain[] domains) { sqlLoadDomains.load(data, domains); @@ -87,25 +78,23 @@ public class Loader implements Interpreter, AutoCloseable { processedDocumentList.add(document); if (processedDocumentList.size() > 100) { - sqlLoadProcessedDocument.load(data, processedDocumentList); + loadProcessedDocument.load(data, processedDocumentList); processedDocumentList.clear(); } } - @Override public void loadProcessedDocumentWithError(LoadProcessedDocumentWithError document) { processedDocumentWithErrorList.add(document); if (processedDocumentWithErrorList.size() > 100) { - sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + loadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); processedDocumentWithErrorList.clear(); } } - @Override - public void loadKeywords(EdgeUrl url, int features, DocumentMetadata metadata, DocumentKeywords words) { + public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) { try { - indexLoadKeywords.load(data, url, features, 
metadata, words); + indexLoadKeywords.load(data, ordinal, url, features, metadata, words); } catch (InterruptedException e) { throw new RuntimeException(e); } @@ -123,10 +112,10 @@ public class Loader implements Interpreter, AutoCloseable { public void close() { if (processedDocumentList.size() > 0) { - sqlLoadProcessedDocument.load(data, processedDocumentList); + loadProcessedDocument.load(data, processedDocumentList); } if (processedDocumentWithErrorList.size() > 0) { - sqlLoadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); + loadProcessedDocument.loadWithError(data, processedDocumentWithErrorList); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java index 570cb579..3ec604e6 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderData.java @@ -2,17 +2,15 @@ package nu.marginalia.loading.loader; import gnu.trove.map.hash.TObjectIntHashMap; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; public class LoaderData { - private final TObjectIntHashMap urlIds; private final TObjectIntHashMap domainIds; private EdgeDomain targetDomain; public final int sizeHint; + private int targetDomainId = -1; public LoaderData(int sizeHint) { - urlIds = new TObjectIntHashMap<>(sizeHint+1); domainIds = new TObjectIntHashMap<>(10); this.sizeHint = sizeHint; } @@ -23,20 +21,16 @@ public class LoaderData { public EdgeDomain getTargetDomain() { return targetDomain; } - + public int getTargetDomainId() { + if (targetDomainId <= 0) /* a missed lookup yields 0, not a negative value: retry instead of caching "no ID" forever */ + targetDomainId = domainIds.get(targetDomain); + return targetDomainId; + } public void addDomain(EdgeDomain domain, int id) { domainIds.put(domain, id); } - public void addUrl(EdgeUrl url, int id) { - urlIds.put(url, id); - } - - public 
int getUrlId(EdgeUrl url) { - return urlIds.get(url); - } - public int getDomainId(EdgeDomain domain) { return domainIds.get(domain); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java index 21435ac0..c2edc32a 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderFactory.java @@ -3,24 +3,21 @@ package nu.marginalia.loading.loader; import com.google.inject.Inject; public class LoaderFactory { - private final SqlLoadUrls sqlLoadUrls; private final SqlLoadDomains sqlLoadDomains; private final SqlLoadDomainLinks sqlLoadDomainLinks; private final SqlLoadProcessedDomain sqlLoadProcessedDomain; - private final SqlLoadProcessedDocument sqlLoadProcessedDocument; + private final LdbLoadProcessedDocument sqlLoadProcessedDocument; private final SqlLoadDomainMetadata sqlLoadDomainMetadata; private final IndexLoadKeywords indexLoadKeywords; @Inject - public LoaderFactory(SqlLoadUrls sqlLoadUrls, - SqlLoadDomains sqlLoadDomains, + public LoaderFactory(SqlLoadDomains sqlLoadDomains, SqlLoadDomainLinks sqlLoadDomainLinks, SqlLoadProcessedDomain sqlLoadProcessedDomain, - SqlLoadProcessedDocument sqlLoadProcessedDocument, + LdbLoadProcessedDocument sqlLoadProcessedDocument, SqlLoadDomainMetadata sqlLoadDomainMetadata, IndexLoadKeywords indexLoadKeywords) { - this.sqlLoadUrls = sqlLoadUrls; this.sqlLoadDomains = sqlLoadDomains; this.sqlLoadDomainLinks = sqlLoadDomainLinks; this.sqlLoadProcessedDomain = sqlLoadProcessedDomain; @@ -30,6 +27,6 @@ public class LoaderFactory { } public Loader create(int sizeHint) { - return new Loader(sizeHint, sqlLoadUrls, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords); + return new 
Loader(sizeHint, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords); } } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java index 4aabdcea..d33f9c3d 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/LoaderIndexJournalWriter.java @@ -59,17 +59,17 @@ public class LoaderIndexJournalWriter { new ThreadPoolExecutor(8, 16, 1, TimeUnit.MINUTES, keywordInsertTaskQueue); @SneakyThrows - public void putWords(EdgeId domain, EdgeId url, + public void putWords(long combinedId, int features, DocumentMetadata metadata, DocumentKeywords wordSet) { if (wordSet.keywords().length == 0) { - logger.info("Skipping zero-length word set for {}:{}", domain, url); + logger.info("Skipping zero-length word set for {}", combinedId); return; } - if (domain.id() <= 0 || url.id() <= 0) { - logger.warn("Bad ID: {}:{}", domain, url); + if (combinedId <= 0) { + logger.warn("Bad ID: {}", combinedId); return; } @@ -77,27 +77,26 @@ public class LoaderIndexJournalWriter { // with a chonky work queue is a fairly decent improvement for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) { try { - keywordInsertionExecutor.submit(() -> loadWords(domain, url, features, metadata, chunk)); + keywordInsertionExecutor.submit(() -> loadWords(combinedId, features, metadata, chunk)); } catch (RejectedExecutionException ex) { - loadWords(domain, url, features, metadata, chunk); + loadWords(combinedId, features, metadata, chunk); } } } - private void loadWords(EdgeId domain, - EdgeId url, + private void loadWords(long combinedId, int features, DocumentMetadata metadata, DocumentKeywords wordSet) 
{ if (null == metadata) { - logger.warn("Null metadata for {}:{}", domain, url); + logger.warn("Null metadata for {}", combinedId); return; } var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata())); - var header = new IndexJournalEntryHeader(domain, features, url, metadata.encode()); + var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode()); indexWriter.put(header, entry); } diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java deleted file mode 100644 index 909ec986..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDocument.java +++ /dev/null @@ -1,187 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError; -import org.apache.commons.lang3.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.sql.Types; -import java.util.List; - -import static java.sql.Statement.SUCCESS_NO_INFO; - -public class SqlLoadProcessedDocument { - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDocument.class); - - @Inject - public SqlLoadProcessedDocument(HikariDataSource dataSource) { - this.dataSource = dataSource; - - try (var conn = dataSource.getConnection()) { - try (var stmt = conn.createStatement()) { - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_PAGE_VISIT"); - stmt.execute("DROP PROCEDURE IF EXISTS INSERT_PAGE_VISIT_BAD"); - stmt.execute(""" - CREATE PROCEDURE INSERT_PAGE_VISIT ( - IN URL_ID INT, - IN STATE 
VARCHAR(32), - IN TITLE VARCHAR(255), - IN DESCRIPTION VARCHAR(255), - IN LENGTH INT, - IN FEATURES INT, - IN STANDARD VARCHAR(32), - IN QUALITY DOUBLE, - IN HASH BIGINT, - IN PUB_YEAR SMALLINT) - BEGIN - SET FOREIGN_KEY_CHECKS=0; - REPLACE INTO EC_PAGE_DATA(ID, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR) VALUES (URL_ID, TITLE, DESCRIPTION, LENGTH, STANDARD, FEATURES, HASH, QUALITY, PUB_YEAR); - UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - SET FOREIGN_KEY_CHECKS=1; - END - """); - stmt.execute(""" - CREATE PROCEDURE INSERT_PAGE_VISIT_BAD ( - IN URL_ID INT, - IN STATE VARCHAR(32)) - BEGIN - UPDATE EC_URL SET VISITED=1, STATE=STATE WHERE ID=URL_ID; - DELETE FROM EC_PAGE_DATA WHERE ID=URL_ID; - END - """); - - } - } - catch (SQLException ex) { - throw new RuntimeException("Failed to set up loader", ex); - } - } - - public void load(LoaderData data, List documents) { - - try (var conn = dataSource.getConnection()) { - try (var insertCall = conn.prepareCall("CALL INSERT_PAGE_VISIT(?, ?, ?, ?, ?, ?, ?, ?, ?, ?)") - ) { - conn.setAutoCommit(false); - - int cnt = 0; - int batchOffset = 0; - for (var doc : documents) { - int urlId = data.getUrlId(doc.url()); - if (urlId <= 0) { - logger.warn("Failed to resolve ID for URL {}", doc.url()); - continue; - } - - insertCall.setInt(1, urlId); - insertCall.setString(2, doc.state().name()); - insertCall.setString(3, doc.title()); - insertCall.setString(4, StringUtils.truncate(doc.description(), 255)); - insertCall.setInt(5, doc.length()); - insertCall.setInt(6, doc.htmlFeatures()); - insertCall.setString(7, doc.standard()); - insertCall.setDouble(8, doc.quality()); - insertCall.setLong(9, doc.hash()); - if (doc.pubYear() != null) { - insertCall.setShort(10, (short) doc.pubYear().intValue()); - } else { - insertCall.setInt(10, Types.SMALLINT); - } - insertCall.addBatch(); - - if (++cnt == 100) { - var ret = insertCall.executeBatch(); - conn.commit(); - - for (int rv = 0; rv < cnt; 
rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - - cnt = 0; - batchOffset += 100; - } - } - if (cnt > 0) { - var ret = insertCall.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - } - - conn.setAutoCommit(true); - } - catch (SQLException ex) { - conn.rollback(); - throw ex; - } - } catch (SQLException ex) { - logger.warn("SQL error inserting document", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - } - - public void loadWithError(LoaderData data, List documents) { - - try (var conn = dataSource.getConnection(); - var stmt = conn.prepareCall("CALL INSERT_PAGE_VISIT_BAD(?, ?)")) { - - conn.setAutoCommit(false); - - int cnt = 0; int batchOffset = 0; - for (var doc : documents) { - int urlId = data.getUrlId(doc.url()); - if (urlId < 0) { - logger.warn("Failed to resolve ID for URL {}", doc.url()); - return; - } - - stmt.setInt(1, urlId); - stmt.setString(2, doc.state().name()); - stmt.addBatch(); - - if (++cnt == 100) { - var ret = stmt.executeBatch(); - conn.commit(); - - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - - cnt = 0; - batchOffset += 100; - } - } - if (cnt > 0) { - var ret = stmt.executeBatch(); - conn.commit(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", documents.get(batchOffset + rv), ret[rv]); - } - } - } - - conn.setAutoCommit(true); - } catch (SQLException ex) { - logger.warn("SQL error inserting failed document", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - - } -} 
diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java index 9ac576af..17a423ed 100644 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java +++ b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadProcessedDomain.java @@ -14,14 +14,12 @@ import java.sql.SQLException; public class SqlLoadProcessedDomain { private final HikariDataSource dataSource; private final SqlLoadDomains loadDomains; - private final SqlLoadUrls loadUrls; private static final Logger logger = LoggerFactory.getLogger(SqlLoadProcessedDomain.class); @Inject - public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains, SqlLoadUrls loadUrls) { + public SqlLoadProcessedDomain(HikariDataSource dataSource, SqlLoadDomains loadDomains) { this.dataSource = dataSource; this.loadDomains = loadDomains; - this.loadUrls = loadUrls; try (var conn = dataSource.getConnection()) { @@ -69,8 +67,6 @@ public class SqlLoadProcessedDomain { if (rc < 1) { logger.warn("load({},{}) -- bad rowcount {}", domain, state, rc); } - - loadUrls.loadUrlsForDomain(data, domain, 0); } catch (SQLException ex) { conn.rollback(); diff --git a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java b/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java deleted file mode 100644 index 4ef1509e..00000000 --- a/code/processes/loading-process/src/main/java/nu/marginalia/loading/loader/SqlLoadUrls.java +++ /dev/null @@ -1,151 +0,0 @@ -package nu.marginalia.loading.loader; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.slf4j.Logger; -import 
org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.sql.Types; -import java.util.HashSet; -import java.util.Set; - -import static java.sql.Statement.SUCCESS_NO_INFO; - -public class SqlLoadUrls { - - private final HikariDataSource dataSource; - private static final Logger logger = LoggerFactory.getLogger(SqlLoadUrls.class); - - @Inject - public SqlLoadUrls(HikariDataSource dataSource) { - this.dataSource = dataSource; - } - private final MurmurHash3_128 murmurHash = new MurmurHash3_128(); - - public void load(LoaderData data, EdgeUrl[] urls) { - Set affectedDomains = new HashSet<>(); - - if (urls.length == 0) - return; - - int maxOldId = 0; - try (var conn = dataSource.getConnection()) { - - try (var insertStmt = conn.prepareStatement("INSERT IGNORE INTO EC_URL (PROTO,DOMAIN_ID,PORT,PATH,PARAM,PATH_HASH) VALUES (?,?,?,?,?,?)"); - var queryMaxId = conn.prepareStatement("SELECT MAX(ID) FROM EC_URL")) { - - conn.setAutoCommit(false); - - var rs = queryMaxId.executeQuery(); - if (rs.next()) { - maxOldId = rs.getInt(1); - } - - int cnt = 0; - int batchOffset = 0; - - for (var url : urls) { - if (data.getUrlId(url) != 0) - continue; - - if (url.path.length() >= 255) { - logger.info("Skipping bad URL {}", url); - continue; - } - var domainId = data.getDomainId(url.domain); - - affectedDomains.add(url.domain); - - insertStmt.setString(1, url.proto); - insertStmt.setInt(2, domainId); - if (url.port != null) { - insertStmt.setInt(3, url.port); - } else { - insertStmt.setNull(3, Types.INTEGER); - } - insertStmt.setString(4, url.path); - insertStmt.setString(5, url.param); - insertStmt.setLong(6, hashPath(url.path, url.param)); - insertStmt.addBatch(); - - if (++cnt == 1000) { - var ret = insertStmt.executeBatch(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); - } - } - - batchOffset += cnt; - cnt = 0; - } - } - - if (cnt > 0) { - var ret 
= insertStmt.executeBatch(); - for (int rv = 0; rv < cnt; rv++) { - if (ret[rv] < 0 && ret[rv] != SUCCESS_NO_INFO) { - logger.warn("load({}) -- bad row count {}", urls[batchOffset + rv], ret[rv]); - } - } - } - - conn.commit(); - conn.setAutoCommit(true); - - for (var domain : affectedDomains) { - loadUrlsForDomain(data, domain, maxOldId); - } - } - catch (SQLException ex) { - conn.rollback(); - throw ex; - } - } - catch (SQLException ex) { - logger.warn("SQL error inserting URLs", ex); - - if (getClass().desiredAssertionStatus()) - throw new RuntimeException(ex); - } - } - - /* We use a uniqueness constraint on DOMAIN_ID and this hash instead of on the PATH and PARAM - * fields as the uniqueness index grows absurdly large for some reason, possibly due to the prevalent - * shared leading substrings in paths? - */ - private long hashPath(String path, String queryParam) { - long hash = murmurHash.hashNearlyASCII(path); - if (queryParam != null) { - hash ^= murmurHash.hashNearlyASCII(queryParam); - } - return hash; - } - - /** Loads urlIDs for the domain into `data` from the database, starting at URL ID minId. */ - public void loadUrlsForDomain(LoaderData data, EdgeDomain domain, int minId) throws SQLException { - try (var conn = dataSource.getConnection(); - var queryCall = conn.prepareStatement("SELECT ID, PROTO, PATH, PARAM FROM EC_URL WHERE DOMAIN_ID=? 
AND ID > ?")) { - - queryCall.setFetchSize(1000); - queryCall.setInt(1, data.getDomainId(domain)); - queryCall.setInt(2, minId); - - var rsp = queryCall.executeQuery(); - - while (rsp.next()) { - int urlId = rsp.getInt(1); - String proto = rsp.getString(2); - String path = rsp.getString(3); - String param = rsp.getString(4); - - data.addUrl(new EdgeUrl(proto, domain, null, path, param), urlId); - } - } - - } -} diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java deleted file mode 100644 index e9dd92b6..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDocumentTest.java +++ /dev/null @@ -1,96 +0,0 @@ -package nu.marginalia.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument; -import nu.marginalia.search.db.DbUrlDetailsQuery; -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.loading.loader.SqlLoadProcessedDocument; -import nu.marginalia.loading.loader.SqlLoadUrls; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.converting.model.HtmlStandard; -import nu.marginalia.model.crawl.UrlIndexingState; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.id.EdgeIdArray; -import org.junit.jupiter.api.*; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.net.URISyntaxException; -import java.util.List; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -@Tag("slow") -@Testcontainers -class SqlLoadProcessedDocumentTest { - 
@Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - - DbUrlDetailsQuery dbUrlDetailsQuery; - @BeforeEach - public void setUp() throws URISyntaxException { - dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - dbUrlDetailsQuery = new DbUrlDetailsQuery(dataSource); - - var loadDomains = new SqlLoadDomains(dataSource); - var loadUrls = new SqlLoadUrls(dataSource); - - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); - - loadUrls.load(loaderData, new EdgeUrl[]{new EdgeUrl("https://www.marginalia.nu/")}); - - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadProcessedDocument() throws URISyntaxException { - var loader = new SqlLoadProcessedDocument(dataSource); - var url = new EdgeUrl("https://www.marginalia.nu/"); - - loader.load(loaderData, List.of(new LoadProcessedDocument( - url, - UrlIndexingState.OK, - "TITLE", - "DESCR", - HtmlFeature.encode(Set.of(HtmlFeature.AFFILIATE_LINK)), - HtmlStandard.HTML5.name(), - 100, - 12345, - -3.14, - null - ))); - - var details = dbUrlDetailsQuery.getUrlDetailsMulti(new EdgeIdArray<>(loaderData.getUrlId(new EdgeUrl("https://www.marginalia.nu/")))); - Assertions.assertEquals(1, details.size()); - - var urlDetails = details.get(0); - - assertEquals("TITLE", urlDetails.getTitle()); - assertEquals("DESCR", urlDetails.getDescription()); - assertTrue(urlDetails.isAffiliate()); - assertEquals(100, urlDetails.words); - assertEquals(12345, urlDetails.dataHash); - assertEquals(-3.14, urlDetails.getUrlQuality()); - } - -} \ No newline at end of file diff --git 
a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java index 0ef662eb..e1fa8223 100644 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java +++ b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadProcessedDomainTest.java @@ -5,7 +5,6 @@ import nu.marginalia.loading.loader.LoaderData; import nu.marginalia.loading.loader.SqlLoadDomains; import nu.marginalia.loading.loader.SqlLoadProcessedDomain; import nu.marginalia.converting.instruction.instructions.DomainLink; -import nu.marginalia.loading.loader.SqlLoadUrls; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import org.junit.jupiter.api.AfterEach; @@ -51,18 +50,18 @@ class SqlLoadProcessedDomainTest { @Test public void loadProcessedDomain() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadProcessedDomainTwice() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.load(loaderData, new EdgeDomain("www.marginalia.nu"), DomainIndexingState.BLOCKED, "127.0.0.1"); } @Test public void loadProcessedDomaiWithExtremelyLongIP() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); String ip = Stream.generate(() -> "127.").limit(1024).collect(Collectors.joining()); @@ -71,7 +70,7 @@ 
class SqlLoadProcessedDomainTest { @Test public void loadDomainAlias() { - var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource), new SqlLoadUrls(dataSource)); + var loader = new SqlLoadProcessedDomain(dataSource, new SqlLoadDomains(dataSource)); loader.loadAlias(loaderData, new DomainLink(new EdgeDomain("memex.marginalia.nu"), new EdgeDomain("www.marginalia.nu"))); } } \ No newline at end of file diff --git a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java b/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java deleted file mode 100644 index 7fece308..00000000 --- a/code/processes/loading-process/src/test/java/nu/marginalia/loader/SqlLoadUrlsTest.java +++ /dev/null @@ -1,54 +0,0 @@ -package nu.marginalia.loader; - -import com.zaxxer.hikari.HikariDataSource; -import nu.marginalia.loading.loader.LoaderData; -import nu.marginalia.loading.loader.SqlLoadDomains; -import nu.marginalia.loading.loader.SqlLoadUrls; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; -import org.testcontainers.containers.MariaDBContainer; -import org.testcontainers.junit.jupiter.Container; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.net.URISyntaxException; - -@Tag("slow") -@Testcontainers -class SqlLoadUrlsTest { - @Container - static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") - .withDatabaseName("WMSA_prod") - .withUsername("wmsa") - .withPassword("wmsa") - .withInitScript("db/migration/V23_06_0_000__base.sql") - .withNetworkAliases("mariadb"); - - HikariDataSource dataSource; - LoaderData loaderData; - @BeforeEach - public void setUp() { - dataSource = DbTestUtil.getConnection(mariaDBContainer.getJdbcUrl()); - - var loadDomains = new 
SqlLoadDomains(dataSource); - loaderData = new LoaderData(10); - - loaderData.setTargetDomain(new EdgeDomain("www.marginalia.nu")); - loadDomains.load(loaderData, new EdgeDomain("www.marginalia.nu")); - } - - @AfterEach - public void tearDown() { - dataSource.close(); - } - - @Test - public void loadUrl() throws URISyntaxException { - var loadUrls = new SqlLoadUrls(dataSource); - loadUrls.load(loaderData, new EdgeUrl[] { new EdgeUrl("https://www.marginalia.nu/") }); - } - -} \ No newline at end of file