From b22f4fbb725c355e9cb61e57b8c64afc570948a3 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Thu, 24 Aug 2023 09:04:39 +0200
Subject: [PATCH] (linkdb) New Module for sqlite-backed document db

---
 code/common/linkdb/build.gradle               |  56 ++++++++
 .../nu/marginalia/linkdb/LinkdbReader.java    | 125 +++++++++++++++++
 .../nu/marginalia/linkdb/LinkdbWriter.java    | 137 +++++++++++++++++++
 .../nu/marginalia/linkdb/model/UrlDetail.java |  18 +++
 .../marginalia/linkdb/model/UrlProtocol.java  |  24 +++
 .../linkdb/src/main/resources/db/linkdb.sql   |  17 ++
 .../marginalia/linkdb/LinkdbWriterTest.java   |  42 ++++++
 settings.gradle                               |   1 +
 8 files changed, 420 insertions(+)
 create mode 100644 code/common/linkdb/build.gradle
 create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java
 create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java
 create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlDetail.java
 create mode 100644 code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java
 create mode 100644 code/common/linkdb/src/main/resources/db/linkdb.sql
 create mode 100644 code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java

diff --git a/code/common/linkdb/build.gradle b/code/common/linkdb/build.gradle
new file mode 100644
index 00000000..19caa529
--- /dev/null
+++ b/code/common/linkdb/build.gradle
@@ -0,0 +1,56 @@
+plugins {
+    id 'java'
+    id "io.freefair.lombok" version "8.2.2"
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(20))
+    }
+}
+
+configurations {
+    flywayMigration.extendsFrom(implementation)
+}
+
+dependencies {
+    implementation project(':code:common:model')
+
+    implementation libs.lombok
+    annotationProcessor libs.lombok
+    implementation libs.bundles.slf4j
+
+    implementation libs.guice
+    implementation libs.bundles.gson
+
+    implementation libs.notnull
+
+    implementation libs.sqlite
+    implementation libs.commons.lang3
+
+    implementation libs.trove
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+
+    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
+    testImplementation 'org.testcontainers:mariadb:1.17.4'
+    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
+}
+
+
+test {
+    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
+    maxHeapSize = "8G"
+    useJUnitPlatform()
+}
+
+task fastTests(type: Test) {
+    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
+    maxHeapSize = "8G"
+    useJUnitPlatform {
+        excludeTags "slow"
+    }
+}
diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java
new file mode 100644
index 00000000..31139786
--- /dev/null
+++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbReader.java
@@ -0,0 +1,125 @@
+package nu.marginalia.linkdb;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import com.google.inject.name.Named;
+import gnu.trove.list.TLongList;
+import nu.marginalia.linkdb.model.UrlDetail;
+import nu.marginalia.linkdb.model.UrlProtocol;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import java.nio.file.StandardCopyOption;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Read-side access to the sqlite document database, keyed by document id.
+ * The backing file can be swapped at runtime with {@link #switchInput(Path)};
+ * lookups made while the connection is closed fail with a {@link RuntimeException}.
+ */
+@Singleton
+public class LinkdbReader {
+    Path dbFile;
+    volatile Connection connection;
+
+    /**
+     * Opens a connection to the sqlite database stored in {@code dbFile}.
+     *
+     * @param dbFile location of the database file
+     * @throws SQLException if the connection cannot be opened
+     */
+    @Inject
+    public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException {
+        this.dbFile = dbFile;
+        connection = createConnection();
+    }
+
+    /** Opens a new JDBC connection to the current database file. */
+    private Connection createConnection() throws SQLException {
+        String connStr = "jdbc:sqlite:" + dbFile.toString();
+        return DriverManager.getConnection(connStr);
+    }
+
+    /**
+     * Moves {@code newDbFile} over the current database file and reconnects.
+     * The current connection is closed first. A new connection is opened even
+     * when the move fails, so a failed switch does not leave the reader with a
+     * permanently closed connection.
+     *
+     * @param newDbFile file to move into place as the new database
+     * @throws IOException if the file cannot be moved
+     * @throws SQLException if the connection cannot be closed or reopened
+     */
+    public void switchInput(Path newDbFile) throws IOException, SQLException {
+        connection.close();
+
+        try {
+            Files.move(newDbFile, dbFile, StandardCopyOption.REPLACE_EXISTING);
+        } finally {
+            connection = createConnection();
+        }
+    }
+
+    /**
+     * Fetches the stored details for the given document ids.
+     * Ids without a matching row are skipped, so the result may be shorter
+     * than {@code ids}; hits are returned in the order their ids were given.
+     *
+     * @param ids document ids to look up
+     * @return details for every id that was found
+     * @throws SQLException if the query fails
+     * @throws RuntimeException if the connection is closed or a stored URL cannot be parsed
+     */
+    public List<UrlDetail> getUrlDetails(TLongList ids) throws SQLException {
+        List<UrlDetail> ret = new ArrayList<>(ids.size());
+
+        if (connection.isClosed()) {
+            throw new RuntimeException("URL query temporarily unavailable due to database switch");
+        }
+
+        try (var stmt = connection.prepareStatement("""
+                SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
+                FROM DOCUMENT WHERE ID = ?
+                """)) {
+            for (int i = 0; i < ids.size(); i++) {
+                stmt.setLong(1, ids.get(i));
+
+                // Close each result set before the statement is executed again
+                try (var rs = stmt.executeQuery()) {
+                    if (!rs.next()) {
+                        continue;
+                    }
+
+                    // getInt() reports SQL NULL as 0; wasNull() tells the two apart
+                    int pubYearValue = rs.getInt("PUB_YEAR");
+                    Integer pubYear = rs.wasNull() ? null : pubYearValue;
+
+                    ret.add(new UrlDetail(
+                            rs.getLong("ID"),
+                            new EdgeUrl(rs.getString("URL")),
+                            rs.getString("TITLE"),
+                            rs.getString("DESCRIPTION"),
+                            rs.getDouble("QUALITY"),
+                            rs.getString("FORMAT"),
+                            rs.getInt("FEATURES"),
+                            pubYear,
+                            rs.getLong("DATA_HASH"),
+                            rs.getInt("WORDS_TOTAL")
+                    ));
+                }
+            }
+        } catch (URISyntaxException e) {
+            throw new RuntimeException(e);
+        }
+        return ret;
+    }
+}
diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java
new file mode 100644
index 00000000..f97796de
--- /dev/null
+++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/LinkdbWriter.java
@@ -0,0 +1,137 @@
+package nu.marginalia.linkdb;
+
+import nu.marginalia.linkdb.model.UrlDetail;
+import nu.marginalia.linkdb.model.UrlProtocol;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.sql.Types;
+import java.util.List;
+
+/**
+ * Creates a sqlite document database from the bundled {@code db/linkdb.sql}
+ * schema and fills it with {@link UrlDetail} rows.
+ */
+public class LinkdbWriter implements AutoCloseable {
+
+    /** Number of queued rows after which a batch is sent to the database. */
+    private static final int BATCH_SIZE = 1000;
+
+    private final Connection connection;
+
+    /**
+     * Opens {@code outputFile} as a sqlite database and runs the schema script in it.
+     * The connection is closed again if the schema cannot be loaded or applied.
+     *
+     * @param outputFile database file to write to
+     * @throws SQLException if the connection cannot be opened or the schema cannot be applied
+     * @throws IllegalStateException if the schema resource is missing from the classpath
+     * @throws RuntimeException if the schema resource cannot be read
+     */
+    public LinkdbWriter(Path outputFile) throws SQLException {
+        String connStr = "jdbc:sqlite:" + outputFile.toString();
+        connection = DriverManager.getConnection(connStr);
+
+        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb.sql");
+             var stmt = connection.createStatement()
+        ) {
+            if (stream == null) {
+                throw new IllegalStateException("Schema resource db/linkdb.sql not found on classpath");
+            }
+
+            var sql = new String(stream.readAllBytes(), StandardCharsets.UTF_8);
+            stmt.executeUpdate(sql);
+
+            // Disable synchronous writing as this is a one-off operation with no recovery
+            stmt.execute("PRAGMA synchronous = OFF");
+        } catch (IOException e) {
+            closeAfterFailure(e);
+            throw new RuntimeException(e);
+        } catch (SQLException | RuntimeException e) {
+            closeAfterFailure(e);
+            throw e;
+        }
+    }
+
+    /** Closes the connection after a failed setup, attaching any close error to {@code failure}. */
+    private void closeAfterFailure(Throwable failure) {
+        try {
+            connection.close();
+        } catch (SQLException closeFailure) {
+            failure.addSuppressed(closeFailure);
+        }
+    }
+
+    /**
+     * Inserts a single document.
+     *
+     * @param urlDetail document to insert
+     * @throws SQLException if the insert fails
+     */
+    public void add(UrlDetail urlDetail) throws SQLException {
+        add(List.of(urlDetail));
+    }
+
+    /**
+     * Inserts the given documents in batches of {@code BATCH_SIZE} rows.
+     * The statement is {@code INSERT OR IGNORE}, so rows that violate a
+     * constraint of the table are skipped rather than reported.
+     *
+     * @param urlDetail documents to insert
+     * @throws SQLException if a batch fails
+     */
+    public void add(List<UrlDetail> urlDetail) throws SQLException {
+
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                """)) {
+
+            int pending = 0;
+            for (var document : urlDetail) {
+                stmt.setLong(1, document.urlId());
+                stmt.setString(2, document.url().toString());
+
+                stmt.setString(3, document.title());
+                stmt.setString(4, document.description());
+                stmt.setInt(5, document.wordsTotal());
+                stmt.setString(6, document.format());
+                stmt.setInt(7, document.features());
+                stmt.setLong(8, document.dataHash());
+                stmt.setDouble(9, document.urlQuality());
+
+                Integer pubYear = document.pubYear();
+                if (pubYear == null) {
+                    stmt.setNull(10, Types.INTEGER);
+                } else {
+                    stmt.setInt(10, pubYear);
+                }
+
+                stmt.addBatch();
+
+                if (++pending >= BATCH_SIZE) {
+                    stmt.executeBatch();
+                    pending = 0;
+                }
+            }
+
+            if (pending != 0) {
+                stmt.executeBatch();
+            }
+        }
+    }
+
+    /**
+     * Closes the database connection.
+     *
+     * @throws SQLException if the connection cannot be closed
+     */
+    @Override
+    public void close() throws SQLException {
+        connection.close();
+    }
+}
diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlDetail.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlDetail.java
new file mode 100644
index 00000000..19bc1906
--- /dev/null
+++ 
b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlDetail.java
@@ -0,0 +1,29 @@
+package nu.marginalia.linkdb.model;
+
+import nu.marginalia.model.EdgeUrl;
+
+/**
+ * One row of the linkdb DOCUMENT table.
+ *
+ * @param urlId       id of the document, the primary key of the table
+ * @param url         address of the document
+ * @param title       document title
+ * @param description document description
+ * @param urlQuality  value of the QUALITY column
+ * @param format      value of the FORMAT column
+ * @param features    value of the FEATURES column
+ * @param pubYear     publication year; may be {@code null}
+ * @param dataHash    value of the DATA_HASH column
+ * @param wordsTotal  value of the WORDS_TOTAL column
+ */
+public record UrlDetail(long urlId,
+                        EdgeUrl url,
+                        String title,
+                        String description,
+                        double urlQuality,
+                        String format,
+                        int features,
+                        Integer pubYear,
+                        long dataHash,
+                        int wordsTotal) {
+}
diff --git a/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java
new file mode 100644
index 00000000..a090a51a
--- /dev/null
+++ b/code/common/linkdb/src/main/java/nu/marginalia/linkdb/model/UrlProtocol.java
@@ -0,0 +1,50 @@
+package nu.marginalia.linkdb.model;
+
+/**
+ * URL schemes that can be stored as a compact integer code.
+ * The code is the ordinal of the constant, so reordering the constants
+ * changes the meaning of previously produced codes.
+ */
+public enum UrlProtocol {
+    HTTP,
+    HTTPS;
+
+    /** Cached copy of {@link #values()}, which allocates a new array on every call. */
+    private static final UrlProtocol[] VALUES = values();
+
+    /**
+     * Maps a scheme name to its integer code, ignoring case.
+     *
+     * @param str scheme name, {@code "http"} or {@code "https"}
+     * @return the code of the matching constant
+     * @throws IllegalArgumentException if {@code str} is not a supported scheme
+     */
+    public static int encode(String str) {
+        if ("http".equalsIgnoreCase(str)) {
+            return HTTP.ordinal();
+        }
+        if ("https".equalsIgnoreCase(str)) {
+            return HTTPS.ordinal();
+        }
+
+        throw new IllegalArgumentException(str);
+    }
+
+    /**
+     * Maps an integer code produced by {@link #encode(String)} back to its scheme name.
+     *
+     * @param ordinal code to decode
+     * @return {@code "http"} or {@code "https"}
+     * @throws IllegalArgumentException if {@code ordinal} is not a valid code
+     */
+    public static String decode(int ordinal) {
+        if (ordinal < 0 || ordinal >= VALUES.length) {
+            throw new IllegalArgumentException("Unknown protocol code: " + ordinal);
+        }
+
+        return switch (VALUES[ordinal]) {
+            case HTTP -> "http";
+            case HTTPS -> "https";
+        };
+    }
+}
diff --git a/code/common/linkdb/src/main/resources/db/linkdb.sql b/code/common/linkdb/src/main/resources/db/linkdb.sql
new file mode 100644
index 00000000..2e4e95b6
--- /dev/null
+++ b/code/common/linkdb/src/main/resources/db/linkdb.sql
@@ -0,0 +1,19 @@
+CREATE TABLE DOCUMENT (
+    -- INTEGER PRIMARY KEY makes ID an alias for the rowid, so lookups by ID need no extra index
+    ID INTEGER PRIMARY KEY,
+
+    URL TEXT,
+
+    STATE INT,
+    TITLE TEXT NOT NULL,
+    DESCRIPTION TEXT NOT NULL,
+
+    WORDS_TOTAL INTEGER NOT NULL,
+    FORMAT TEXT NOT NULL,
+    FEATURES INTEGER NOT NULL,
+
+    DATA_HASH INTEGER NOT NULL,
+    QUALITY REAL NOT NULL,
+    -- Nullable: the writer stores NULL when a document has no publication year
+    PUB_YEAR INTEGER
+);
\ No newline at end of file
diff --git a/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java
new file mode 100644
index 00000000..75645546
--- /dev/null
+++ 
b/code/common/linkdb/src/test/java/nu/marginalia/linkdb/LinkdbWriterTest.java
@@ -0,0 +1,65 @@
+package nu.marginalia.linkdb;
+
+import gnu.trove.list.array.TLongArrayList;
+import nu.marginalia.linkdb.model.UrlDetail;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class LinkdbWriterTest {
+    /** Writes one document to a fresh database and checks that the reader returns it unchanged. */
+    @Test
+    public void testCreate() throws IOException {
+        Path tempPath = Files.createTempFile("linkdb", ".db");
+        try {
+            var expected = new UrlDetail(
+                    1,
+                    new EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
+                    "Test",
+                    "This is a test",
+                    -4.,
+                    "XHTML",
+                    5,
+                    2020,
+                    0xF00BA3,
+                    444
+            );
+
+            // Close the writer even when add() fails, so the temp file can be deleted
+            var writer = new LinkdbWriter(tempPath);
+            try {
+                writer.add(expected);
+            } finally {
+                writer.close();
+            }
+
+            var reader = new LinkdbReader(tempPath);
+            var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1}));
+
+            assertEquals(1, deets.size());
+
+            var actual = (UrlDetail) deets.get(0);
+            assertEquals(expected.urlId(), actual.urlId());
+            assertEquals(expected.url().toString(), actual.url().toString());
+            assertEquals(expected.title(), actual.title());
+            assertEquals(expected.description(), actual.description());
+            assertEquals(expected.urlQuality(), actual.urlQuality());
+            assertEquals(expected.format(), actual.format());
+            assertEquals(expected.features(), actual.features());
+            assertEquals(expected.pubYear(), actual.pubYear());
+            assertEquals(expected.dataHash(), actual.dataHash());
+            assertEquals(expected.wordsTotal(), actual.wordsTotal());
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        } finally {
+            Files.deleteIfExists(tempPath);
+        }
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index 0e00abc2..623e6576 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -51,6 +51,7 @@ include 'code:api:process-mqapi'
 include 'code:common:service-discovery'
 include 'code:common:service-client'
 include 'code:common:db'
+include 'code:common:linkdb'
 include 'code:common:service'
 include 'code:common:config'
 include 'code:common:model'