(linkdb) New Module for sqlite-backed document db

This commit is contained in:
Viktor Lofgren 2023-08-24 09:04:39 +02:00
parent e8c0648e04
commit b22f4fbb72
8 changed files with 323 additions and 0 deletions

View File

@ -0,0 +1,56 @@
// Build script for the linkdb module (SQLite-backed document database).
plugins {
id 'java'
// Lombok support through the Freefair Gradle plugin
id "io.freefair.lombok" version "8.2.2"
id 'jvm-test-suite'
}
// Compile and run with a Java 20 toolchain.
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(20))
}
}
// NOTE(review): nothing else in this script references flywayMigration;
// presumably carried over from another module -- confirm it is needed here.
configurations {
flywayMigration.extendsFrom(implementation)
}
dependencies {
// Shared model types from the common model project
implementation project(':code:common:model')
implementation libs.lombok
annotationProcessor libs.lombok
implementation libs.bundles.slf4j
implementation libs.guice
implementation libs.bundles.gson
implementation libs.notnull
// SQLite library backing the link database
implementation libs.sqlite
implementation libs.commons.lang3
implementation libs.trove
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
// NOTE(review): the Testcontainers BOM is imported as a platform, yet the two
// artifacts below still pin 1.17.4 explicitly. Also confirm the MariaDB
// container is actually required by a module that stores data in SQLite.
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
}
// Default test task: forks up to half the available processors (falling back to 1
// when the integer division yields 0), gives each fork an 8G heap, and runs on the
// JUnit Platform.
test {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform()
}
// Same settings as the default test task, but skips tests tagged "slow".
task fastTests(type: Test) {
maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
maxHeapSize = "8G"
useJUnitPlatform {
excludeTags "slow"
}
}

View File

@ -0,0 +1,84 @@
package nu.marginalia.linkdb;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import gnu.trove.list.TLongList;
import nu.marginalia.linkdb.model.UrlDetail;
import nu.marginalia.linkdb.model.UrlProtocol;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
/**
 * Reads document details from the SQLite link database file.
 *
 * <p>The reader holds a single JDBC connection to the database file and can be
 * pointed at a freshly built database via {@link #switchInput(Path)}.
 */
@Singleton
public class LinkdbReader {
    Path dbFile;
    volatile Connection connection;

    /**
     * Opens a connection to the given SQLite database file.
     *
     * @param dbFile path of the database file to read from
     * @throws SQLException if the connection cannot be opened
     */
    @Inject
    public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException {
        this.dbFile = dbFile;
        connection = createConnection();
    }

    /** Opens a new JDBC connection to {@link #dbFile}. */
    private Connection createConnection() throws SQLException {
        String connStr = "jdbc:sqlite:" + dbFile.toString();
        return DriverManager.getConnection(connStr);
    }

    /**
     * Replaces the database file with {@code newDbFile} and reconnects.
     *
     * <p>The current connection is closed first, the new file is moved over the
     * old one, and a new connection is opened. If the move fails, a best-effort
     * attempt is made to reconnect to whatever is at the old location so that the
     * reader is not left permanently closed; the original failure is rethrown.
     *
     * @param newDbFile database file to move into place
     * @throws IOException  if the file cannot be moved
     * @throws SQLException if closing or reopening the connection fails
     */
    public void switchInput(Path newDbFile) throws IOException, SQLException {
        connection.close();

        try {
            Files.move(newDbFile, dbFile, StandardCopyOption.REPLACE_EXISTING);
        } catch (IOException | RuntimeException e) {
            // Don't leave the reader stuck with a closed connection after a failed swap.
            try {
                connection = createConnection();
            } catch (SQLException reconnectFailure) {
                e.addSuppressed(reconnectFailure);
            }
            throw e;
        }

        connection = createConnection();
    }

    /**
     * Looks up the stored details for each of the given document ids.
     *
     * <p>Ids that have no row in the DOCUMENT table are skipped, so the result may
     * be shorter than {@code ids}. Results appear in the order the ids were given.
     *
     * @param ids document ids to look up
     * @return details for the ids that were found
     * @throws SQLException     if the query fails
     * @throws RuntimeException if the connection is closed (a database switch is in
     *                          progress), or if a stored URL cannot be parsed
     */
    public List<UrlDetail> getUrlDetails(TLongList ids) throws SQLException {
        // Read the volatile field once so the check and the query use the same connection.
        final Connection conn = connection;

        if (conn.isClosed()) {
            throw new RuntimeException("URL query temporarily unavailable due to database switch");
        }

        List<UrlDetail> ret = new ArrayList<>(ids.size());

        try (var stmt = conn.prepareStatement("""
                SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
                FROM DOCUMENT WHERE ID = ?
                """)) {
            for (int i = 0; i < ids.size(); i++) {
                stmt.setLong(1, ids.get(i));

                try (var rs = stmt.executeQuery()) {
                    if (!rs.next()) {
                        continue;
                    }

                    var url = new EdgeUrl(rs.getString("URL"));

                    // getInt() reports SQL NULL as 0; map it back to null so a
                    // missing publication year is not mistaken for year 0.
                    int pubYearValue = rs.getInt("PUB_YEAR");
                    Integer pubYear = rs.wasNull() ? null : pubYearValue;

                    ret.add(new UrlDetail(
                            rs.getLong("ID"),
                            url,
                            rs.getString("TITLE"),
                            rs.getString("DESCRIPTION"),
                            rs.getDouble("QUALITY"),
                            rs.getString("FORMAT"),
                            rs.getInt("FEATURES"),
                            pubYear,
                            rs.getLong("DATA_HASH"),
                            rs.getInt("WORDS_TOTAL")
                    ));
                }
            }
        } catch (URISyntaxException e) {
            throw new RuntimeException(e);
        }

        return ret;
    }
}

View File

@ -0,0 +1,81 @@
package nu.marginalia.linkdb;
import nu.marginalia.linkdb.model.UrlDetail;
import nu.marginalia.linkdb.model.UrlProtocol;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Types;
import java.util.List;
/**
 * Writes document details into a SQLite link database file.
 *
 * <p>The constructor creates the schema from the {@code db/linkdb.sql} classpath
 * resource, so the target is expected to be a new (or empty) database file.
 * Instances hold an open JDBC connection and must be closed after use.
 */
public class LinkdbWriter implements AutoCloseable {
    /** Number of rows buffered in a JDBC batch before it is sent to the database. */
    private static final int BATCH_SIZE = 1000;

    private final Connection connection;

    /**
     * Opens the database file and creates the schema.
     *
     * @param outputFile path of the SQLite file to write to
     * @throws SQLException          if the connection cannot be opened or the schema cannot be created
     * @throws IllegalStateException if the schema resource is missing from the classpath
     * @throws RuntimeException      if the schema resource cannot be read
     */
    public LinkdbWriter(Path outputFile) throws SQLException {
        String connStr = "jdbc:sqlite:" + outputFile.toString();
        connection = DriverManager.getConnection(connStr);

        try {
            createSchema();
        } catch (SQLException | RuntimeException e) {
            // The caller never receives the instance, so nobody else can close the connection.
            try {
                connection.close();
            } catch (SQLException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /** Runs the schema script from the classpath and relaxes write durability. */
    private void createSchema() throws SQLException {
        try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb.sql");
             var stmt = connection.createStatement()
        ) {
            if (stream == null) {
                throw new IllegalStateException("Schema resource db/linkdb.sql not found on the classpath");
            }

            // Decode explicitly as UTF-8 rather than relying on the platform default charset.
            var sql = new String(stream.readAllBytes(), java.nio.charset.StandardCharsets.UTF_8);
            stmt.executeUpdate(sql);

            // Disable synchronous writing as this is a one-off operation with no recovery
            stmt.execute("PRAGMA synchronous = OFF");
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Inserts a single document.
     *
     * @param urlDetail the document to store
     * @throws SQLException if the insert fails
     */
    public void add(UrlDetail urlDetail) throws SQLException {
        add(List.of(urlDetail));
    }

    /**
     * Inserts the given documents in batches.
     *
     * <p>Rows are written with {@code INSERT OR IGNORE}, so a row that violates a
     * table constraint (for example a duplicate ID) is skipped without an error.
     *
     * @param urlDetail the documents to store
     * @throws SQLException if a batch fails
     */
    public void add(List<UrlDetail> urlDetail) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """)) {

            int pending = 0;

            for (var document : urlDetail) {
                var url = document.url();

                stmt.setLong(1, document.urlId());
                stmt.setString(2, url.toString());
                stmt.setString(3, document.title());
                stmt.setString(4, document.description());
                stmt.setInt(5, document.wordsTotal());
                stmt.setString(6, document.format());
                stmt.setInt(7, document.features());
                stmt.setLong(8, document.dataHash());
                stmt.setDouble(9, document.urlQuality());

                // pubYear is a nullable Integer; store SQL NULL rather than unboxing null.
                if (document.pubYear() == null) {
                    stmt.setNull(10, Types.INTEGER);
                } else {
                    stmt.setInt(10, document.pubYear());
                }

                stmt.addBatch();

                if (++pending >= BATCH_SIZE) {
                    stmt.executeBatch();
                    pending = 0;
                }
            }

            // Flush whatever is left over from the last partial batch.
            if (pending != 0) {
                stmt.executeBatch();
            }
        }
    }

    /**
     * Closes the underlying database connection.
     *
     * @throws SQLException if closing the connection fails
     */
    @Override
    public void close() throws SQLException {
        connection.close();
    }
}

View File

@ -0,0 +1,18 @@
package nu.marginalia.linkdb.model;
import nu.marginalia.model.EdgeUrl;
/**
 * One document row of the link database, as written by {@code LinkdbWriter} and
 * read back by {@code LinkdbReader}.
 *
 * @param urlId       document id; stored in the {@code ID} column
 * @param url         document URL; stored as its string form in the {@code URL} column
 * @param title       stored in the {@code TITLE} column
 * @param description stored in the {@code DESCRIPTION} column
 * @param urlQuality  stored in the {@code QUALITY} column
 * @param format      stored in the {@code FORMAT} column
 * @param features    stored in the {@code FEATURES} column
 *                    (NOTE(review): presumably a set of feature flags -- confirm)
 * @param pubYear     stored in the {@code PUB_YEAR} column; may be {@code null}, in
 *                    which case the writer stores SQL NULL
 * @param dataHash    stored in the {@code DATA_HASH} column
 * @param wordsTotal  stored in the {@code WORDS_TOTAL} column
 */
public record UrlDetail(long urlId,
EdgeUrl url,
String title,
String description,
double urlQuality,
String format,
int features,
Integer pubYear,
long dataHash,
int wordsTotal
)
{
}

View File

@ -0,0 +1,24 @@
package nu.marginalia.linkdb.model;
/**
 * URL schemes that can be represented as a small integer code.
 *
 * <p>The integer code is the constant's {@link #ordinal()}, so the declaration
 * order below is part of the encoding and must not be changed.
 */
public enum UrlProtocol {
    HTTP,
    HTTPS;

    /**
     * Converts a scheme name to its integer code.
     *
     * @param str scheme name, matched case-insensitively ("http" or "https")
     * @return the ordinal of the matching constant
     * @throws IllegalArgumentException if {@code str} is neither scheme (including null)
     */
    public static int encode(String str) {
        final UrlProtocol protocol;

        if ("http".equalsIgnoreCase(str)) {
            protocol = HTTP;
        } else if ("https".equalsIgnoreCase(str)) {
            protocol = HTTPS;
        } else {
            throw new IllegalArgumentException(str);
        }

        return protocol.ordinal();
    }

    /**
     * Converts an integer code back to its lower-case scheme name.
     *
     * @param ordinal code previously produced by {@link #encode(String)}
     * @return "http" or "https"
     * @throws ArrayIndexOutOfBoundsException if {@code ordinal} matches no constant
     */
    public static String decode(int ordinal) {
        final UrlProtocol protocol = values()[ordinal];

        return switch (protocol) {
            case HTTP -> "http";
            case HTTPS -> "https";
        };
    }
}

View File

@ -0,0 +1,17 @@
CREATE TABLE DOCUMENT (
    -- Declared as INTEGER (not LONG) so that SQLite treats ID as an alias for
    -- the rowid: lookups by ID go straight to the table b-tree instead of
    -- through a separate automatic index.
    ID INTEGER PRIMARY KEY,
    URL TEXT,
    STATE INT,
    TITLE TEXT NOT NULL,
    DESCRIPTION TEXT NOT NULL,
    WORDS_TOTAL INTEGER NOT NULL,
    FORMAT TEXT NOT NULL,
    FEATURES INTEGER NOT NULL,
    DATA_HASH INTEGER NOT NULL,
    QUALITY REAL NOT NULL,
    -- Nullable: the writer binds NULL when a document has no publication year.
    -- With NOT NULL here, INSERT OR IGNORE would silently drop those rows.
    PUB_YEAR INTEGER
);

View File

@ -0,0 +1,42 @@
package nu.marginalia.linkdb;
import gnu.trove.list.array.TLongArrayList;
import nu.marginalia.linkdb.model.UrlDetail;
import nu.marginalia.model.EdgeDomain;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
public class LinkdbWriterTest {
@Test
public void testCreate() throws IOException {
Path tempPath = Files.createTempFile("linkdb", ".db");
try {
var writer = new LinkdbWriter(tempPath);
writer.add(new UrlDetail(
1,
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
"Test",
"This is a test",
-4.,
"XHTML",
5,
2020,
0xF00BA3,
444
));
writer.close();
var reader = new LinkdbReader(tempPath);
var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1}));
System.out.println(deets);
} catch (SQLException e) {
throw new RuntimeException(e);
} finally {
Files.deleteIfExists(tempPath);
}
}
}

View File

@ -51,6 +51,7 @@ include 'code:api:process-mqapi'
// Modules under code/common
include 'code:common:service-discovery'
include 'code:common:service-client'
include 'code:common:db'
include 'code:common:linkdb'
include 'code:common:service'
include 'code:common:config'
include 'code:common:model'