mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(linkdb) New Module for sqlite-backed document db
This commit is contained in:
parent
e8c0648e04
commit
b22f4fbb72
56
code/common/linkdb/build.gradle
Normal file
56
code/common/linkdb/build.gradle
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
// Build script for the linkdb module (SQLite-backed document database).

plugins {
    id 'java'
    id "io.freefair.lombok" version "8.2.2"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(20))
    }
}

configurations {
    // NOTE(review): nothing visible in this module uses Flyway; this looks
    // inherited from another module's build script -- confirm it is needed.
    flywayMigration.extendsFrom(implementation)
}

dependencies {
    implementation project(':code:common:model')

    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.guice
    implementation libs.bundles.gson

    implementation libs.notnull

    implementation libs.sqlite
    implementation libs.commons.lang3

    implementation libs.trove

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    // The BOM is the single source of truth for the Testcontainers version;
    // the individual modules deliberately omit it so they cannot drift apart.
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation 'org.testcontainers:mariadb'
    testImplementation 'org.testcontainers:junit-jupiter'
}


// Full test run: forks one JVM per two available processors (at least one).
test {
    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
    maxHeapSize = "8G"
    useJUnitPlatform()
}

// Same settings as 'test', but skips tests tagged "slow".
task fastTests(type: Test) {
    maxParallelForks = Runtime.runtime.availableProcessors().intdiv(2) ?: 1
    maxHeapSize = "8G"
    useJUnitPlatform {
        excludeTags "slow"
    }
}
|
@ -0,0 +1,84 @@
|
|||||||
|
package nu.marginalia.linkdb;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
|
import com.google.inject.name.Named;
|
||||||
|
import gnu.trove.list.TLongList;
|
||||||
|
import nu.marginalia.linkdb.model.UrlDetail;
|
||||||
|
import nu.marginalia.linkdb.model.UrlProtocol;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.DriverManager;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@Singleton
|
||||||
|
public class LinkdbReader {
|
||||||
|
Path dbFile;
|
||||||
|
volatile Connection connection;
|
||||||
|
|
||||||
|
@Inject
|
||||||
|
public LinkdbReader(@Named("linkdb-file") Path dbFile) throws SQLException {
|
||||||
|
this.dbFile = dbFile;
|
||||||
|
connection = createConnection();
|
||||||
|
}
|
||||||
|
|
||||||
|
private Connection createConnection() throws SQLException {
|
||||||
|
String connStr = "jdbc:sqlite:" + dbFile.toString();
|
||||||
|
return DriverManager.getConnection(connStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void switchInput(Path newDbFile) throws IOException, SQLException {
|
||||||
|
connection.close();
|
||||||
|
|
||||||
|
Files.move(newDbFile, dbFile, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
|
||||||
|
connection = createConnection();
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<UrlDetail> getUrlDetails(TLongList ids) throws SQLException {
|
||||||
|
List<UrlDetail> ret = new ArrayList<>(ids.size());
|
||||||
|
|
||||||
|
if (connection.isClosed()) {
|
||||||
|
throw new RuntimeException("URL query temporarily unavailable due to database switch");
|
||||||
|
}
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
SELECT ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR
|
||||||
|
FROM DOCUMENT WHERE ID = ?
|
||||||
|
""")) {
|
||||||
|
for (int i = 0; i < ids.size(); i++) {
|
||||||
|
long id = ids.get(i);
|
||||||
|
stmt.setLong(1, id);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
if (rs.next()) {
|
||||||
|
var url = new EdgeUrl(rs.getString("URL"));
|
||||||
|
ret.add(new UrlDetail(
|
||||||
|
rs.getLong("ID"),
|
||||||
|
url,
|
||||||
|
rs.getString("TITLE"),
|
||||||
|
rs.getString("DESCRIPTION"),
|
||||||
|
rs.getDouble("QUALITY"),
|
||||||
|
rs.getString("FORMAT"),
|
||||||
|
rs.getInt("FEATURES"),
|
||||||
|
rs.getInt("PUB_YEAR"),
|
||||||
|
rs.getLong("DATA_HASH"),
|
||||||
|
rs.getInt("WORDS_TOTAL")
|
||||||
|
));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,81 @@
|
|||||||
|
package nu.marginalia.linkdb;
|
||||||
|
|
||||||
|
import nu.marginalia.linkdb.model.UrlDetail;
|
||||||
|
import nu.marginalia.linkdb.model.UrlProtocol;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.Connection;
|
||||||
|
import java.sql.DriverManager;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
import java.sql.Types;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
public class LinkdbWriter {
|
||||||
|
|
||||||
|
private final Connection connection;
|
||||||
|
|
||||||
|
public LinkdbWriter(Path outputFile) throws SQLException {
|
||||||
|
String connStr = "jdbc:sqlite:" + outputFile.toString();
|
||||||
|
connection = DriverManager.getConnection(connStr);
|
||||||
|
|
||||||
|
try (var stream = ClassLoader.getSystemResourceAsStream("db/linkdb.sql");
|
||||||
|
var stmt = connection.createStatement()
|
||||||
|
) {
|
||||||
|
var sql = new String(stream.readAllBytes());
|
||||||
|
stmt.executeUpdate(sql);
|
||||||
|
|
||||||
|
// Disable synchronous writing as this is a one-off operation with no recovery
|
||||||
|
stmt.execute("PRAGMA synchronous = OFF");
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(UrlDetail urlDetail) throws SQLException {
|
||||||
|
add(List.of(urlDetail));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void add(List<UrlDetail> urlDetail) throws SQLException {
|
||||||
|
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR IGNORE INTO DOCUMENT(ID, URL, TITLE, DESCRIPTION, WORDS_TOTAL, FORMAT, FEATURES, DATA_HASH, QUALITY, PUB_YEAR)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for (var document : urlDetail) {
|
||||||
|
var url = document.url();
|
||||||
|
|
||||||
|
stmt.setLong(1, document.urlId());
|
||||||
|
stmt.setString(2, url.toString());
|
||||||
|
|
||||||
|
stmt.setString(3, document.title());
|
||||||
|
stmt.setString(4, document.description());
|
||||||
|
stmt.setInt(5, document.wordsTotal());
|
||||||
|
stmt.setString(6, document.format());
|
||||||
|
stmt.setInt(7, document.features());
|
||||||
|
stmt.setLong(8, document.dataHash());
|
||||||
|
stmt.setDouble(9, document.urlQuality());
|
||||||
|
if (document.pubYear() == null) {
|
||||||
|
stmt.setNull(10, Types.INTEGER);
|
||||||
|
} else {
|
||||||
|
stmt.setInt(10, document.pubYear());
|
||||||
|
}
|
||||||
|
|
||||||
|
stmt.addBatch();
|
||||||
|
|
||||||
|
if (++i > 1000) {
|
||||||
|
stmt.executeBatch();
|
||||||
|
i = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i != 0) stmt.executeBatch();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void close() throws SQLException {
|
||||||
|
connection.close();
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,18 @@
|
|||||||
|
package nu.marginalia.linkdb.model;
|
||||||
|
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
|
|
||||||
|
public record UrlDetail(long urlId,
|
||||||
|
EdgeUrl url,
|
||||||
|
String title,
|
||||||
|
String description,
|
||||||
|
double urlQuality,
|
||||||
|
String format,
|
||||||
|
int features,
|
||||||
|
Integer pubYear,
|
||||||
|
long dataHash,
|
||||||
|
int wordsTotal
|
||||||
|
)
|
||||||
|
|
||||||
|
{
|
||||||
|
}
|
@ -0,0 +1,24 @@
|
|||||||
|
package nu.marginalia.linkdb.model;
|
||||||
|
|
||||||
|
/**
 * URL schemes that can be encoded as a small integer.
 *
 * <p>The integer code is the constant's {@link #ordinal()}, so the declaration
 * order below is part of the encoding and must not change: new constants may
 * only be appended.
 */
public enum UrlProtocol {
    HTTP,
    HTTPS;

    /** Cached copy of {@link #values()}, which otherwise clones the array on every call. */
    private static final UrlProtocol[] VALUES = values();

    /**
     * Encodes a scheme name as its integer code.
     *
     * @param str scheme name, matched case-insensitively ({@code "http"} or {@code "https"})
     * @return the integer code of the matching constant
     * @throws IllegalArgumentException if {@code str} is {@code null} or not a known scheme
     */
    public static int encode(String str) {
        if ("http".equalsIgnoreCase(str)) {
            return HTTP.ordinal();
        }
        if ("https".equalsIgnoreCase(str)) {
            return HTTPS.ordinal();
        }

        throw new IllegalArgumentException(str);
    }

    /**
     * Decodes an integer code produced by {@link #encode(String)} back into
     * the lower-case scheme name.
     *
     * @param ordinal the integer code
     * @return {@code "http"} or {@code "https"}
     * @throws IllegalArgumentException if {@code ordinal} does not correspond to a constant
     */
    public static String decode(int ordinal) {
        // Reject bad codes the same way encode() rejects bad names,
        // instead of leaking an ArrayIndexOutOfBoundsException.
        if (ordinal < 0 || ordinal >= VALUES.length) {
            throw new IllegalArgumentException("Unknown protocol ordinal: " + ordinal);
        }

        return switch (VALUES[ordinal]) {
            case HTTP -> "http";
            case HTTPS -> "https";
        };
    }
}
|
17
code/common/linkdb/src/main/resources/db/linkdb.sql
Normal file
17
code/common/linkdb/src/main/resources/db/linkdb.sql
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
-- Document table of the link database; one row per document.
--
-- PUB_YEAR is nullable: the writer binds SQL NULL when a document has no
-- publication year, and because it inserts with INSERT OR IGNORE a NOT NULL
-- constraint here would silently drop such rows instead of reporting an error.
--
-- NOTE(review): LONG is not an SQLite storage class name, so ID presumably
-- gets NUMERIC affinity and is not an alias for the rowid -- confirm whether
-- INTEGER PRIMARY KEY was intended.
CREATE TABLE DOCUMENT (
    ID LONG PRIMARY KEY,

    URL TEXT,

    STATE INT,
    TITLE TEXT NOT NULL,
    DESCRIPTION TEXT NOT NULL,

    WORDS_TOTAL INTEGER NOT NULL,
    FORMAT TEXT NOT NULL,
    FEATURES INTEGER NOT NULL,

    DATA_HASH INTEGER NOT NULL,
    QUALITY REAL NOT NULL,
    PUB_YEAR INTEGER
);
|
@ -0,0 +1,42 @@
|
|||||||
|
package nu.marginalia.linkdb;
|
||||||
|
|
||||||
|
import gnu.trove.list.array.TLongArrayList;
|
||||||
|
import nu.marginalia.linkdb.model.UrlDetail;
|
||||||
|
import nu.marginalia.model.EdgeDomain;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.sql.SQLException;
|
||||||
|
|
||||||
|
public class LinkdbWriterTest {
|
||||||
|
@Test
|
||||||
|
public void testCreate() throws IOException {
|
||||||
|
Path tempPath = Files.createTempFile("linkdb", ".db");
|
||||||
|
try {
|
||||||
|
var writer = new LinkdbWriter(tempPath);
|
||||||
|
writer.add(new UrlDetail(
|
||||||
|
1,
|
||||||
|
new nu.marginalia.model.EdgeUrl("http", new EdgeDomain("example.com"), null, "/", null),
|
||||||
|
"Test",
|
||||||
|
"This is a test",
|
||||||
|
-4.,
|
||||||
|
"XHTML",
|
||||||
|
5,
|
||||||
|
2020,
|
||||||
|
0xF00BA3,
|
||||||
|
444
|
||||||
|
));
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
var reader = new LinkdbReader(tempPath);
|
||||||
|
var deets = reader.getUrlDetails(new TLongArrayList(new long[]{1}));
|
||||||
|
System.out.println(deets);
|
||||||
|
} catch (SQLException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
} finally {
|
||||||
|
Files.deleteIfExists(tempPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -51,6 +51,7 @@ include 'code:api:process-mqapi'
|
|||||||
include 'code:common:service-discovery'
|
include 'code:common:service-discovery'
|
||||||
include 'code:common:service-client'
|
include 'code:common:service-client'
|
||||||
include 'code:common:db'
|
include 'code:common:db'
|
||||||
|
include 'code:common:linkdb'
|
||||||
include 'code:common:service'
|
include 'code:common:service'
|
||||||
include 'code:common:config'
|
include 'code:common:config'
|
||||||
include 'code:common:model'
|
include 'code:common:model'
|
||||||
|
Loading…
Reference in New Issue
Block a user