diff --git a/code/features-convert/stackexchange-xml/readme.md b/code/features-convert/stackexchange-xml/readme.md
new file mode 100644
index 00000000..8af6d05a
--- /dev/null
+++ b/code/features-convert/stackexchange-xml/readme.md
@@ -0,0 +1,19 @@
+Stackexchange's data is a jumble of questions and answers,
+where the answers refer to the questions with a parentId field.
+
+e.g.
+```xml
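+<?xml version="1.0" encoding="utf-8"?>
+<posts>
+  <row Id="1" PostTypeId="1" Title="A question" Body="..." />
+  <row Id="2" PostTypeId="2" ParentId="1" Body="An answer to question 1" />
+  <row Id="3" PostTypeId="1" Title="Another question" Body="..." />
+</posts>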
+```
+
+Since the search engine wants to extract keywords for each thread
+holistically, not by question or answer, it is necessary to re-arrange
+the data (which is very large). SQLite does a decent job of enabling
+this task.
+
+See [tools/stackexchange-converter](../../tools/stackexchange-converter).
\ No newline at end of file
diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
index 3e9005bc..bf78f5f1 100644
--- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
@@ -33,8 +33,9 @@ public class StackExchangePostsDb {
/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
@SneakyThrows
- public static void create(Path sqliteFile,
- Path stackExchange7zFile) {
+ public static void create(String domain,
+ Path sqliteFile,
+ Path stackExchange7zFile) {
if (Files.exists(sqliteFile))
Files.delete(sqliteFile);
String connStr = "jdbc:sqlite:" + sqliteFile;
@@ -58,6 +59,13 @@ public class StackExchangePostsDb {
stackExchange7zFile
);
+ var insertMeta = connection.prepareStatement("""
+ INSERT INTO metadata(domainName)
+ VALUES (?)
+ """);
+ insertMeta.setString(1, domain);
+ insertMeta.executeUpdate();
+
var insertPost = connection.prepareStatement("""
INSERT INTO post(id, threadId, postYear, title, body, origSize, tags)
VALUES (?, ?, ?, ?, ?, ?, ?)
diff --git a/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql b/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql
index 401fe7a1..9ac05750 100644
--- a/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql
+++ b/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql
@@ -8,4 +8,8 @@ CREATE TABLE post (
tags TEXT
);
+CREATE TABLE metadata (
+ domainName TEXT -- domain name given to StackExchangePostsDb.create
+);
+
CREATE INDEX post_threadId ON post(threadId);
\ No newline at end of file
diff --git a/code/tools/stackexchange-converter/build.gradle b/code/tools/stackexchange-converter/build.gradle
new file mode 100644
index 00000000..7590cccd
--- /dev/null
+++ b/code/tools/stackexchange-converter/build.gradle
@@ -0,0 +1,52 @@
+plugins {
+ id 'java'
+ id "io.freefair.lombok" version "8.2.2"
+ id 'application'
+
+ id 'jvm-test-suite'
+}
+
+java {
+ toolchain {
+ languageVersion.set(JavaLanguageVersion.of(20)) // compile and run with a Java 20 toolchain
+ }
+}
+
+application {
+ mainClass = 'nu.marginalia.tools.StackexchangeConverter' // class holding the converter's main method
+ applicationName = 'stackexchange-converter'
+}
+
+tasks.distZip.enabled = false // the zip distribution task is switched off
+
+dependencies {
+ implementation project(':code:features-convert:stackexchange-xml') // module containing StackExchangePostsDb
+
+ implementation libs.lombok
+ annotationProcessor libs.lombok
+ implementation libs.bundles.slf4j
+ implementation libs.notnull
+
+ implementation libs.guice
+ implementation libs.jsoup
+ implementation libs.trove
+ implementation libs.fastutil
+
+ implementation libs.bundles.nlp
+ implementation libs.commons.lang3
+
+ testImplementation libs.bundles.slf4j.test
+ testImplementation libs.bundles.junit
+ testImplementation libs.mockito
+}
+
+
+test {
+ useJUnitPlatform()
+}
+
+task fastTests(type: Test) { // extra test task that leaves out tests tagged "slow"
+ useJUnitPlatform {
+ excludeTags "slow"
+ }
+}
diff --git a/code/tools/stackexchange-converter/readme.md b/code/tools/stackexchange-converter/readme.md
new file mode 100644
index 00000000..2490d045
--- /dev/null
+++ b/code/tools/stackexchange-converter/readme.md
@@ -0,0 +1,24 @@
+This tool converts from stackexchange's 7z-compressed XML
+format to a sqlite database that is digestible by the search engine.
+
+See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for
+an explanation of why this is necessary.
+
+Stackexchange's data dumps can be downloaded from archive.org
+here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)
+
+Usage
+
+```shell
+$ stackexchange-converter domain-name input.7z output.db
+```
+
+Stackexchange is relatively conservative about allowing
+new questions, so this is a job that doesn't run more than once.
+
+Note: Reading and writing these db files is *absurdly* slow
+on a mechanical hard-drive.
+
+## See Also
+
+* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml)
\ No newline at end of file
diff --git a/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java b/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java
new file mode 100644
index 00000000..a287bdd2
--- /dev/null
+++ b/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java
@@ -0,0 +1,31 @@
+package nu.marginalia.tools;
+
+import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/** Command-line tool that converts a stackexchange Posts 7z file into a sqlite db. */
+public class StackexchangeConverter {
+    /** @param args exactly three values: domain-name, input-file.7z, output-file.db */
+    public static void main(String[] args) {
+        if (args.length != 3) {
+            System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n");
+            System.err.println("Arguments: domain-name input-file.7z output-file.db");
+            return;
+        }
+
+        String domain = args[0];
+        Path inputFile = Path.of(args[1]);
+        Path outputFile = Path.of(args[2]);
+        if (!Files.exists(inputFile)) {
+            // Stop before create(): it deletes any existing output file first; exit non-zero to signal failure.
+            System.err.println("Input file " + inputFile + " does not exist");
+            System.exit(1);
+        }
+
+        System.out.println("Converting " + inputFile);
+        StackExchangePostsDb.create(domain, outputFile, inputFile);
+        System.out.println("... done!");
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index b7e49ae8..cfd13d33 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -75,6 +75,7 @@ include 'code:tools:experiment-runner'
include 'code:tools:website-adjacencies-calculator'
include 'code:tools:screenshot-capture-tool'
include 'code:tools:load-test'
+include 'code:tools:stackexchange-converter'
include 'third-party:porterstemmer'
include 'third-party:xz'