diff --git a/code/features-convert/stackexchange-xml/readme.md b/code/features-convert/stackexchange-xml/readme.md new file mode 100644 index 00000000..8af6d05a --- /dev/null +++ b/code/features-convert/stackexchange-xml/readme.md @@ -0,0 +1,19 @@ +Stackexchange's data is a jumble of questions and answers, +where the answers refer to the questions with a parentId field. + +e.g. +```xml + + + + + + +``` + +Since the search engine wants to extract keywords for each thread +holistically, not by question or answer, it is necessary to re-arrange +the data (which is very large). SQLite does a decent job of enabling +this task. + +See [tools/stackexchange-converter](../../tools/stackexchange-converter). \ No newline at end of file diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java index 3e9005bc..bf78f5f1 100644 --- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java +++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java @@ -33,8 +33,9 @@ public class StackExchangePostsDb { /** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */ @SneakyThrows - public static void create(Path sqliteFile, - Path stackExchange7zFile) { + public static void create(String domain, + Path sqliteFile, + Path stackExchange7zFile) { if (Files.exists(sqliteFile)) Files.delete(sqliteFile); String connStr = "jdbc:sqlite:" + sqliteFile; @@ -58,6 +59,13 @@ public class StackExchangePostsDb { stackExchange7zFile ); + var insertMeta = connection.prepareStatement(""" + INSERT INTO metadata(domainName) + VALUES (?) 
+ """); + insertMeta.setString(1, domain); + insertMeta.executeUpdate(); + var insertPost = connection.prepareStatement(""" INSERT INTO post(id, threadId, postYear, title, body, origSize, tags) VALUES (?, ?, ?, ?, ?, ?, ?) diff --git a/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql b/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql index 401fe7a1..9ac05750 100644 --- a/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql +++ b/code/features-convert/stackexchange-xml/src/main/resources/db/stackexchange.sql @@ -8,4 +8,8 @@ CREATE TABLE post ( tags TEXT ); +CREATE TABLE metadata ( + domainName TEXT +); + CREATE INDEX post_threadId ON post(threadId); \ No newline at end of file diff --git a/code/tools/stackexchange-converter/build.gradle b/code/tools/stackexchange-converter/build.gradle new file mode 100644 index 00000000..7590cccd --- /dev/null +++ b/code/tools/stackexchange-converter/build.gradle @@ -0,0 +1,52 @@ +plugins { + id 'java' + id "io.freefair.lombok" version "8.2.2" + id 'application' + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(20)) + } +} + +application { + mainClass = 'nu.marginalia.tools.StackexchangeConverter' + applicationName = 'stackexchange-converter' +} + +tasks.distZip.enabled = false + +dependencies { + implementation project(':code:features-convert:stackexchange-xml') + + implementation libs.lombok + annotationProcessor libs.lombok + implementation libs.bundles.slf4j + implementation libs.notnull + + implementation libs.guice + implementation libs.jsoup + implementation libs.trove + implementation libs.fastutil + + implementation libs.bundles.nlp + implementation libs.commons.lang3 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} + + +test { + useJUnitPlatform() +} + +task fastTests(type: Test) { + useJUnitPlatform { + excludeTags 
"slow"
+    }
+}
diff --git a/code/tools/stackexchange-converter/readme.md b/code/tools/stackexchange-converter/readme.md
new file mode 100644
index 00000000..2490d045
--- /dev/null
+++ b/code/tools/stackexchange-converter/readme.md
@@ -0,0 +1,24 @@
+This tool converts from stackexchange's 7z-compressed XML
+format to a sqlite database that is digestible by the search engine.
+
+See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for
+an explanation why this is necessary.
+
+Stackexchange's data dumps can be downloaded from archive.org
+here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)
+
+Usage
+
+```shell
+$ stackexchange-converter domain-name input.7z output.db
+```
+
+Stackexchange is relatively conservative about allowing
+new questions, so this is a job that doesn't run more than once.
+
+Note: Reading and writing these db files is *absurdly* slow
+on a mechanical hard-drive.
+
+## See Also
+
+* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml)
\ No newline at end of file
diff --git a/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java b/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java
new file mode 100644
index 00000000..a287bdd2
--- /dev/null
+++ b/code/tools/stackexchange-converter/src/main/java/nu/marginalia/tools/StackexchangeConverter.java
@@ -0,0 +1,45 @@
+package nu.marginalia.tools;
+
+import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Command-line entry point that converts a stackexchange Posts 7z file
+ * into a sqlite database via {@link StackExchangePostsDb#create}.
+ */
+public class StackexchangeConverter {
+    /**
+     * Runs the conversion.
+     *
+     * @param args exactly three arguments: domain-name, input-file.7z, output-file.db
+     */
+    public static void main(String[] args) {
+
+        if (args.length != 3) {
+            System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n");
+            System.err.println("Arguments: domain-name input-file.7z output-file.db");
+            return;
+        }
+
+        String domain = args[0];
+
+        Path inputFile = Path.of(args[1]);
+        Path outputFile = Path.of(args[2]);
+
+        // Stop here when the input is missing: StackExchangePostsDb.create()
+        // deletes any existing output file as its first step, so carrying on
+        // would discard an existing database. Exit non-zero to signal failure.
+        if (!Files.exists(inputFile)) {
+            System.err.println("Input file " + inputFile + " does not exists");
+            System.exit(1);
+        }
+
+        System.out.println("Converting " + inputFile);
+
+        StackExchangePostsDb.create(domain, outputFile, inputFile);
+
+        System.out.println("... done!");
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index b7e49ae8..cfd13d33 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -75,6 +75,7 @@ include 'code:tools:experiment-runner'
 include 'code:tools:website-adjacencies-calculator'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
+include 'code:tools:stackexchange-converter'
 
 include 'third-party:porterstemmer'
 include 'third-party:xz'