mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(stackexchange-converter) Create tool for converting stackexchange 7z-files to digestible sqlite db:s
This commit is contained in:
parent
3b4d08f52b
commit
5b0a6d7ec1
19
code/features-convert/stackexchange-xml/readme.md
Normal file
19
code/features-convert/stackexchange-xml/readme.md
Normal file
@ -0,0 +1,19 @@
|
||||
Stackexchange's data is a jumble of questions and answers,
|
||||
where the answers refer to the questions with a parentId field.
|
||||
|
||||
e.g.
|
||||
```xml
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<posts>
|
||||
<row Id="1" PostTypeId="1" AcceptedAnswerId="51" CreationDate="2016-01-12T18:45:19.963" Score="10" ViewCount="424" Body="<p>When I've printed an object I've had to choose between high resolution and quick prints. What techniques or technologies can I use or deploy to speed up my high resolution prints?</p>
" OwnerUserId="16" LastActivityDate="2017-10-31T02:31:08.560" Title="How to obtain high resolution prints in a shorter period of time?" Tags="<resolution><speed><quality>" AnswerCount="2" CommentCount="6" ContentLicense="CC BY-SA 3.0" />
|
||||
<row Id="2" PostTypeId="1" AcceptedAnswerId="12" CreationDate="2016-01-12T18:45:51.287" Score="34" ViewCount="7377" Body="<p>I would like to buy a 3D printer, but I'm concerned about the health risks that are associated with its operation. Some groups of scientists say it can be <a href="http://www.techworld.com/news/personal-tech/scientists-warn-of-3d-printing-health-effects-as-tech-hits-high-street-3460992/">harmful</a> for humans.</p>

<p>What do I need to consider before buying a 3D printer if I care about my health? Are there any safe printers?</p>
" OwnerUserId="20" LastEditorUserId="334" LastEditDate="2016-11-15T16:16:11.163" LastActivityDate="2019-06-10T23:18:34.190" Title="Is 3D printing safe for your health?" Tags="<print-material><safety><health>" AnswerCount="4" CommentCount="1" ContentLicense="CC BY-SA 3.0" />
|
||||
<row Id="12" PostTypeId="2" ParentId="2" CreationDate="2016-01-12T19:13:00.710" Score="23" Body="<p>There is very little information about safety available, as home 3D printers are relatively new. However, plastics such as ABS have a long history in making plastic products, and a study found..." />
|
||||
</posts>
|
||||
```
|
||||
|
||||
Since the search engine wants to extract keywords for each thread
|
||||
holistically, not by question or answer, it is necessary to re-arrange
|
||||
the data (which is very large). SQLite does a decent job of enabling
|
||||
this task.
|
||||
|
||||
See [tools/stackexchange-converter](../../tools/stackexchange-converter).
|
@ -33,8 +33,9 @@ public class StackExchangePostsDb {
|
||||
|
||||
/** Construct a SQLite file containing the Posts in the stack exchange-style 7z file */
|
||||
@SneakyThrows
|
||||
public static void create(Path sqliteFile,
|
||||
Path stackExchange7zFile) {
|
||||
public static void create(String domain,
|
||||
Path sqliteFile,
|
||||
Path stackExchange7zFile) {
|
||||
if (Files.exists(sqliteFile))
|
||||
Files.delete(sqliteFile);
|
||||
String connStr = "jdbc:sqlite:" + sqliteFile;
|
||||
@ -58,6 +59,13 @@ public class StackExchangePostsDb {
|
||||
stackExchange7zFile
|
||||
);
|
||||
|
||||
var insertMeta = connection.prepareStatement("""
|
||||
INSERT INTO metadata(domainName)
|
||||
VALUES (?)
|
||||
""");
|
||||
insertMeta.setString(1, domain);
|
||||
insertMeta.executeUpdate();
|
||||
|
||||
var insertPost = connection.prepareStatement("""
|
||||
INSERT INTO post(id, threadId, postYear, title, body, origSize, tags)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
|
@ -8,4 +8,8 @@ CREATE TABLE post (
|
||||
tags TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE metadata (
|
||||
domainName TEXT
|
||||
);
|
||||
|
||||
CREATE INDEX post_threadId ON post(threadId);
|
52
code/tools/stackexchange-converter/build.gradle
Normal file
52
code/tools/stackexchange-converter/build.gradle
Normal file
@ -0,0 +1,52 @@
|
||||
// Build script for the stackexchange-converter command-line tool.

plugins {
    id 'java'
    id 'io.freefair.lombok' version '8.2.2'
    id 'application'

    id 'jvm-test-suite'
}

// Compile and run against a Java 20 toolchain.
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(20)
    }
}

// Entry point and launcher name used by the application plugin.
application {
    mainClass.set('nu.marginalia.tools.StackexchangeConverter')
    applicationName = 'stackexchange-converter'
}

// The zip distribution is switched off for this tool.
tasks.named('distZip') {
    enabled = false
}

dependencies {
    // Provides the stackexchange XML -> sqlite conversion the tool calls into.
    implementation(project(':code:features-convert:stackexchange-xml'))

    implementation(libs.lombok)
    annotationProcessor(libs.lombok)
    implementation(libs.bundles.slf4j)
    implementation(libs.notnull)

    implementation(libs.guice)
    implementation(libs.jsoup)
    implementation(libs.trove)
    implementation(libs.fastutil)

    implementation(libs.bundles.nlp)
    implementation(libs.commons.lang3)

    testImplementation(libs.bundles.slf4j.test)
    testImplementation(libs.bundles.junit)
    testImplementation(libs.mockito)
}


// Run the regular test task on the JUnit Platform.
tasks.named('test') {
    useJUnitPlatform()
}

// Variant of the test run that leaves out tests tagged "slow".
tasks.register('fastTests', Test) {
    useJUnitPlatform {
        excludeTags('slow')
    }
}
|
24
code/tools/stackexchange-converter/readme.md
Normal file
24
code/tools/stackexchange-converter/readme.md
Normal file
@ -0,0 +1,24 @@
|
||||
This tool converts from stackexchange's 7z-compressed XML
|
||||
format to a sqlite database that is digestible by the search engine.
|
||||
|
||||
See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for
|
||||
an explanation of why this is necessary.
|
||||
|
||||
Stackexchange's data dumps can be downloaded from archive.org
|
||||
here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)
|
||||
|
||||
<b>Usage</b>
|
||||
|
||||
```shell
|
||||
$ stackexchange-converter domain-name input.7z output.db
|
||||
```
|
||||
|
||||
Stackexchange is relatively conservative about allowing
|
||||
new questions, so this is a job that doesn't run more than once.
|
||||
|
||||
<b>Note</b>: Reading and writing these db files is *absurdly* slow
|
||||
on a mechanical hard-drive.
|
||||
|
||||
## See Also
|
||||
|
||||
* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml)
|
@ -0,0 +1,31 @@
|
||||
package nu.marginalia.tools;
|
||||
|
||||
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class StackexchangeConverter {
|
||||
public static void main(String[] args) {
|
||||
|
||||
if (args.length != 3) {
|
||||
System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n");
|
||||
System.err.println("Arguments: domain-name input-file.7z output-file.db");
|
||||
return;
|
||||
}
|
||||
|
||||
String domain = args[0];
|
||||
|
||||
Path inputFile = Path.of(args[1]);
|
||||
Path outputFile = Path.of(args[2]);
|
||||
|
||||
if (!Files.exists(inputFile))
|
||||
System.err.println("Input file " + inputFile + " does not exists");
|
||||
|
||||
System.out.println("Converting " + inputFile);
|
||||
|
||||
StackExchangePostsDb.create(domain, outputFile, inputFile);
|
||||
|
||||
System.out.println("... done!");
|
||||
}
|
||||
}
|
@ -75,6 +75,7 @@ include 'code:tools:experiment-runner'
|
||||
include 'code:tools:website-adjacencies-calculator'
|
||||
include 'code:tools:screenshot-capture-tool'
|
||||
include 'code:tools:load-test'
|
||||
include 'code:tools:stackexchange-converter'
|
||||
|
||||
include 'third-party:porterstemmer'
|
||||
include 'third-party:xz'
|
||||
|
Loading…
Reference in New Issue
Block a user