mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(stackexchange-converter) Create tool for converting stackexchange 7z-files to digestible sqlite db:s
This commit is contained in:
parent
3b4d08f52b
commit
5b0a6d7ec1
19
code/features-convert/stackexchange-xml/readme.md
Normal file
19
code/features-convert/stackexchange-xml/readme.md
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
Stackexchange's data is a jumble of questions and answers,
|
||||||
|
where the answers refer to the questions through a `ParentId` attribute.
|
||||||
|
|
||||||
|
e.g.
|
||||||
|
```xml
|
||||||
|
<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<posts>
|
||||||
|
<row Id="1" PostTypeId="1" AcceptedAnswerId="51" CreationDate="2016-01-12T18:45:19.963" Score="10" ViewCount="424" Body="<p>When I've printed an object I've had to choose between high resolution and quick prints. What techniques or technologies can I use or deploy to speed up my high resolution prints?</p>
" OwnerUserId="16" LastActivityDate="2017-10-31T02:31:08.560" Title="How to obtain high resolution prints in a shorter period of time?" Tags="<resolution><speed><quality>" AnswerCount="2" CommentCount="6" ContentLicense="CC BY-SA 3.0" />
|
||||||
|
<row Id="2" PostTypeId="1" AcceptedAnswerId="12" CreationDate="2016-01-12T18:45:51.287" Score="34" ViewCount="7377" Body="<p>I would like to buy a 3D printer, but I'm concerned about the health risks that are associated with its operation. Some groups of scientists say it can be <a href="http://www.techworld.com/news/personal-tech/scientists-warn-of-3d-printing-health-effects-as-tech-hits-high-street-3460992/">harmful</a> for humans.</p>

<p>What do I need to consider before buying a 3D printer if I care about my health? Are there any safe printers?</p>
" OwnerUserId="20" LastEditorUserId="334" LastEditDate="2016-11-15T16:16:11.163" LastActivityDate="2019-06-10T23:18:34.190" Title="Is 3D printing safe for your health?" Tags="<print-material><safety><health>" AnswerCount="4" CommentCount="1" ContentLicense="CC BY-SA 3.0" />
|
||||||
|
<row Id="12" PostTypeId="2" ParentId="2" CreationDate="2016-01-12T19:13:00.710" Score="23" Body="<p>There is very little information about safety available, as home 3D printers are relatively new. However, plastics such as ABS have a long history in making plastic products, and a study found..." />
|
||||||
|
</posts>
|
||||||
|
```
|
||||||
|
|
||||||
|
Since the search engine wants to extract keywords for each thread
|
||||||
|
holistically, not by question or answer, it is necessary to re-arrange
|
||||||
|
the data (which is very large). SQLite does a decent job of enabling
|
||||||
|
this task.
|
||||||
|
|
||||||
|
See [tools/stackexchange-converter](../../tools/stackexchange-converter).
|
@ -33,7 +33,8 @@ public class StackExchangePostsDb {
|
|||||||
|
|
||||||
/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
|
/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public static void create(Path sqliteFile,
|
public static void create(String domain,
|
||||||
|
Path sqliteFile,
|
||||||
Path stackExchange7zFile) {
|
Path stackExchange7zFile) {
|
||||||
if (Files.exists(sqliteFile))
|
if (Files.exists(sqliteFile))
|
||||||
Files.delete(sqliteFile);
|
Files.delete(sqliteFile);
|
||||||
@ -58,6 +59,13 @@ public class StackExchangePostsDb {
|
|||||||
stackExchange7zFile
|
stackExchange7zFile
|
||||||
);
|
);
|
||||||
|
|
||||||
|
var insertMeta = connection.prepareStatement("""
|
||||||
|
INSERT INTO metadata(domainName)
|
||||||
|
VALUES (?)
|
||||||
|
""");
|
||||||
|
insertMeta.setString(1, domain);
|
||||||
|
insertMeta.executeUpdate();
|
||||||
|
|
||||||
var insertPost = connection.prepareStatement("""
|
var insertPost = connection.prepareStatement("""
|
||||||
INSERT INTO post(id, threadId, postYear, title, body, origSize, tags)
|
INSERT INTO post(id, threadId, postYear, title, body, origSize, tags)
|
||||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||||
|
@ -8,4 +8,8 @@ CREATE TABLE post (
|
|||||||
tags TEXT
|
tags TEXT
|
||||||
);
|
);
|
||||||
|
|
||||||
|
CREATE TABLE metadata (
|
||||||
|
domainName TEXT
|
||||||
|
);
|
||||||
|
|
||||||
CREATE INDEX post_threadId ON post(threadId);
|
CREATE INDEX post_threadId ON post(threadId);
|
52
code/tools/stackexchange-converter/build.gradle
Normal file
52
code/tools/stackexchange-converter/build.gradle
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
plugins {
|
||||||
|
id 'java'
|
||||||
|
id "io.freefair.lombok" version "8.2.2"
|
||||||
|
id 'application'
|
||||||
|
|
||||||
|
id 'jvm-test-suite'
|
||||||
|
}
|
||||||
|
|
||||||
|
java {
|
||||||
|
toolchain {
|
||||||
|
languageVersion.set(JavaLanguageVersion.of(20))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
application {
|
||||||
|
mainClass = 'nu.marginalia.tools.StackexchangeConverter'
|
||||||
|
applicationName = 'stackexchange-converter'
|
||||||
|
}
|
||||||
|
|
||||||
|
tasks.distZip.enabled = false
|
||||||
|
|
||||||
|
dependencies {
|
||||||
|
implementation project(':code:features-convert:stackexchange-xml')
|
||||||
|
|
||||||
|
implementation libs.lombok
|
||||||
|
annotationProcessor libs.lombok
|
||||||
|
implementation libs.bundles.slf4j
|
||||||
|
implementation libs.notnull
|
||||||
|
|
||||||
|
implementation libs.guice
|
||||||
|
implementation libs.jsoup
|
||||||
|
implementation libs.trove
|
||||||
|
implementation libs.fastutil
|
||||||
|
|
||||||
|
implementation libs.bundles.nlp
|
||||||
|
implementation libs.commons.lang3
|
||||||
|
|
||||||
|
testImplementation libs.bundles.slf4j.test
|
||||||
|
testImplementation libs.bundles.junit
|
||||||
|
testImplementation libs.mockito
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
test {
|
||||||
|
useJUnitPlatform()
|
||||||
|
}
|
||||||
|
|
||||||
|
task fastTests(type: Test) {
|
||||||
|
useJUnitPlatform {
|
||||||
|
excludeTags "slow"
|
||||||
|
}
|
||||||
|
}
|
24
code/tools/stackexchange-converter/readme.md
Normal file
24
code/tools/stackexchange-converter/readme.md
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
This tool converts from stackexchange's 7z-compressed XML
|
||||||
|
format to a sqlite database that is digestible by the search engine.
|
||||||
|
|
||||||
|
See [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml) for
|
||||||
|
an explanation of why this is necessary.
|
||||||
|
|
||||||
|
Stackexchange's data dumps can be downloaded from archive.org
|
||||||
|
here: [https://archive.org/details/stackexchange](https://archive.org/details/stackexchange)
|
||||||
|
|
||||||
|
<b>Usage</b>
|
||||||
|
|
||||||
|
```shell
|
||||||
|
$ stackexchange-converter domain-name input.7z output.db
|
||||||
|
```
|
||||||
|
|
||||||
|
Stackexchange is relatively conservative about allowing
|
||||||
|
new questions, so this is a job that doesn't run more than once.
|
||||||
|
|
||||||
|
<b>Note</b>: Reading and writing these db files is *absurdly* slow
|
||||||
|
on a mechanical hard-drive.
|
||||||
|
|
||||||
|
## See Also
|
||||||
|
|
||||||
|
* [features-convert/stackexchange-xml](../../features-convert/stackexchange-xml)
|
@ -0,0 +1,31 @@
|
|||||||
|
package nu.marginalia.tools;
|
||||||
|
|
||||||
|
import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb;
|
||||||
|
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
|
||||||
|
public class StackexchangeConverter {
|
||||||
|
public static void main(String[] args) {
|
||||||
|
|
||||||
|
if (args.length != 3) {
|
||||||
|
System.err.println("Converts a stackexchange Posts 7z file to a Marginalia-digestible sqlite-db\n");
|
||||||
|
System.err.println("Arguments: domain-name input-file.7z output-file.db");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
String domain = args[0];
|
||||||
|
|
||||||
|
Path inputFile = Path.of(args[1]);
|
||||||
|
Path outputFile = Path.of(args[2]);
|
||||||
|
|
||||||
|
if (!Files.exists(inputFile))
|
||||||
|
System.err.println("Input file " + inputFile + " does not exists");
|
||||||
|
|
||||||
|
System.out.println("Converting " + inputFile);
|
||||||
|
|
||||||
|
StackExchangePostsDb.create(domain, outputFile, inputFile);
|
||||||
|
|
||||||
|
System.out.println("... done!");
|
||||||
|
}
|
||||||
|
}
|
@ -75,6 +75,7 @@ include 'code:tools:experiment-runner'
|
|||||||
include 'code:tools:website-adjacencies-calculator'
|
include 'code:tools:website-adjacencies-calculator'
|
||||||
include 'code:tools:screenshot-capture-tool'
|
include 'code:tools:screenshot-capture-tool'
|
||||||
include 'code:tools:load-test'
|
include 'code:tools:load-test'
|
||||||
|
include 'code:tools:stackexchange-converter'
|
||||||
|
|
||||||
include 'third-party:porterstemmer'
|
include 'third-party:porterstemmer'
|
||||||
include 'third-party:xz'
|
include 'third-party:xz'
|
||||||
|
Loading…
Reference in New Issue
Block a user