diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
index 8fdbabaf..3e9005bc 100644
--- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
+++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java
@@ -3,11 +3,8 @@ package nu.marginalia.integration.stackexchange.sqlite;
import com.github.luben.zstd.Zstd;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
-import gnu.trove.map.TIntIntMap;
-import gnu.trove.map.hash.TIntIntHashMap;
import lombok.SneakyThrows;
import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;
-import org.apache.commons.compress.compressors.zstandard.ZstdUtils;

import javax.xml.stream.XMLStreamException;
import java.io.IOException;
@@ -22,8 +19,19 @@ import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import java.util.function.Predicate;

+/** Because stackexchange's XML format is a stream of entities that reference their parent,
+ * and we want to process them in a thread-by-thread order, it is necessary to use something
+ * to essentially re-order the data.
+ * <p>
+ * This class uses SQLite to perform this task. The actual post bodies are compressed to keep
+ * the size of the file down. It is strongly advisable to read these database files off an SSD
+ * rather than a mechanical hard drive; the difference in processing time is 20 minutes
+ * vs 6+ hours.
+ *
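+ * <p>
+ * A minimal usage sketch (illustrative only; the exact consumer type taken by forEachPost is
+ * not visible in this diff and is assumed to behave like a predicate that returns true to
+ * keep iterating):
+ * <pre>{@code
+ * StackExchangePostsDb.create(Path.of("se.db"), Path.of("stackoverflow.com.7z"));
+ * StackExchangePostsDb.forEachPost(Path.of("se.db"), post -> {
+ *     // ... convert or index the post ...
+ *     return true; // assumed: true means "keep going"
+ * });
+ * }</pre>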
+ */
public class StackExchangePostsDb {
+ /** Construct a SQLite file containing the posts in the stackexchange-style 7z file */
@SneakyThrows
public static void create(Path sqliteFile,
Path stackExchange7zFile) {
@@ -62,6 +70,8 @@ public class StackExchangePostsDb {
var post = iter.next();
insertPost.setInt(1, post.id());
+ // We invent a new field called threadId, which is the id of the post if it's
+ // a question, or the id of its parent if it's an answer
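+ // e.g. (illustrative): a question with id 42 gets threadId 42, and an answer with id 57
+ // whose parentId is 42 also gets threadId 42, so a whole thread shares one threadId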
if (post.parentId() == null) insertPost.setInt(2, post.id());
else insertPost.setInt(2, post.parentId());
@@ -92,6 +102,11 @@ public class StackExchangePostsDb {
}
}
+ /** Iterate over each post in the SQLite post database.
+ * Each post will be assigned an ordinal number that is distinct from the post's id. This is
+ * necessary because stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec
+ * can encode for a single domain, even though there are fewer than 67 million 'threads'.
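+ * <p>
+ * Illustration with made-up counts (assuming, from the loop below, that the ordinal advances
+ * as threads are read back rather than per raw post id): a dump of 80 million posts spread over
+ * 40 million threads stays comfortably inside the range UrlIdCodec can address for one domain.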
+ */
@SneakyThrows
public static void forEachPost(
Path sqliteFile,
@@ -108,20 +123,24 @@ public class StackExchangePostsDb {
WHERE threadId = ?
""")
) {
+
+ // Step 1 is to export a list of thread IDs from the database
TIntList threadIds = new TIntArrayList(10_000);
ResultSet rs = selectThreadIds.executeQuery();
-
while (rs.next()) {
threadIds.add(rs.getInt(1));
}
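+ // threadIds now holds the id of every thread in the database; each one drives a
+ // per-thread query below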
System.out.println("Got " + threadIds.size() + " IDs");
+ // Step 2: Iterate over each thread
var idIterator = threadIds.iterator();
int ordinal = 0;
-
while (idIterator.hasNext()) {
- queryPostContents.setInt(1, idIterator.next());
+ int threadId = idIterator.next();
+
+ // Query posts with this threadId
+ queryPostContents.setInt(1, threadId);
rs = queryPostContents.executeQuery();
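+ // Gather this thread's posts from the result set (presumably so the whole thread can be
+ // handed off to the caller as one unit)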
List