From 3b4d08f52bda092a45bf18af01c7c0e7c41e1fba Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 20 Sep 2023 14:43:06 +0200 Subject: [PATCH] (stackexchange-integration) Add better comments --- .../sqlite/StackExchangePostsDb.java | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java index 8fdbabaf..3e9005bc 100644 --- a/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java +++ b/code/features-convert/stackexchange-xml/src/main/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java @@ -3,11 +3,8 @@ package nu.marginalia.integration.stackexchange.sqlite; import com.github.luben.zstd.Zstd; import gnu.trove.list.TIntList; import gnu.trove.list.array.TIntArrayList; -import gnu.trove.map.TIntIntMap; -import gnu.trove.map.hash.TIntIntHashMap; import lombok.SneakyThrows; import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader; -import org.apache.commons.compress.compressors.zstandard.ZstdUtils; import javax.xml.stream.XMLStreamException; import java.io.IOException; @@ -22,8 +19,19 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.Future; import java.util.function.Predicate; +/** Because stackexchange's XML format is a stream of entities that reference their parent, + * and we want to process them in a thread-by-thread order, it is necessary to use something + * to essentially re-order the data. + *

+ * This class uses SQLite to perform this task. The actual post bodies are compressed to keep + * the size of the file down. It is strongly advisable to read off an SSD and not a mechanical + * hard drive when processing these database files; the difference in processing time is 20 minutes + * vs 6+ hours. + *

+ */ public class StackExchangePostsDb { + /** Construct a SQLite file containing the Posts in the stack exchange-style 7z file */ @SneakyThrows public static void create(Path sqliteFile, Path stackExchange7zFile) { @@ -62,6 +70,8 @@ public class StackExchangePostsDb { var post = iter.next(); insertPost.setInt(1, post.id()); + // We invent a new field called threadId, which is the id of the post if it's + // a question, or the parent if it's an answer if (post.parentId() == null) insertPost.setInt(2, post.id()); else insertPost.setInt(2, post.parentId()); @@ -92,6 +102,11 @@ public class StackExchangePostsDb { } } + /** Iterate over each post in the sqlite post database. + * Each post will be assigned an ordinal number that is different from the id of the post. This is + * necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode + * for a single domain, despite having fewer than 67 million 'threads'. + * */ @SneakyThrows public static void forEachPost( Path sqliteFile, @@ -108,20 +123,24 @@ public class StackExchangePostsDb { WHERE threadId = ? 
""") ) { + + // Step 1 is to export a list of thread IDs from the database TIntList threadIds = new TIntArrayList(10_000); ResultSet rs = selectThreadIds.executeQuery(); - while (rs.next()) { threadIds.add(rs.getInt(1)); } System.out.println("Got " + threadIds.size() + " IDs"); + // Step 2: Iterate over each thread var idIterator = threadIds.iterator(); int ordinal = 0; - while (idIterator.hasNext()) { - queryPostContents.setInt(1, idIterator.next()); + int threadId = idIterator.next(); + + // Query posts with this threadId + queryPostContents.setInt(1, threadId); rs = queryPostContents.executeQuery(); List parts = new ArrayList<>(); @@ -139,6 +158,7 @@ public class StackExchangePostsDb { year = Math.min(year, rs.getInt("postYear")); + // Decompress the bodies byte[] bytes = rs.getBytes("body"); partWork.add(commonPool.submit( () -> new String(Zstd.decompress(bytes, origSize) @@ -149,7 +169,7 @@ public class StackExchangePostsDb { parts.add(workItem.get()); } - if (!consumer.test(new CombinedPostModel(ordinal++, title, year, parts))) + if (!consumer.test(new CombinedPostModel(ordinal++, threadId, title, year, parts))) break; } @@ -161,11 +181,10 @@ public class StackExchangePostsDb { } public record CombinedPostModel(int ordinal, + int threadId, String title, int year, List bodies) - { - - } + { } }