Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 04:58:59 +00:00
(converter) Integrate zim->db conversion into automatic encyclopedia processing workflow
Previously, in order to load encyclopedia data into the search engine, it was necessary to use the encyclopedia.marginalia.nu converter to first create a .db file. This isn't very ergonomic, so parts of that code base were lifted in as a third-party library, and conversion from .zim to .db is now done automatically. The output file name is based on the original filename, plus a CRC32 hash and a .db ending, to ensure we can recycle the data on repeat loads.
This commit is contained in:
parent 22c8fb3f59
commit 27ffb8fa8a
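
For illustration, a minimal sketch of the naming scheme described in the commit message, mirroring the getCrc32FileHash helper added to ConvertActor in this commit. The class and method names of the sketch are hypothetical, and the example filename and hash are made up.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.CRC32;

class DbNameSketch {
    // Hash the zim file so that repeat loads of the same data reuse the same .db file
    static String crc32Hex(Path file) throws IOException {
        ByteBuffer buffer = ByteBuffer.allocate(8192);
        CRC32 crc = new CRC32();
        try (var channel = Files.newByteChannel(file)) {
            while (channel.read(buffer) > 0) {
                buffer.flip();
                crc.update(buffer);
                buffer.clear();
            }
        }
        return Long.toHexString(crc.getValue());
    }

    // e.g. "wikipedia_en_100.zim" -> "wikipedia_en_100.zim.1a2b3c4d.db" (hypothetical hash)
    static String outputDbName(Path zimFile) throws IOException {
        return zimFile + "." + crc32Hex(zimFile) + ".db";
    }
}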

@@ -2,6 +2,8 @@ package nu.marginalia.loading;

 import nu.marginalia.io.processed.ProcessedDataFileNames;
 import nu.marginalia.worklog.BatchingWorkLogInspector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.nio.file.Path;
@@ -10,13 +12,21 @@ import java.util.*;

 public class LoaderInputData {
     private final List<Path> sourceDirectories;
+    private static final Logger logger = LoggerFactory.getLogger(LoaderInputData.class);
     private final Map<Path, Integer> lastGoodBatch = new HashMap<>();

     public LoaderInputData(List<Path> sourceDirectories) throws IOException {
         this.sourceDirectories = sourceDirectories;

         for (var source : sourceDirectories) {
-            lastGoodBatch.put(source, BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log")));
+            int lastGoodBatch = BatchingWorkLogInspector.getValidBatches(source.resolve("processor.log"));
+            this.lastGoodBatch.put(source, lastGoodBatch);
+
+            if (lastGoodBatch == 0) {
+                // This is useful diagnostic information, so we log it as a warning
+                logger.warn("No valid batches found in {}", source);
+            }
         }
     }
 }

@@ -1,11 +1,17 @@
 <h1 class="my-3">Sideload Encyclopedia</h1>

 <div class="my-3 p-3 border bg-light">
-    <p>This will sideload a pre-converted MediaWiki-style OpenZim data set.
-        See the <a href="https://github.com/MarginaliaSearch/MarginaliaSearch/blob/master/doc/sideloading-howto.md">sideloading howto</a>
-        for instructions how to produce this file. </p>
-    <p>Place an articles.db file in the upload directory on the server, and select it from the list
-        below. </p>
+    <p>This will side-load a MediaWiki-style OpenZim data set. Place a zim file in the uploads directory.
+        For Wikipedia, the zim file can be downloaded from <a href="https://download.kiwix.org/zim/wikipedia/">https://download.kiwix.org/zim/wikipedia/</a>.
+        The en_all_nopic sets are recommended for Wikipedia, since they are smaller and do not contain images
+        (which are not used anyway). For testing, the _mini or _en_100 sets are good choices.
+        <p></p>
+        The zim file will be converted to a sqlite database (.db file) with a name similar to
+        that of the zim file, which is then automatically turned into processed data.
+        <p></p>
+        Since the first stage of processing is very time-consuming, the sqlite database can
+        also be loaded directly from this form.
+    </p>
 </div>
 <form method="post" action="actions/sideload-encyclopedia" onsubmit="return confirm('Confirm sideloading')">
     <div class="my-3 py-3">
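
To summarize the flow the template text above describes, here is a self-contained sketch of the dispatch logic, simplified from the ConvertActor changes further down. The nextStep helper and the local record stand-ins are hypothetical; the real actor steps and error handling live in ConvertActor below.

class SideloadDispatchSketch {
    // Stand-ins for the actor steps defined in ConvertActor below
    record PredigestEncyclopedia(String source, String dest, String baseUrl) {}
    record ConvertEncyclopedia(String source, String baseUrl) {}

    static Object nextStep(String source, String baseUrl, String crc32Hash) {
        if (source.toLowerCase().endsWith(".zim")) {
            // A fresh zim upload is first predigested into a <zim-name>.<crc32>.db sqlite file
            return new PredigestEncyclopedia(source, source + "." + crc32Hash + ".db", baseUrl);
        }
        if (source.endsWith(".db")) {
            // An already predigested database goes straight to the conversion step
            return new ConvertEncyclopedia(source, baseUrl);
        }
        throw new IllegalArgumentException("Source must be a .zim or a pre-digested .db file: " + source);
    }
}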

@@ -45,6 +45,7 @@ dependencies {
     implementation project(':code:api:query-api')
     implementation project(':code:api:process-mqapi')
     implementation project(':code:api:executor-api')
+    implementation project(':third-party:encyclopedia-marginalia-nu')

     implementation libs.bundles.slf4j

@@ -7,6 +7,7 @@ import nu.marginalia.actor.prototype.RecordActorPrototype;
 import nu.marginalia.actor.state.ActorResumeBehavior;
 import nu.marginalia.actor.state.ActorStep;
 import nu.marginalia.actor.state.Resume;
+import nu.marginalia.encyclopedia.EncyclopediaConverter;
 import nu.marginalia.process.ProcessOutboxes;
 import nu.marginalia.process.ProcessService;
 import nu.marginalia.storage.FileStorageService;
@@ -16,21 +17,27 @@ import nu.marginalia.storage.model.FileStorageState;
 import nu.marginalia.storage.model.FileStorageType;
 import nu.marginalia.mq.MqMessageState;
 import nu.marginalia.mq.outbox.MqOutbox;
-import nu.marginalia.mqapi.converting.ConvertAction;
 import nu.marginalia.mqapi.converting.ConvertRequest;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

+import java.io.IOException;
+import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.zip.CRC32;

 @Singleton
 public class ConvertActor extends RecordActorPrototype {

+    private static final Logger logger = LoggerFactory.getLogger(ConvertActor.class);
     private final ActorProcessWatcher processWatcher;
     private final MqOutbox mqConverterOutbox;
     private final FileStorageService storageService;
-    private final Gson gson;

     public record Convert(FileStorageId fid) implements ActorStep {};
     public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {};
+    public record PredigestEncyclopedia(String source, String dest, String baseUrl) implements ActorStep {};
     public record ConvertDirtree(String source) implements ActorStep {};
     public record ConvertWarc(String source) implements ActorStep {};
     public record ConvertStackexchange(String source) implements ActorStep {};
@@ -100,6 +107,19 @@ public class ConvertActor extends RecordActorPrototype {
                 if (!Files.exists(sourcePath))
                     yield new Error("Source path does not exist: " + sourcePath);

+                if (source.toLowerCase().endsWith(".zim")) {
+                    // If we're fed a ZIM file, we need to convert it to a sqlite database first
+                    String hash = getCrc32FileHash(sourcePath);
+
+                    // To avoid re-converting the same file, we'll assign the file a name based on its hash
+                    // and the original filename. This way, if we're fed the same file again, we'll be able to just
+                    // re-use the predigested database file.
+                    yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
+                } else if (!source.endsWith(".db")) {
+                    yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
+                }
+
                 String fileName = sourcePath.toFile().getName();

                 var base = storageService.getStorageBase(FileStorageBaseType.STORAGE);
@@ -114,6 +134,36 @@ public class ConvertActor extends RecordActorPrototype {
                         mqConverterOutbox.sendAsync(ConvertRequest.forEncyclopedia(sourcePath, baseUrl, processedArea.id()))
                 );
             }
+            case PredigestEncyclopedia(String source, String dest, String baseUrl) -> {
+                Path sourcePath = Path.of(source);
+
+                if (!Files.exists(sourcePath)) {
+                    yield new Error("Source path does not exist: " + sourcePath);
+                }
+
+                Path destPath = Path.of(dest);
+                if (Files.exists(destPath)) {
+                    // Already predigested, go straight to convert step
+                    yield new ConvertEncyclopedia(dest, baseUrl);
+                }
+
+                Path tempFile = Files.createTempFile(destPath.getParent(), "encyclopedia", "db.tmp");
+
+                try {
+                    EncyclopediaConverter.convert(sourcePath, tempFile);
+                    Files.move(tempFile, destPath);
+                }
+                catch (Exception e) {
+                    logger.error("Failed to convert ZIM file to sqlite database", e);
+                    Files.deleteIfExists(tempFile);
+                    Files.deleteIfExists(destPath);
+
+                    yield new Error("Failed to convert ZIM file to sqlite database: " + e.getMessage());
+                }
+
+                // Go back to convert step with the new database file
+                yield new ConvertEncyclopedia(dest, baseUrl);
+            }
             case ConvertStackexchange(String source) -> {

                 Path sourcePath = Path.of(source);
@@ -150,6 +200,22 @@ public class ConvertActor extends RecordActorPrototype {
         };
     }

+    private String getCrc32FileHash(Path file) throws IOException {
+        ByteBuffer buffer = ByteBuffer.allocate(8192);
+
+        try (var channel = Files.newByteChannel(file)) {
+            CRC32 crc = new CRC32();
+
+            while (channel.read(buffer) > 0) {
+                buffer.flip();
+                crc.update(buffer);
+                buffer.clear();
+            }
+
+            return Long.toHexString(crc.getValue());
+        }
+    }
+
     @Override
     public String describe() {
         return "Convert a set of crawl data into a format suitable for loading into the database.";
@@ -165,6 +231,5 @@ public class ConvertActor extends RecordActorPrototype {
         this.processWatcher = processWatcher;
         this.mqConverterOutbox = processOutboxes.getConverterOutbox();
         this.storageService = storageService;
-        this.gson = gson;
     }
 }

@@ -99,6 +99,7 @@ include 'third-party:monkey-patch-opennlp'
 include 'third-party:monkey-patch-gson'
 include 'third-party:commons-codec'
 include 'third-party:parquet-floor'
+include 'third-party:encyclopedia-marginalia-nu'


 dependencyResolutionManagement {

third-party/README.md (vendored)
@@ -9,9 +9,9 @@ or lack an artifact, or to override some default that is inappropriate for the t
 * [RDRPosTagger](rdrpostagger/) - GPL3
 * [PorterStemmer](porterstemmer/) - LGPL3
 * [Uppend](uppend/) - MIT
-* [OpenZIM](openzim/) - GPL-2.0
+* [OpenZIM](openzim/) - GPL-2.0+
 * [Commons Codec](commons-codec/) - Apache 2.0
+* [encyclopedia.marginalia.nu](encyclopedia-marginalia-nu/) - GPL 2.0+

 ### Repackaged
 * [SymSpell](symspell/) - LGPL-3.0
 * [Count-Min-Sketch](count-min-sketch/) - Apache 2.0

third-party/encyclopedia-marginalia-nu/build.gradle (vendored, new file)
@@ -0,0 +1,26 @@
plugins {
    id 'java'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}

dependencies {
    implementation libs.jsoup
    implementation libs.notnull
    implementation libs.bundles.gson
    implementation libs.zstd
    implementation libs.bundles.slf4j

    implementation project(':code:libraries:blocking-thread-pool')

    implementation project(':third-party:xz')
    implementation project(':third-party:openzim')
}

test {
    useJUnitPlatform()
}

third-party/encyclopedia-marginalia-nu/readme.md (vendored, new file)
@@ -0,0 +1,5 @@
This package contains a severely stripped down version of the codebase from
[encyclopedia.marginalia.nu](https://encyclopedia.marginalia.nu/).

The extracted code is a ZimFile reader and WikiHTML cleaner. It is used by the
encyclopedia side-loader.
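
A minimal usage sketch of the extracted converter, assuming a locally available zim file. The paths are hypothetical; in the search engine, the ConvertActor above drives this with a temp file and the CRC32-based output name.

import nu.marginalia.encyclopedia.EncyclopediaConverter;

import java.nio.file.Path;

class ConvertZimExample {
    public static void main(String[] args) throws Exception {
        Path zim = Path.of("wikipedia_en_100_mini.zim");   // hypothetical input
        Path db  = Path.of("wikipedia_en_100_mini.zim.db"); // hypothetical output

        // Reads the zim file, cleans each article's HTML with WikiCleaner,
        // and writes the result to a sqlite database
        EncyclopediaConverter.convert(zim, db);
    }
}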

@@ -0,0 +1,67 @@ (new file)
package nu.marginalia.encyclopedia;

import nu.marginalia.encyclopedia.cleaner.WikiCleaner;
import nu.marginalia.encyclopedia.store.ArticleDbProvider;
import nu.marginalia.encyclopedia.store.ArticleStoreWriter;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.openzim.ZIMTypes.ZIMFile;
import org.openzim.ZIMTypes.ZIMReader;
import org.slf4j.LoggerFactory;
import org.slf4j.Logger;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.BiConsumer;
import java.util.function.Predicate;

/** Converts an OpenZim file with Wikipedia articles to a SQLite database
 * with cleaned-up MediaWiki HTML
 */
public class EncyclopediaConverter {
    private static final Logger logger = LoggerFactory.getLogger(EncyclopediaConverter.class);

    public static void convert(Path inputFile, Path outputFile) throws IOException, SQLException, InterruptedException {
        var wc = new WikiCleaner();
        var pool = new SimpleBlockingThreadPool("Convert ZIM",
                Math.clamp(Runtime.getRuntime().availableProcessors() - 2, 1, 32),
                2);
        var size = new AtomicInteger();

        if (!Files.exists(inputFile)) {
            throw new IllegalStateException("ZIM file not found: " + inputFile);
        }
        Files.deleteIfExists(outputFile);

        try (var asw = new ArticleStoreWriter(new ArticleDbProvider(outputFile))) {
            Predicate<Integer> keepGoing = (s) -> true;

            BiConsumer<String, String> handleArticle = (url, html) -> {
                if (pool.isTerminated())
                    return;

                pool.submitQuietly(() -> {
                    int sz = size.incrementAndGet();
                    if (sz % 1000 == 0) {
                        System.out.printf("\u001b[2K\r%d", sz);
                    }
                    asw.add(wc.cleanWikiJunk(url, html));
                });

                size.incrementAndGet();
            };

            new ZIMReader(new ZIMFile(inputFile.toString())).forEachArticles(handleArticle, keepGoing);

            pool.shutDown();
            logger.info("Waiting for pool to finish");

            while (!pool.awaitTermination(1, TimeUnit.SECONDS)) {
                // ...
            }
        }
    }
}

@@ -0,0 +1,60 @@ (new file)
package nu.marginalia.encyclopedia.cleaner;

import lombok.Builder;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.NodeFilter;

import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Pattern;

@Builder
public class CleanerFilter implements NodeFilter {
    final Set<String> badTags;
    final Set<String> badIds;
    final Set<String> badClasses;

    final Set<Predicate<Element>> predicates;

    private static final Pattern spacePattern = Pattern.compile("\\s+");

    @Override
    public FilterResult head(Node node, int depth) {
        if (node instanceof Element el) {
            if (badTags != null && badTags.contains(el.tagName()))
                return FilterResult.REMOVE;

            if (badIds != null && badIds.contains(el.id()))
                return FilterResult.REMOVE;

            if (badClasses != null) {
                String className = el.className();
                if (className.contains(" ")) {
                    String[] parts = spacePattern.split(className);
                    for (var c : parts) {
                        if (badClasses.contains(c))
                            return FilterResult.REMOVE;
                    }
                }
                else if (badClasses.contains(className)) {
                    return FilterResult.REMOVE;
                }
            }

            if (predicates != null) {
                for (var pred : predicates) {
                    if (pred.test(el))
                        return FilterResult.REMOVE;
                }
            }
        }

        if (node instanceof Comment) {
            return FilterResult.REMOVE;
        }

        return FilterResult.CONTINUE;
    }
}

@@ -0,0 +1,329 @@ (new file)
package nu.marginalia.encyclopedia.cleaner;

import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
import nu.marginalia.encyclopedia.model.Article;
import nu.marginalia.encyclopedia.model.Link;
import nu.marginalia.encyclopedia.model.LinkList;
import org.jetbrains.annotations.NotNull;
import org.jsoup.Jsoup;
import org.jsoup.nodes.*;
import org.jsoup.select.Elements;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Set;

public class WikiCleaner {

    private static final String licenseFooter = "This article is issued from Wikipedia. The text is licensed under Creative Commons - Attribution - Sharealike. Additional terms may apply for the media files.";

    public ArticleData cleanWikiJunk(String url, String html) {
        return cleanWikiJunk(url, Jsoup.parse(html));
    }

    private boolean isPresentationRole(Element el) {
        return "presentation".equals(el.attr("role"));
    }

    private boolean isLicenseFooter(Element el) {
        // We'll add our own later
        if ("div".equals(el.tagName())) {
            return licenseFooter.equals(el.wholeOwnText().trim());
        }

        return false;
    }

    public ArticleData cleanWikiJunk(String url, Document doc) {

        if (doc.getElementById("content") == null) {
            return null;
        }

        List<Link> disambig = getDisambiguationLinks(doc);
        List<Link> topLinks = getWikiPageLinks(doc);

        doc.filter(CleanerFilter.builder()
                .badClasses(Set.of("infobox", "collapsible", "navbar", "printfooter",
                        "mw-editsection", "thumb", "sidebar", "navbox", "mw-jump-link",
                        "vertical-navbox", "mw-indicators", "noprint", "sistersitebox",
                        "BarChartTemplate"))
                .badIds(Set.of("coordinates", "mw-page-base", "mw-head-base", "site-notice", "contentSub", "contentSub2"))
                .badTags(Set.of("footer", "script", "object", "embed", "audio", "style", "nosript", "link", "meta", "img"))
                .predicates(Set.of(this::isPresentationRole, this::isLicenseFooter))
                .build());

        doc.getElementsByTag("a").forEach(tag -> {
            var href = tag.attr("href");
            var parent = tag.parent();

            if (null != parent && "li".equals(parent.tagName())) {
                tag.removeAttr("title");

                if (href.startsWith("http://")) {
                    tag.addClass("extern-link");
                    tag.attr("rel", "nofollow");
                }
            } else {
                tag.replaceWith(new TextNode(tag.text()));
            }
        });

        doc.getElementsByTag("cite").tagName("span");

        doc.filter(CleanerFilter.builder()
                .badIds(Set.of("toc", "catlinks", "Notes", "mw-navigation", "mw-data-after-content", "jump-to-nav"))
                .badClasses(Set.of("mw-references-wrap", "references", "reference", "siteSub", "refbegin"))
                .build()
        );

        doc.getAllElements().forEach(elem -> {
            if (elem.parent() != null
                    && "summary".equals(elem.parent().tagName()))
            {
                elem.parent().replaceWith(elem);
            }
        });

        doc.getElementsByClass("mwe-math-element").forEach(mathSpan -> {
            var mathTag = mathSpan.getElementsByTag("math").first();
            if (mathTag != null) {
                mathSpan.replaceWith(mathTag);
            }
        });

        doc.getElementsByTag("span").forEach(elem -> {
            if ("pre".equals(elem.parent().tagName())) {
                if (elem.hasClass("linenos")) {
                    elem.replaceWith(new TextNode(String.format("%-4s", elem.text())));
                }
                else {
                    elem.replaceWith(new TextNode(elem.text()));
                }
            }
            else {
                elem.replaceWith(new TextNode(" " + elem.text() + " "));
            }
        });

        doc.getElementsByTag("details").forEach(deets -> {
            if (deets.children().size() == 1) {
                deets.replaceWith(deets.children().first());
            }
            else {
                deets.tagName("div");
            }
        });

        removeSingularlyNestedDivs(doc);

        removeEmptyTags(doc, "li");
        removeEmptyTags(doc, "ul");
        removeEmptyTags(doc, "div");

        doc.getElementsByTag("p").forEach(elem -> {
            if ("blockquote".equals(elem.parent().tagName())) {
                elem.replaceWith(new TextNode(elem.text()));
            }
        });

        removeEmptyTags(doc, "p");

        cascadingHeaderCleanup(doc, "h4", "h3", "h2");
        cascadingHeaderCleanup(doc, "h3", "h2");
        cascadingHeaderCleanup(doc, "h2");

        doc.getElementsByTag("table").forEach(table -> {
            table.attr("border", "1");

            if ("right".equals(table.attr("align"))) {
                table.remove();
            }
        });

        doc.getAllElements().forEach(elem -> {
            removeWikiClassNames(elem);

            elem.removeAttr("lang");
            elem.removeAttr("dir");
            elem.removeAttr("id");
            elem.removeAttr("role");
            elem.removeAttr("style");
            elem.removeAttr("tabindex");
            elem.removeAttr("aria-haspopup");
            elem.removeAttr("data-section-id");
            elem.removeAttr("aria-expanded");
            elem.removeAttr("aria-pressed");
            elem.removeAttr("open");
            elem.removeAttr("data-level");
        });

        doc.getElementsByTag("table").remove();

        // Remove the first header since we'll insert our own in the templating
        Optional.ofNullable(doc.getElementsByTag("h1").first()).ifPresent(Element::remove);

        ArticleParts articleParts = getDocumentParts(doc);

        return new Article(
                url,
                doc.title(),
                articleParts.getSummary(),
                articleParts,
                new LinkList(topLinks),
                new LinkList(disambig)
        ).asData();
    }

    private void removeWikiClassNames(Element elem) {
        final String classNames = elem.className();

        // Note that the string with class names isn't split,
        // this is fairly expensive and since most tags don't even
        // have classes, we'll optimistically check for presence and then
        // pay for the expensive removeClass operation even if unnecessary
        // due to a false positive

        if (classNames.contains("verb")) {
            elem.removeClass("verb");
        }

        if (classNames.contains("extern-link")) {
            elem.removeClass("extern-link");
        }

        if (classNames.contains("margin-note")) {
            elem.removeClass("margin-note");
        }

        if (classNames.contains("wikitable")) {
            elem.removeClass("wikitable");
        }
    }

    public static ArticleParts getDocumentParts(Document doc) {

        // We expect the document to be one container div with a bunch of children
        // each corresponding to a section of the document

        var rootDiv = doc.getElementsByTag("div").first();

        if (null == rootDiv) {
            return new ArticleParts(List.of());
        }

        // To be maximally useful, we want the article as a series of divs corresponding to
        // logical sections of the article

        List<String> parts = new ArrayList<>();

        Element normalizingDiv = null;
        for (Element child : rootDiv.children()) {
            boolean isDiv = "div".equals(child.tagName());

            if (!isDiv && normalizingDiv == null) {
                normalizingDiv = new Element("div");
            }

            if (isDiv && normalizingDiv != null) {
                if (normalizingDiv.childrenSize() > 0) {
                    parts.add(normalizingDiv.outerHtml());
                }
                normalizingDiv = null;
            }

            if (normalizingDiv != null) normalizingDiv.appendChild(child.clone());
            if (isDiv && child.childrenSize() > 0) parts.add(child.outerHtml());

        }
        if (normalizingDiv != null &&
                normalizingDiv.childrenSize() > 0)
        {
            parts.add(normalizingDiv.outerHtml());
        }

        return new ArticleParts(parts);
    }

    private void removeSingularlyNestedDivs(Document doc) {
        // Remove divs that only contain a single div, and replace them with the inner div

        for (Element div : doc.getElementsByTag("div")) {
            final Elements children = div.children();

            if (children.size() != 1)
                continue;

            final Element childDiv = children.first();

            if (null != childDiv && "div".equals(childDiv.tagName())) {
                div.replaceWith(childDiv);
            }
        }
    }

    private void cascadingHeaderCleanup(Document doc, String currH, String... nextHeaders) {
        doc.getElementsByTag(currH).forEach(elem -> {
            var next = elem.nextElementSibling();
            if (next == null) {
                elem.remove();
                return;
            }
            String nextTagName = next.tagName();
            if (currH.equals(nextTagName)) {
                elem.remove();
            }
            else for (String h : nextHeaders) {
                if (h.equals(nextTagName)) {
                    elem.remove();
                }
            }
        });
    }

    private void removeEmptyTags(Document doc, String tag) {
        doc.getElementsByTag(tag).forEach(elem -> {
            if (elem.text().isBlank() && elem.getElementsByTag("img").isEmpty()) {
                elem.replaceWith(new TextNode(" "));
            }
        });
    }

    @NotNull
    private List<Link> getWikiPageLinks(Document doc) {
        List<Link> topLinks = new ArrayList<>();
        doc.select("p a").forEach(atag -> {
            String href = atag.attr("href");

            if (!href.isBlank()
                    && !href.contains(":")
                    && !href.startsWith("#")
            ) {
                topLinks.add(new Link(href, atag.attr("title")));
            }
        });
        return topLinks;
    }

    @NotNull
    private List<Link> getDisambiguationLinks(Document doc) {
        List<Link> disambig = new ArrayList<>();

        for (var note: doc.getElementsByClass("hatnote")) {
            for (var atag : note.getElementsByTag("a")) {
                String href = atag.attr("href");
                if (atag.hasClass("mw-disambig") && !href.isBlank()) {
                    disambig.add(new Link(href, atag.attr("title")));
                }
            }
            note.remove();
        }

        return disambig;
    }

}

@@ -0,0 +1,10 @@ (new file)
package nu.marginalia.encyclopedia.cleaner.model;

public record ArticleData(
        String url,
        String title,
        String summary,
        byte[] parts,
        byte[] links,
        byte[] disambigs) {
}

@@ -0,0 +1,44 @@ (new file)
package nu.marginalia.encyclopedia.cleaner.model;

import org.jsoup.Jsoup;

import java.util.List;

public record ArticleParts(List<String> parts) {
    public ArticleParts(String... parts) {
        this(List.of(parts));
    }
    public String articleHtml() {
        StringBuilder sb = new StringBuilder();
        for (String part : parts()) {
            sb.append(part);
        }
        return sb.toString();
    }

    public String getSummary() {
        if (parts.isEmpty())
            return "";

        String firstPart = parts.get(0);
        var doclet = Jsoup.parse(firstPart);
        doclet.getElementsByTag("b").tagName("span");
        var firstP = doclet.select("p").first();

        if (null == firstP)
            return "";

        StringBuilder ret = new StringBuilder();
        ret.append(firstP.outerHtml());

        var nextSibling = firstP.nextElementSibling();

        if (nextSibling != null &&
                !"p".equals(nextSibling.tagName()) &&
                !"table".equals(nextSibling.tagName()))
        {
            ret.append(" ").append(nextSibling.outerHtml());
        }
        return ret.toString();
    }
}

@@ -0,0 +1,35 @@ (new file)
package nu.marginalia.encyclopedia.model;

import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import nu.marginalia.encyclopedia.cleaner.model.ArticleParts;
import nu.marginalia.encyclopedia.store.ArticleCodec;

public record Article (
        String url,
        String title,
        String summary,
        ArticleParts parts,
        LinkList urls,
        LinkList disambigs)
{

    public ArticleData asData() {
        return new ArticleData(
                url(),
                title(),
                summary(),
                ArticleCodec.toCompressedJson(parts),
                ArticleCodec.toCompressedJson(urls),
                ArticleCodec.toCompressedJson(disambigs)
        );
    }

    /** Used by template */
    public String articleHtml() {
        if (parts == null) {
            return "";
        }

        return parts.articleHtml();
    }
}

@@ -0,0 +1,3 @@ (new file)
package nu.marginalia.encyclopedia.model;

public record Link(String url, String text) { }

@@ -0,0 +1,13 @@ (new file)
package nu.marginalia.encyclopedia.model;

import java.util.List;

public record LinkList(List<Link> links) {
    public LinkList(Link... links) {
        this(List.of(links));
    }

    public int size() {
        return links.size();
    }
}

@@ -0,0 +1,33 @@ (new file)
package nu.marginalia.encyclopedia.model;

import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

public record ReferencedArticle(String title,
                                List<String> aliases,
                                String url,
                                String summary) implements Comparable<ReferencedArticle> {
    public ReferencedArticle(String title, String url, String summary) {
        this(title, List.of(), url, summary);
    }

    public ReferencedArticle withAliases(List<String> aliases) {
        if (aliases != null && aliases.size() > 1) {
            var cleanAliases = new ArrayList<>(aliases);
            cleanAliases.remove(title());
            return new ReferencedArticle(title(), cleanAliases, url(), summary());
        }

        return this;
    }

    private String compareKey() {
        return url.toLowerCase();
    }
    @Override
    public int compareTo(@NotNull ReferencedArticle referencedArticle) {
        return compareKey().compareTo(referencedArticle.compareKey());
    }
}

@@ -0,0 +1,25 @@ (new file)
package nu.marginalia.encyclopedia.store;

import com.github.luben.zstd.Zstd;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class ArticleCodec {
    private static final Gson gson = new GsonBuilder()
            .registerTypeAdapterFactory(RecordTypeAdapterFactory.builder().allowMissingComponentValues().create())
            .create();

    public static byte[] toCompressedJson(Object any) {
        return Zstd.compress(gson.toJson(any).getBytes());
    }
    public static <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
        return gson.fromJson(new InputStreamReader(new ZstdInputStream(new ByteArrayInputStream(stream))), type);
    }

}

@@ -0,0 +1,33 @@ (new file)
package nu.marginalia.encyclopedia.store;

import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class ArticleDbProvider {
    private final Connection connection;

    public ArticleDbProvider(Path filename) throws SQLException {
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);

        try (var stmt = connection.createStatement()) {
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS articles (
                        url TEXT PRIMARY KEY,
                        title TEXT NOT NULL,
                        summary TEXT NOT NULL,
                        html BLOB NOT NULL,
                        urls BLOB NOT NULL,
                        disambigs BLOB NOT NULL
                    )
                    """);
        }
    }

    public Connection getConnection() {
        return connection;
    }
}

@@ -0,0 +1,102 @@ (new file)
package nu.marginalia.encyclopedia.store;

import nu.marginalia.encyclopedia.cleaner.model.ArticleData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class ArticleStoreWriter implements AutoCloseable {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Connection connection;
    private final LinkedBlockingQueue<ArticleData> queue = new LinkedBlockingQueue<>(1000);

    Thread insertThread;
    volatile boolean running;

    public ArticleStoreWriter(ArticleDbProvider dbProvider) throws SQLException {
        connection = dbProvider.getConnection();

        try (var stmt = connection.createStatement()) {
            stmt.execute("PRAGMA synchronous = OFF");
            stmt.execute("PRAGMA journal_mode = MEMORY");
        }

        running = true;
        insertThread = new Thread(this::insertLoop);
        insertThread.start();
    }

    private void insertLoop() {
        List<ArticleData> toAdd = new ArrayList<>();
        while (running || !queue.isEmpty()) {
            try {
                while (0 != queue.drainTo(toAdd, 100)) {
                    insertItems(toAdd);
                    toAdd.clear();
                }
                if (queue.isEmpty()) {
                    // Yield for a moment to avoid busy looping
                    TimeUnit.NANOSECONDS.sleep(100);
                }
            } catch (SQLException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
    }

    private void insertItems(List<ArticleData> toAdd) throws SQLException {
        try (var stmt = connection.prepareStatement("""
                INSERT OR IGNORE INTO articles (url, title, html, summary, urls, disambigs)
                VALUES (?, ?, ?, ?, ?, ?)
                """))
        {
            connection.setAutoCommit(false); // Disable auto-commit mode
            for (var article : toAdd) {
                stmt.setString(1, article.url());
                stmt.setString(2, article.title());
                stmt.setBytes(3, article.parts());
                stmt.setString(4, article.summary());
                stmt.setBytes(5, article.links());
                stmt.setBytes(6, article.disambigs());

                stmt.addBatch();
            }
            stmt.executeBatch();
            connection.commit(); // Commit the transaction
        } catch (SQLException e) {
            connection.rollback(); // Rollback the transaction in case of error
            logger.warn("SQL error", e);
        } finally {
            connection.setAutoCommit(true); // Re-enable auto-commit mode
        }
    }

    public void add(ArticleData article) {
        try {
            queue.put(article);
        }
        catch (InterruptedException e) {
            logger.warn("Interrupted", e);
            throw new RuntimeException(e);
        }
    }

    public void close() {
        running = false;
        try {
            insertThread.join();
            connection.close();
        } catch (InterruptedException|SQLException e) {
            logger.warn("Error", e);
        }
    }

}

@@ -221,7 +221,7 @@ public class ZIMReader {

     // Gives the minimum required information needed for the given articleName
     public DirectoryEntry forEachArticles(BiConsumer<String, String> consumer, Predicate<Integer> blobPred)
-            throws IOException {
+            throws IOException, InterruptedException {

         int numberOfArticles = mFile.getArticleCount();
         long beg = mFile.getTitlePtrPos();
@@ -237,6 +237,10 @@ public class ZIMReader {
         for (long i = beg; i < end; i+=4) {
             var entry = getDirectoryInfoAtTitlePosition(i);

+            if (Thread.interrupted()) {
+                throw new InterruptedException();
+            }
+
             if (((i-beg)%100_000) == 0) {
                 System.out.printf("%f%%\n", ((i-beg) * 100.) / (end-beg));
             }
@@ -249,21 +253,25 @@ public class ZIMReader {

         System.out.println("Iterating over " + data.keySet().stream().mapToInt(Integer::intValue).max() + "clusters");

-        data.forEach((pos,blobs) -> {
-            if (!blobPred.test(pos)) {
-                return;
-            }
+        var iter = data.entrySet().iterator();
+        while (iter.hasNext()) {
+            if (Thread.interrupted()) throw new InterruptedException();
+
+            var next = iter.next();
+            int pos = next.getKey();
+
+            if (!blobPred.test(pos)) continue;
+            Map<Integer, String> blobs = next.getValue();
+
             try {
                 getArticleData(consumer, pos, blobs);
             }
             catch (Exception ex) {
-                ex.printStackTrace();
+                throw new RuntimeException(ex);
             }
-        });
+        }

         return null;
     }