From 2bf0c4497dec9597c2933539643636966dad9ff0 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Thu, 19 Oct 2023 13:20:32 +0200
Subject: [PATCH] (*) Tool for unfcking old crawl data so that it aligns with the new style IDs

---
 .../crawling/io/CrawlerOutputFile.java        |  6 ++
 code/tools/crawl-data-unfcker/build.gradle    | 55 ++++++++++++
 code/tools/crawl-data-unfcker/readme.md       |  3 +
 .../nu/marginalia/tools/CrawlDataUnfcker.java | 89 +++++++++++++++++++
 settings.gradle                               |  1 +
 5 files changed, 154 insertions(+)
 create mode 100644 code/tools/crawl-data-unfcker/build.gradle
 create mode 100644 code/tools/crawl-data-unfcker/readme.md
 create mode 100644 code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java

diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
index 37bafa92..a7661085 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
@@ -1,5 +1,7 @@
 package nu.marginalia.crawling.io;
 
+import org.apache.logging.log4j.util.Strings;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -18,6 +20,10 @@ public class CrawlerOutputFile {
     /** Return the Path to a file for the given id and name, creating the prerequisite
      * directory structure as necessary. */
     public static Path createOutputPath(Path base, String id, String name) throws IOException {
+        if (id.length() < 4) {
+            id = Strings.repeat("0", 4 - id.length()) + id;
+        }
+
         String first = id.substring(0, 2);
         String second = id.substring(2, 4);
 
diff --git a/code/tools/crawl-data-unfcker/build.gradle b/code/tools/crawl-data-unfcker/build.gradle
new file mode 100644
index 00000000..6673eab6
--- /dev/null
+++ b/code/tools/crawl-data-unfcker/build.gradle
@@ -0,0 +1,55 @@
+plugins {
+    id 'java'
+
+    id 'application'
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+application {
+    mainClass = 'nu.marginalia.tools.CrawlDataUnfcker'
+    applicationName = 'crawl-data-unfcker'
+}
+
+tasks.distZip.enabled = false
+
+dependencies {
+    implementation project(':third-party:rdrpostagger')
+    implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:monkey-patch-opennlp')
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:process')
+    implementation project(':code:common:service')
+    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:libraries:big-string')
+    implementation project(':code:processes:converting-process')
+    implementation project(':code:process-models:crawling-model')
+
+    implementation project(':code:features-convert:adblock')
+    implementation project(':code:features-convert:topic-detection')
+    implementation project(':code:features-convert:keyword-extraction')
+
+    implementation libs.bundles.slf4j
+    implementation libs.notnull
+
+    implementation libs.guice
+    implementation libs.jsoup
+    implementation libs.trove
+    implementation libs.fastutil
+
+    implementation libs.bundles.nlp
+    implementation libs.commons.lang3
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
diff --git a/code/tools/crawl-data-unfcker/readme.md b/code/tools/crawl-data-unfcker/readme.md
new file mode 100644
index 00000000..9c870953
--- /dev/null
+++ b/code/tools/crawl-data-unfcker/readme.md
@@ -0,0 +1,3 @@
+# Crawl Data Unfcker
+
+This is a migration tool that patches the generated ID of crawl data.
\ No newline at end of file
diff --git a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java
new file mode 100644
index 00000000..1a73a952
--- /dev/null
+++ b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java
@@ -0,0 +1,89 @@
+package nu.marginalia.tools;
+
+import nu.marginalia.crawling.io.CrawlerOutputFile;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+/** Migration tool that moves old crawl data files to the paths derived from
+ * new style IDs, and writes a crawler.log for the output directory.
+ */
+public class CrawlDataUnfcker {
+    /** Entry point; expects exactly two arguments, the input and output directory.
+     * <p>
+     * Each entry of the input crawler.log is moved to an output path created from
+     * the hex string of its domain's hashCode(), and marked finished in the output
+     * crawler.log. Entries with no readable CrawledDomain record are skipped.
+     */
+    public static void main(String... args) {
+        if (args.length != 2) {
+            System.out.println("Usage: crawl-data-unfcker input output");
+            return;
+        }
+
+        Path input = Path.of(args[0]);
+        Path output = Path.of(args[1]);
+
+        if (!Files.isDirectory(input)) {
+            System.err.println("Input directory is not valid");
+            return;
+        }
+        if (!Files.isDirectory(output)) {
+            System.err.println("Output directory is not valid");
+            return;
+        }
+
+        var reader = new CrawledDomainReader();
+
+        try (var wl = new WorkLog(output.resolve("crawler.log"))) {
+            for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) {
+                Path inputPath = input.resolve(inputItem.relPath());
+
+                var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain);
+                if (domainMaybe.isEmpty())
+                    continue;
+                var domain = domainMaybe.get();
+
+                // Generate conformant ID
+                String newId = Integer.toHexString(domain.hashCode());
+
+                var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain);
+                var outputFileName = outputPath.toFile().getName();
+
+                System.out.println(inputPath + " -> " + outputPath);
+                Files.move(inputPath, outputPath);
+
+                wl.setJobToFinished(domain, outputFileName, inputItem.cnt());
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Return the first CrawledDomain found in the file, or an empty Optional
+     * if the file is missing, holds no such record, or reading it fails.
+     */
+    static Optional<CrawledDomain> readDomain(CrawledDomainReader reader, Path file) {
+        if (!Files.exists(file)) {
+            System.out.println("Missing file " + file);
+            return Optional.empty();
+        }
+
+        try (var stream = reader.createDataStream(file)) {
+            while (stream.hasNext()) {
+                if (stream.next() instanceof CrawledDomain domain) {
+                    return Optional.of(domain);
+                }
+            }
+        }
+        catch (Exception ex) {
+            ex.printStackTrace();
+        }
+        return Optional.empty();
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index 45e6bcea..a7315ff1 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -79,6 +79,7 @@ include 'code:tools:website-adjacencies-calculator'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
 include 'code:tools:stackexchange-converter'
+include 'code:tools:crawl-data-unfcker'
 
 include 'third-party:porterstemmer'
 include 'third-party:xz'