From 2bf0c4497dec9597c2933539643636966dad9ff0 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren <vlofgren@gmail.com>
Date: Thu, 19 Oct 2023 13:20:32 +0200
Subject: [PATCH] (*) Tool for unfcking old crawl data so that it aligns with
 the new style IDs

---
 .../crawling/io/CrawlerOutputFile.java        |  6 ++
 code/tools/crawl-data-unfcker/build.gradle    | 55 +++++++++++++
 code/tools/crawl-data-unfcker/readme.md       |  3 +
 .../nu/marginalia/tools/CrawlDataUnfcker.java | 77 +++++++++++++++++++
 settings.gradle                               |  1 +
 5 files changed, 142 insertions(+)
 create mode 100644 code/tools/crawl-data-unfcker/build.gradle
 create mode 100644 code/tools/crawl-data-unfcker/readme.md
 create mode 100644 code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java

diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
index 37bafa92..a7661085 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
@@ -1,5 +1,7 @@
 package nu.marginalia.crawling.io;
 
+import org.apache.logging.log4j.util.Strings;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -18,6 +20,10 @@ public class CrawlerOutputFile {
     /** Return the Path to a file for the given id and name, creating the prerequisite
      * directory structure as necessary. */
     public static Path createOutputPath(Path base, String id, String name) throws IOException {
+        if (id.length() < 4) {
+            id = Strings.repeat("0", 4 - id.length()) + id;
+        }
+
         String first = id.substring(0, 2);
         String second = id.substring(2, 4);
 
diff --git a/code/tools/crawl-data-unfcker/build.gradle b/code/tools/crawl-data-unfcker/build.gradle
new file mode 100644
index 00000000..6673eab6
--- /dev/null
+++ b/code/tools/crawl-data-unfcker/build.gradle
@@ -0,0 +1,55 @@
+plugins {
+    id 'java'
+
+    id 'application'
+
+    id 'jvm-test-suite'
+}
+// Compile and run with a Java 21 toolchain.
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+// Main class and application name for the 'application' plugin.
+application {
+    mainClass = 'nu.marginalia.tools.CrawlDataUnfcker'
+    applicationName = 'crawl-data-unfcker'
+}
+// Disable the distZip task.
+tasks.distZip.enabled = false
+// NOTE(review): the tool's source only imports nu.marginalia.crawling.* and nu.marginalia.process.log.WorkLog; presumably several of these are unneeded - confirm before pruning.
+dependencies {
+    implementation project(':third-party:rdrpostagger')
+    implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:monkey-patch-opennlp')
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:process')
+    implementation project(':code:common:service')
+    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:libraries:big-string')
+    implementation project(':code:processes:converting-process')
+    implementation project(':code:process-models:crawling-model')
+
+    implementation project(':code:features-convert:adblock')
+    implementation project(':code:features-convert:topic-detection')
+    implementation project(':code:features-convert:keyword-extraction')
+
+    implementation libs.bundles.slf4j
+    implementation libs.notnull
+
+    implementation libs.guice
+    implementation libs.jsoup
+    implementation libs.trove
+    implementation libs.fastutil
+
+    implementation libs.bundles.nlp
+    implementation libs.commons.lang3
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
diff --git a/code/tools/crawl-data-unfcker/readme.md b/code/tools/crawl-data-unfcker/readme.md
new file mode 100644
index 00000000..9c870953
--- /dev/null
+++ b/code/tools/crawl-data-unfcker/readme.md
@@ -0,0 +1,3 @@
+# Crawl Data Unfcker
+
+This is a migration tool that moves old crawl data into a new output directory, regenerating each file's ID from its domain name so that it matches the new-style IDs.
\ No newline at end of file
diff --git a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java
new file mode 100644
index 00000000..1a73a952
--- /dev/null
+++ b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java
@@ -0,0 +1,77 @@
+package nu.marginalia.tools;
+
+import nu.marginalia.crawling.io.CrawlerOutputFile;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+public class CrawlDataUnfcker {
+    /** Moves every crawl file listed in the input directory's crawler.log to the path
+     * that CrawlerOutputFile provides for an ID regenerated from the file's domain name,
+     * and records each moved file in a crawler.log in the output directory. */
+    public static void main(String... args) {
+        if (args.length != 2) {
+            System.out.println("Usage: crawl-data-unfcker input output");
+            return;
+        }
+        final Path sourceDir = Path.of(args[0]);
+        final Path targetDir = Path.of(args[1]);
+
+        if (!Files.isDirectory(sourceDir)) {
+            System.err.println("Input directory is not valid");
+            return;
+        }
+        if (!Files.isDirectory(targetDir)) {
+            System.err.println("Output directory is not valid");
+            return;
+        }
+
+        final CrawledDomainReader domainReader = new CrawledDomainReader();
+        try (WorkLog migratedLog = new WorkLog(targetDir.resolve("crawler.log"))) {
+            for (var entry : WorkLog.iterable(sourceDir.resolve("crawler.log"))) {
+                final Path oldFile = sourceDir.resolve(entry.relPath());
+                // Skip entries for which no domain name could be read from the file
+                final String domainName = readDomain(domainReader, oldFile)
+                        .map(CrawledDomain::getDomain)
+                        .orElse(null);
+                if (domainName == null) {
+                    continue;
+                }
+
+                // The conformant ID is the hexadecimal hash code of the domain name
+                final String regeneratedId = Integer.toHexString(domainName.hashCode());
+                final Path newFile = CrawlerOutputFile.createOutputPath(targetDir, regeneratedId, domainName);
+
+                System.out.println(oldFile + " -> " + newFile);
+                Files.move(oldFile, newFile);
+                migratedLog.setJobToFinished(domainName, newFile.toFile().getName(), entry.cnt());
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Returns the first CrawledDomain record in the given file's data stream, or an empty
+     * Optional if the file does not exist, has no such record, or reading it fails. */
+    static Optional<CrawledDomain> readDomain(CrawledDomainReader reader, Path file) {
+        if (Files.exists(file)) {
+            try (var records = reader.createDataStream(file)) {
+                while (records.hasNext()) {
+                    if (records.next() instanceof CrawledDomain found) {
+                        return Optional.of(found);
+                    }
+                }
+            } catch (Exception ex) {
+                ex.printStackTrace();
+            }
+        } else {
+            System.out.println("Missing file " + file);
+        }
+        return Optional.empty();
+    }
+}
diff --git a/settings.gradle b/settings.gradle
index 45e6bcea..a7315ff1 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -79,6 +79,7 @@ include 'code:tools:website-adjacencies-calculator'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
 include 'code:tools:stackexchange-converter'
+include 'code:tools:crawl-data-unfcker'
 
 include 'third-party:porterstemmer'
 include 'third-party:xz'