Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)
commit 2bf0c4497d (parent 93122bdd18)

(*) Tool for unfcking old crawl data so that it aligns with the new style IDs
CrawlerOutputFile.java (nu.marginalia.crawling.io), modified:

@@ -1,5 +1,7 @@
 package nu.marginalia.crawling.io;
 
+import org.apache.logging.log4j.util.Strings;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;

@@ -18,6 +20,10 @@ public class CrawlerOutputFile {
     /** Return the Path to a file for the given id and name, creating the prerequisite
      * directory structure as necessary. */
     public static Path createOutputPath(Path base, String id, String name) throws IOException {
+        if (id.length() < 4) {
+            id = Strings.repeat("0", 4 - id.length()) + id;
+        }
+
         String first = id.substring(0, 2);
         String second = id.substring(2, 4);
 
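Note: the added zero-padding guarantees IDs are at least four characters long, so the two substring calls below it cannot throw on short IDs. A minimal runnable sketch of the effect (class name and sample value are hypothetical; String.repeat stands in for log4j's Strings.repeat):

    public class PaddingSketch {
        public static void main(String[] args) {
            String id = "7f";                          // a short, old-style ID
            if (id.length() < 4) {
                id = "0".repeat(4 - id.length()) + id; // same effect as Strings.repeat("0", 4 - id.length())
            }
            // The first four hex digits select the two shard directories:
            System.out.println(id.substring(0, 2) + "/" + id.substring(2, 4)); // prints 00/7f
        }
    }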
code/tools/crawl-data-unfcker/build.gradle (new file, 55 lines):

@@ -0,0 +1,55 @@
+plugins {
+    id 'java'
+
+    id 'application'
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+application {
+    mainClass = 'nu.marginalia.tools.CrawlDataUnfcker'
+    applicationName = 'crawl-data-unfcker'
+}
+
+tasks.distZip.enabled = false
+
+dependencies {
+    implementation project(':third-party:rdrpostagger')
+    implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:monkey-patch-opennlp')
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:process')
+    implementation project(':code:common:service')
+    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:libraries:big-string')
+    implementation project(':code:processes:converting-process')
+    implementation project(':code:process-models:crawling-model')
+
+    implementation project(':code:features-convert:adblock')
+    implementation project(':code:features-convert:topic-detection')
+    implementation project(':code:features-convert:keyword-extraction')
+
+    implementation libs.bundles.slf4j
+    implementation libs.notnull
+
+    implementation libs.guice
+    implementation libs.jsoup
+    implementation libs.trove
+    implementation libs.fastutil
+
+    implementation libs.bundles.nlp
+    implementation libs.commons.lang3
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
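Since the build applies the application plugin with applicationName = 'crawl-data-unfcker', Gradle's standard run and installDist tasks should be available for this module; assuming the usual wrapper at the repository root, an invocation would look roughly like:

    ./gradlew :code:tools:crawl-data-unfcker:run --args="<input-dir> <output-dir>"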
code/tools/crawl-data-unfcker/readme.md (new file, 3 lines):

@@ -0,0 +1,3 @@
+# Crawl Data Unfcker
+
+This is a migration tool that patches the generated ID of crawl data.
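The readme is terse, but the usage message in CrawlDataUnfcker.java below spells out the contract: the tool takes two existing directories, reads the input's crawler.log work log, moves each crawl file to a path derived from its new-style ID, and writes a fresh crawler.log in the output directory. A hypothetical invocation via the generated start script:

    crawl-data-unfcker <old-crawl-dir> <new-crawl-dir>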
CrawlDataUnfcker.java (new file, 77 lines; nu.marginalia.tools):

@@ -0,0 +1,77 @@
+package nu.marginalia.tools;
+
+import nu.marginalia.crawling.io.CrawlerOutputFile;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+public class CrawlDataUnfcker {
+    public static void main(String... args) {
+        if (args.length != 2) {
+            System.out.println("Usage: crawl-data-unfcker input output");
+            return;
+        }
+
+        Path input = Path.of(args[0]);
+        Path output = Path.of(args[1]);
+
+        if (!Files.isDirectory(input)) {
+            System.err.println("Input directory is not valid");
+            return;
+        }
+        if (!Files.isDirectory(output)) {
+            System.err.println("Output directory is not valid");
+            return;
+        }
+
+        var reader = new CrawledDomainReader();
+
+        try (var wl = new WorkLog(output.resolve("crawler.log"))) {
+            for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) {
+                Path inputPath = input.resolve(inputItem.relPath());
+
+                var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain);
+                if (domainMaybe.isEmpty())
+                    continue;
+                var domain = domainMaybe.get();
+
+                // Generate conformant ID
+                String newId = Integer.toHexString(domain.hashCode());
+
+                var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain);
+                var outputFileName = outputPath.toFile().getName();
+
+                System.out.println(inputPath + " -> " + outputPath);
+                Files.move(inputPath, outputPath);
+
+                wl.setJobToFinished(domain, outputFileName, inputItem.cnt());
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    static Optional<CrawledDomain> readDomain(CrawledDomainReader reader, Path file) {
+        if (!Files.exists(file)) {
+            System.out.println("Missing file " + file);
+            return Optional.empty();
+        }
+
+        try (var stream = reader.createDataStream(file)) {
+            while (stream.hasNext()) {
+                if (stream.next() instanceof CrawledDomain domain) {
+                    return Optional.of(domain);
+                }
+            }
+        }
+        catch (Exception ex) {
+            ex.printStackTrace();
+        }
+        return Optional.empty();
+    }
+}
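Worth noting about the ID scheme above: Integer.toHexString renders a negative hashCode as eight unsigned hex digits, so the padding in CrawlerOutputFile only kicks in for small non-negative hash codes. A minimal self-contained sketch of the derivation (domain value and class name are hypothetical; the layout past the two shard directories is assumed):

    public class IdSchemeSketch {
        public static void main(String[] args) {
            String domain = "www.example.com";                  // hypothetical domain
            String id = Integer.toHexString(domain.hashCode()); // the "conformant" ID
            if (id.length() < 4) {
                id = "0".repeat(4 - id.length()) + id;          // mirrors the CrawlerOutputFile fix
            }
            // Shard into <output>/<first two>/<next two>/..., as createOutputPath does:
            System.out.println(id.substring(0, 2) + "/" + id.substring(2, 4));
        }
    }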
settings.gradle, modified:

@@ -79,6 +79,7 @@ include 'code:tools:website-adjacencies-calculator'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
 include 'code:tools:stackexchange-converter'
+include 'code:tools:crawl-data-unfcker'
 
 include 'third-party:porterstemmer'
 include 'third-party:xz'