Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)
commit 2bf0c4497d (parent 93122bdd18)

(*) Tool for unfcking old crawl data so that it aligns with the new style IDs
CrawlerOutputFile.java (nu.marginalia.crawling.io), modified:

@@ -1,5 +1,7 @@
 package nu.marginalia.crawling.io;
 
+import org.apache.logging.log4j.util.Strings;
+
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;

@@ -18,6 +20,10 @@ public class CrawlerOutputFile {
     /** Return the Path to a file for the given id and name, creating the prerequisite
      * directory structure as necessary. */
     public static Path createOutputPath(Path base, String id, String name) throws IOException {
+        if (id.length() < 4) {
+            id = Strings.repeat("0", 4 - id.length()) + id;
+        }
+
         String first = id.substring(0, 2);
         String second = id.substring(2, 4);
 
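Note: the added zero-padding guarantees IDs are at least four characters long, so the two substring calls below it cannot throw on short IDs. A minimal runnable sketch of the effect (class name and sample value are hypothetical; String.repeat stands in for log4j's Strings.repeat):

    public class PaddingSketch {
        public static void main(String[] args) {
            String id = "7f";                          // a short, old-style ID
            if (id.length() < 4) {
                id = "0".repeat(4 - id.length()) + id; // same effect as Strings.repeat("0", 4 - id.length())
            }
            // The first four hex digits select the two shard directories:
            System.out.println(id.substring(0, 2) + "/" + id.substring(2, 4)); // prints 00/7f
        }
    }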
code/tools/crawl-data-unfcker/build.gradle (new file, 55 lines):

@@ -0,0 +1,55 @@
+plugins {
+    id 'java'
+
+    id 'application'
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(21))
+    }
+}
+
+application {
+    mainClass = 'nu.marginalia.tools.CrawlDataUnfcker'
+    applicationName = 'crawl-data-unfcker'
+}
+
+tasks.distZip.enabled = false
+
+dependencies {
+    implementation project(':third-party:rdrpostagger')
+    implementation project(':third-party:porterstemmer')
+    implementation project(':third-party:monkey-patch-opennlp')
+    implementation project(':code:common:model')
+    implementation project(':code:common:config')
+    implementation project(':code:common:process')
+    implementation project(':code:common:service')
+    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:term-frequency-dict')
+    implementation project(':code:libraries:big-string')
+    implementation project(':code:processes:converting-process')
+    implementation project(':code:process-models:crawling-model')
+
+    implementation project(':code:features-convert:adblock')
+    implementation project(':code:features-convert:topic-detection')
+    implementation project(':code:features-convert:keyword-extraction')
+
+    implementation libs.bundles.slf4j
+    implementation libs.notnull
+
+    implementation libs.guice
+    implementation libs.jsoup
+    implementation libs.trove
+    implementation libs.fastutil
+
+    implementation libs.bundles.nlp
+    implementation libs.commons.lang3
+
+    testImplementation libs.bundles.slf4j.test
+    testImplementation libs.bundles.junit
+    testImplementation libs.mockito
+}
+
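Since the build applies the application plugin with applicationName = 'crawl-data-unfcker', Gradle's standard run and installDist tasks should be available for this module; assuming the usual wrapper at the repository root, an invocation would look roughly like:

    ./gradlew :code:tools:crawl-data-unfcker:run --args="<input-dir> <output-dir>"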
code/tools/crawl-data-unfcker/readme.md (new file, 3 lines):

@@ -0,0 +1,3 @@
+# Crawl Data Unfcker
+
+This is a migration tool that patches the generated ID of crawl data.
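The readme is terse, but the usage message in CrawlDataUnfcker.java below spells out the contract: the tool takes two existing directories, reads the input's crawler.log work log, moves each crawl file to a path derived from its new-style ID, and writes a fresh crawler.log in the output directory. A hypothetical invocation via the generated start script:

    crawl-data-unfcker <old-crawl-dir> <new-crawl-dir>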
CrawlDataUnfcker.java (new file, 77 lines; nu.marginalia.tools):

@@ -0,0 +1,77 @@
+package nu.marginalia.tools;
+
+import nu.marginalia.crawling.io.CrawlerOutputFile;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.process.log.WorkLog;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Optional;
+
+public class CrawlDataUnfcker {
+    public static void main(String... args) {
+        if (args.length != 2) {
+            System.out.println("Usage: crawl-data-unfcker input output");
+            return;
+        }
+
+        Path input = Path.of(args[0]);
+        Path output = Path.of(args[1]);
+
+        if (!Files.isDirectory(input)) {
+            System.err.println("Input directory is not valid");
+            return;
+        }
+        if (!Files.isDirectory(output)) {
+            System.err.println("Output directory is not valid");
+            return;
+        }
+
+        var reader = new CrawledDomainReader();
+
+        try (var wl = new WorkLog(output.resolve("crawler.log"))) {
+            for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) {
+                Path inputPath = input.resolve(inputItem.relPath());
+
+                var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain);
+                if (domainMaybe.isEmpty())
+                    continue;
+                var domain = domainMaybe.get();
+
+                // Generate conformant ID
+                String newId = Integer.toHexString(domain.hashCode());
+
+                var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain);
+                var outputFileName = outputPath.toFile().getName();
+
+                System.out.println(inputPath + " -> " + outputPath);
+                Files.move(inputPath, outputPath);
+
+                wl.setJobToFinished(domain, outputFileName, inputItem.cnt());
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    static Optional<CrawledDomain> readDomain(CrawledDomainReader reader, Path file) {
+        if (!Files.exists(file)) {
+            System.out.println("Missing file " + file);
+            return Optional.empty();
+        }
+
+        try (var stream = reader.createDataStream(file)) {
+            while (stream.hasNext()) {
+                if (stream.next() instanceof CrawledDomain domain) {
+                    return Optional.of(domain);
+                }
+            }
+        }
+        catch (Exception ex) {
+            ex.printStackTrace();
+        }
+        return Optional.empty();
+    }
+}
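Worth noting about the ID scheme above: Integer.toHexString renders a negative hashCode as eight unsigned hex digits, so the padding in CrawlerOutputFile only kicks in for small non-negative hash codes. A minimal self-contained sketch of the derivation (domain value and class name are hypothetical; the layout past the two shard directories is assumed):

    public class IdSchemeSketch {
        public static void main(String[] args) {
            String domain = "www.example.com";                  // hypothetical domain
            String id = Integer.toHexString(domain.hashCode()); // the "conformant" ID
            if (id.length() < 4) {
                id = "0".repeat(4 - id.length()) + id;          // mirrors the CrawlerOutputFile fix
            }
            // Shard into <output>/<first two>/<next two>/..., as createOutputPath does:
            System.out.println(id.substring(0, 2) + "/" + id.substring(2, 4));
        }
    }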
settings.gradle, modified:

@@ -79,6 +79,7 @@ include 'code:tools:website-adjacencies-calculator'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
 include 'code:tools:stackexchange-converter'
+include 'code:tools:crawl-data-unfcker'
 
 include 'third-party:porterstemmer'
 include 'third-party:xz'