Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
Improved crawl-job-extractor.

Let crawl-job-extractor run offline and allow it to read domains from a file. Improved docs.
parent 9455100907
commit 4fc0ddbc45
@@ -24,6 +24,7 @@ dependencies {
     implementation project(':code:common:db')
     implementation project(':code:common:model')
     implementation project(':code:common:service')
+    implementation project(':code:common:service-discovery')
     implementation project(':code:process-models:crawling-model')

     implementation libs.lombok
@@ -3,4 +3,49 @@
 The crawl job extractor creates a file containing a list of domains
 along with known URLs.

 This is consumed by [processes/crawling-process](../../processes/crawling-process).
+
+## Usage
+
+The crawl job extractor has three modes of operation:
+
+```
+# 1 grab domains from the database
+./crawl-job-extractor file.out
+
+# 2 grab domains from a file
+./crawl-job-extractor file.out -f domains.txt
+
+# 3 grab domains from the command line
+./crawl-job-extractor file.out domain1 domain2 ...
+```
+
+* When only a single argument is passed (the file name to write to), it will create a complete list of domains
+  and URLs known to the system from the list of already indexed domains,
+  as well as domains from the CRAWL_QUEUE table in the database.
+* When invoked as `./crawl-job-extractor output-file -f domains.txt`,
+  domains will be read from the non-blank, non-comment lines of the file.
+* In other cases, the 2nd argument onward will be interpreted as domain names.
+
+In the last two modes, if the crawl-job-extractor is able to connect to the database, it will use
+information from the link database to populate the list of URLs for each domain; otherwise it will
+create a spec with only the domain name and the index address, so the crawler will have to figure out
+the rest.
+
+The crawl specification is zstd-compressed JSON.
+
+## Tricks
+
+### Joining two specifications
+
+Two or more specifications can be joined with a shell command of the form
+
+```shell
+$ zstdcat file1 file2 | zstd -o new-file
+```
+
+### Inspection
+
+The file can also be inspected with `zstdless`,
+or with combinations like `zstdcat file | jq`
@@ -4,8 +4,8 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
+import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.db.DomainBlacklistImpl;

 import java.sql.ResultSet;
 import java.sql.SQLException;
@@ -68,11 +68,11 @@ public class CrawlJobDomainExtractor {
            """;


-    private final DomainBlacklistImpl blacklist;
+    private final DomainBlacklist blacklist;
     private final HikariDataSource dataSource;
     private static final HashFunction hasher = Hashing.murmur3_128(0);

-    public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
+    public CrawlJobDomainExtractor(DomainBlacklist blacklist, HikariDataSource dataSource) {
         this.blacklist = blacklist;
         this.dataSource = dataSource;
     }
@@ -107,7 +107,20 @@ public class CrawlJobDomainExtractor {
                 .map(this::createCrawlJobForDomain);
     }

-    public CrawlingSpecification extractDomain(EdgeDomain domain) {
+    public CrawlingSpecification extractNewDomain(EdgeDomain domain) {
+        CrawlingSpecification spec = new CrawlingSpecification();
+
+        spec.domain = domain.toString();
+        spec.id = createId(domain);
+        spec.urls = new ArrayList<>(1000);
+
+        spec.urls.add("https://"+domain+"/");
+        spec.crawlDepth = MIN_VISIT_COUNT;
+
+        return spec;
+    }
+
+    public CrawlingSpecification extractKnownDomain(EdgeDomain domain) {
         CrawlingSpecification spec = new CrawlingSpecification();

         spec.domain = domain.toString();
@@ -143,6 +156,7 @@ public class CrawlJobDomainExtractor {

         return spec;
     }

     private record DomainWithId(String domainName, int id) {
+

@@ -1,13 +1,16 @@
 package nu.marginalia.crawl;

+import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.db.DomainBlacklistImpl;
+import nu.marginalia.service.ServiceHomeNotConfiguredException;
 import nu.marginalia.service.module.DatabaseModule;

 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Stream;

@@ -15,7 +18,7 @@ public class CrawlJobExtractorMain {

     public static void main(String... args) throws IOException {
         if (args.length == 0) {
-            System.out.println("Parameters: outputfile.spec [domain1, domain2, ...]");
+            System.out.println("Parameters: outputfile.spec [-f domains.txt] | [domain1, domain2, ...]");
             System.out.println();
             System.out.println("If no domains are provided, a full crawl spec is created from the database");
             return;
@@ -27,21 +30,61 @@ public class CrawlJobExtractorMain {
             return;
         }

-        String[] targetDomains = Arrays.copyOfRange(args, 1, args.length);
+        String[] targetDomains = getTargetDomains(Arrays.copyOfRange(args, 1, args.length));

         try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile))
         {
             streamSpecs(targetDomains).forEach(out::accept);
         }
+
+        System.out.println("All done! Wrote " + outFile);
+    }
+
+    private static String[] getTargetDomains(String[] strings) throws IOException {
+        if (strings.length == 0)
+            return strings;
+
+        if (strings.length == 2 && "-f".equals(strings[0])) {
+            Path file = Path.of(strings[1]);
+
+            System.out.println("Reading domains from " + file);
+
+            try (var lines = Files.lines(file)) {
+                return lines
+                        .filter(s -> !s.isBlank())
+                        .filter(s -> !s.startsWith("#"))
+                        .map(String::trim)
+                        .map(String::toLowerCase)
+                        .toArray(String[]::new);
+            }
+        }
+
+        return strings;
     }

     private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
-        var ds = new DatabaseModule().provideConnection();
-        var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);

         if (targetDomains.length > 0) {
-            return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);
+            try {
+                var dataSource = new DatabaseModule().provideConnection();
+                var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(dataSource), dataSource);
+                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractKnownDomain);
+            }
+            catch (ServiceHomeNotConfiguredException ex) {
+                System.err.println("""
+                        Could not connect to database, running crawl job creation in bootstrap mode.
+                        This means that the crawl job will be created without any knowledge of the domains in the database.
+
+                        If this is not desirable, ensure that WMSA_HOME is configured and that the database is running.
+                        """);
+
+                var domainExtractor = new CrawlJobDomainExtractor(domain -> false, null);
+                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractNewDomain);
+            }
+
         } else {
+            var ds = new DatabaseModule().provideConnection();
+            var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
             return domainExtractor.extractDomainsFromQueue();
         }
     }
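A sketch of the new offline path introduced here: when the database is unreachable, `streamSpecs` catches `ServiceHomeNotConfiguredException`, substitutes a no-op blacklist (`domain -> false`), and builds bare specs via `extractNewDomain`, each seeded with only `https://<domain>/`. A hypothetical session, assuming WMSA_HOME is unconfigured and with made-up domain names; the output shown is approximate:

```shell
# Assumption: WMSA_HOME is unset, so DatabaseModule cannot provide a connection
# and the tool falls back to bootstrap mode for the listed domains.
$ ./crawl-job-extractor file.out www.example.com www.example.org
Could not connect to database, running crawl job creation in bootstrap mode.
...
All done! Wrote file.out
```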