Improved crawl-job-extractor.

Let crawl-job-extractor run offline and allow it to read domains from file.
Improved docs.
Viktor Lofgren 2023-06-20 11:35:14 +02:00
parent 9455100907
commit 4fc0ddbc45
4 changed files with 114 additions and 11 deletions


@@ -24,6 +24,7 @@ dependencies {
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:service-discovery')
implementation project(':code:process-models:crawling-model')
implementation libs.lombok


@@ -4,3 +4,48 @@ The crawl job extractor creates a file containing a list of domains
along with known URLs.
This is consumed by [processes/crawling-process](../../processes/crawling-process).
## Usage
The crawl job extractor has three modes of operation:
```
# 1 grab domains from the database
./crawl-job-extractor file.out
# 2 grab domains from a file
./crawl-job-extractor file.out -f domains.txt
# 3 grab domains from the command line
./crawl-job-extractor file.out domain1 domain2 ...
```
* When only a single argument is passed (the name of the file to write to), the extractor creates a complete list of domains and URLs known to the system, drawing both on the already indexed domains and on the CRAWL_QUEUE table in the database.
* When invoked as `./crawl-job-extractor output-file -f domains.txt`, domains are read from the given file, skipping blank lines and comment lines (see the example file below).
* Otherwise, the second argument onward is interpreted as a list of domain names.
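For the `-f` mode, the domains file is plain text with one domain per line. A hypothetical `domains.txt` could look like this (the entries are illustrative; blank lines and `#`-comments are ignored, and domain names are trimmed and lowercased):
```
# Domains to include in the crawl spec, one per line
www.example.com
blog.example.com

# Blank lines and comment lines are skipped
news.example.org
```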
In the last two modes, if the crawl-job-extractor is able to connect to the database, it will use
information from the link database to populate the list of URLs for each domain. If it cannot connect,
it will create a spec containing only the domain name and the address of the domain's index page,
and the crawler will have to figure out the rest on its own.

The crawl-specification is zstd-compressed JSON.
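As a rough illustration, a single bootstrap-mode record might decompress to something like the sketch below. The field names are the ones assigned to `CrawlingSpecification` in this commit's code; the values and the exact JSON layout are made up and assume default field naming in the serializer:
```
{
  "id": "3f9a2c1d7b8e4f60",
  "domain": "www.example.com",
  "urls": [ "https://www.example.com/" ],
  "crawlDepth": 5
}
```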
## Tricks
### Joining two specifications
Two or more specifications can be joined with a shell command of the form
```shell
$ zstdcat file1 file2 | zstd -o new-file
```
### Inspection
The file can also be inspected with `zstdless`,
or with combinations like `zstdcat file | jq`


@@ -4,8 +4,8 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklistImpl;
import java.sql.ResultSet;
import java.sql.SQLException;
@@ -68,11 +68,11 @@ public class CrawlJobDomainExtractor {
""";
private final DomainBlacklistImpl blacklist;
private final DomainBlacklist blacklist;
private final HikariDataSource dataSource;
private static final HashFunction hasher = Hashing.murmur3_128(0);
public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
public CrawlJobDomainExtractor(DomainBlacklist blacklist, HikariDataSource dataSource) {
this.blacklist = blacklist;
this.dataSource = dataSource;
}
@@ -107,7 +107,20 @@ public class CrawlJobDomainExtractor {
.map(this::createCrawlJobForDomain);
}
public CrawlingSpecification extractDomain(EdgeDomain domain) {
public CrawlingSpecification extractNewDomain(EdgeDomain domain) {
CrawlingSpecification spec = new CrawlingSpecification();
spec.domain = domain.toString();
spec.id = createId(domain);
spec.urls = new ArrayList<>(1000);
spec.urls.add("https://"+domain+"/");
spec.crawlDepth = MIN_VISIT_COUNT;
return spec;
}
public CrawlingSpecification extractKnownDomain(EdgeDomain domain) {
CrawlingSpecification spec = new CrawlingSpecification();
spec.domain = domain.toString();
@@ -143,6 +156,7 @@ public class CrawlJobDomainExtractor {
return spec;
}
private record DomainWithId(String domainName, int id) {


@@ -1,13 +1,16 @@
package nu.marginalia.crawl;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.service.ServiceHomeNotConfiguredException;
import nu.marginalia.service.module.DatabaseModule;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Stream;
@@ -15,7 +18,7 @@ public class CrawlJobExtractorMain {
public static void main(String... args) throws IOException {
if (args.length == 0) {
System.out.println("Parameters: outputfile.spec [domain1, domain2, ...]");
System.out.println("Parameters: outputfile.spec [-f domains.txt] | [domain1, domain2, ...]");
System.out.println();
System.out.println("If no domains are provided, a full crawl spec is created from the database");
return;
@@ -27,21 +30,61 @@ public class CrawlJobExtractorMain {
return;
}
String[] targetDomains = Arrays.copyOfRange(args, 1, args.length);
String[] targetDomains = getTargetDomains(Arrays.copyOfRange(args, 1, args.length));
try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile))
{
streamSpecs(targetDomains).forEach(out::accept);
}
System.out.println("All done! Wrote " + outFile);
}
private static String[] getTargetDomains(String[] strings) throws IOException {
if (strings.length == 0)
return strings;
if (strings.length == 2 && "-f".equals(strings[0])) {
Path file = Path.of(strings[1]);
System.out.println("Reading domains from " + file);
try (var lines = Files.lines(file)) {
return lines
.filter(s -> !s.isBlank())
.filter(s -> !s.startsWith("#"))
.map(String::trim)
.map(String::toLowerCase)
.toArray(String[]::new);
}
}
return strings;
}
private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
if (targetDomains.length > 0) {
try {
var dataSource = new DatabaseModule().provideConnection();
var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(dataSource), dataSource);
return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractKnownDomain);
}
catch (ServiceHomeNotConfiguredException ex) {
System.err.println("""
Could not connect to database, running crawl job creation in bootstrap mode.
This means that the crawl job will be created without any knowledge of the domains in the database.
If this is not desirable, ensure that WMSA_HOME is configured and that the database is running.
""");
var domainExtractor = new CrawlJobDomainExtractor(domain -> false, null);
return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractNewDomain);
}
} else {
var ds = new DatabaseModule().provideConnection();
var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
if (targetDomains.length > 0) {
return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);
} else {
return domainExtractor.extractDomainsFromQueue();
}
}
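
Taken together, swapping the concrete `DomainBlacklistImpl` for the `DomainBlacklist` interface is what makes the offline path possible: bootstrap mode can hand the extractor a no-op lambda and a null data source instead of live database objects. Below is a minimal sketch of that path, not part of the commit itself, assuming it lives in the same package as `CrawlJobExtractorMain` and using only the constructors and methods visible in this diff (the domain name is illustrative):

```java
package nu.marginalia.crawl;

import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;

class OfflineSpecSketch {
    public static void main(String[] args) {
        // No-op blacklist (nothing is ever blacklisted) and no data source:
        // this mirrors the ServiceHomeNotConfiguredException fallback above.
        var extractor = new CrawlJobDomainExtractor(domain -> false, null);

        // Produces a bare spec: just the domain, a generated id, the index
        // page URL and the minimum crawl depth; the crawler discovers the rest.
        CrawlingSpecification spec = extractor.extractNewDomain(new EdgeDomain("www.example.com"));

        System.out.println(spec.domain);  // www.example.com
        System.out.println(spec.urls);    // [https://www.example.com/]
    }
}
```

In the online paths, the same constructor instead receives `new DomainBlacklistImpl(dataSource)` and a live `HikariDataSource`, as `streamSpecs` above shows.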