diff --git a/code/tools/crawl-job-extractor/build.gradle b/code/tools/crawl-job-extractor/build.gradle
index d79d7cb8..0211fbfb 100644
--- a/code/tools/crawl-job-extractor/build.gradle
+++ b/code/tools/crawl-job-extractor/build.gradle
@@ -24,6 +24,7 @@ dependencies {
     implementation project(':code:common:db')
     implementation project(':code:common:model')
     implementation project(':code:common:service')
+    implementation project(':code:common:service-discovery')
     implementation project(':code:process-models:crawling-model')
 
     implementation libs.lombok
diff --git a/code/tools/crawl-job-extractor/readme.md b/code/tools/crawl-job-extractor/readme.md
index ea242ba2..565c8354 100644
--- a/code/tools/crawl-job-extractor/readme.md
+++ b/code/tools/crawl-job-extractor/readme.md
@@ -3,4 +3,49 @@
 The crawl job extractor creates a file containing a list of domains
 along with known URLs.
 
-This is consumed by [processes/crawling-process](../../processes/crawling-process).
\ No newline at end of file
+This is consumed by [processes/crawling-process](../../processes/crawling-process).
+
+## Usage
+
+The crawl job extractor has three modes of operation:
+
+```
+# 1 grab domains from the database
+./crawl-job-extractor file.out
+
+# 2 grab domains from a file
+./crawl-job-extractor file.out -f domains.txt
+
+# 3 grab domains from the command line
+./crawl-job-extractor file.out domain1 domain2 ...
+```
+
+* When only a single argument is passed (the name of the output file), it will create a complete
+  list of domains and URLs known to the system, based on the set of already indexed domains
+  as well as the domains in the CRAWL_QUEUE table in the database.
+* When invoked as `./crawl-job-extractor output-file -f domains.txt`,
+  domains will be read from the non-blank, non-comment lines of the given file.
+* Otherwise, the second argument onward will be interpreted as domain names.
+
+In the last two modes, if the crawl-job-extractor is able to connect to the database, it will use
+information from the link database to populate the list of URLs for each domain; otherwise it will
+create a spec with only the domain name and the index address, and the crawler will have to figure
+out the rest.
+
+The crawl specification is zstd-compressed JSON.
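+
+As a rough sketch, a single record in the spec file might look like the following once
+decompressed. The field names mirror those in `CrawlingSpecification`; the values are
+invented for illustration:
+
+```json
+{
+  "id": "0123456789abcdef",
+  "domain": "www.example.com",
+  "crawlDepth": 100,
+  "urls": [
+    "https://www.example.com/"
+  ]
+}
+```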
+
+## Tricks
+
+### Joining two specifications
+
+Two or more specifications can be joined with a shell command of the form:
+
+```shell
+$ zstdcat file1 file2 | zstd -o new-file
+```
+
+### Inspection
+
+The file can also be inspected with `zstdless`,
+or with combinations like `zstdcat file | jq`.
\ No newline at end of file
diff --git a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
index c7909ac7..253a9bff 100644
--- a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
+++ b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobDomainExtractor.java
@@ -4,8 +4,8 @@
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
+import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.db.DomainBlacklistImpl;
 
 import java.sql.ResultSet;
 import java.sql.SQLException;
@@ -68,11 +68,11 @@ public class CrawlJobDomainExtractor {
             """;
 
-    private final DomainBlacklistImpl blacklist;
+    private final DomainBlacklist blacklist;
     private final HikariDataSource dataSource;
 
     private static final HashFunction hasher = Hashing.murmur3_128(0);
 
-    public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
+    public CrawlJobDomainExtractor(DomainBlacklist blacklist, HikariDataSource dataSource) {
         this.blacklist = blacklist;
         this.dataSource = dataSource;
     }
@@ -107,7 +107,20 @@ public class CrawlJobDomainExtractor {
                 .map(this::createCrawlJobForDomain);
     }
 
-    public CrawlingSpecification extractDomain(EdgeDomain domain) {
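+    /**
+     * Create a bare-bones specification for a domain that is not yet known to
+     * the system: just the domain name, its root URL, and a minimal crawl depth.
+     * This is the fallback used in bootstrap mode, when no database connection
+     * is available.
+     */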
+    public CrawlingSpecification extractNewDomain(EdgeDomain domain) {
+        CrawlingSpecification spec = new CrawlingSpecification();
+
+        spec.domain = domain.toString();
+        spec.id = createId(domain);
+        spec.urls = new ArrayList<>(1000);
+
+        spec.urls.add("https://"+domain+"/");
+        spec.crawlDepth = MIN_VISIT_COUNT;
+
+        return spec;
+    }
+
+    public CrawlingSpecification extractKnownDomain(EdgeDomain domain) {
         CrawlingSpecification spec = new CrawlingSpecification();
 
         spec.domain = domain.toString();
@@ -143,6 +156,7 @@ public class CrawlJobDomainExtractor {
         return spec;
     }
 
+
     private record DomainWithId(String domainName, int id) {
diff --git a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java
index 90bf9326..e8de4de6 100644
--- a/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java
+++ b/code/tools/crawl-job-extractor/src/main/java/nu/marginalia/crawl/CrawlJobExtractorMain.java
@@ -1,13 +1,16 @@
 package nu.marginalia.crawl;
 
+import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.db.DomainBlacklistImpl;
+import nu.marginalia.service.ServiceHomeNotConfiguredException;
 import nu.marginalia.service.module.DatabaseModule;
 
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Stream;
 
@@ -15,7 +18,7 @@ public class CrawlJobExtractorMain {
     public static void main(String... args) throws IOException {
         if (args.length == 0) {
-            System.out.println("Parameters: outputfile.spec [domain1, domain2, ...]");
+            System.out.println("Parameters: outputfile.spec [-f domains.txt] | [domain1, domain2, ...]");
             System.out.println();
             System.out.println("If no domains are provided, a full crawl spec is created from the database");
             return;
         }
@@ -27,21 +30,61 @@ public class CrawlJobExtractorMain {
             return;
         }
 
-        String[] targetDomains = Arrays.copyOfRange(args, 1, args.length);
+        String[] targetDomains = getTargetDomains(Arrays.copyOfRange(args, 1, args.length));
 
         try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile)) {
             streamSpecs(targetDomains).forEach(out::accept);
         }
+
+        System.out.println("All done! Wrote " + outFile);
+    }
+
+    private static String[] getTargetDomains(String[] strings) throws IOException {
+        if (strings.length == 0)
+            return strings;
+
+        if (strings.length == 2 && "-f".equals(strings[0])) {
+            Path file = Path.of(strings[1]);
+
+            System.out.println("Reading domains from " + file);
+
+            try (var lines = Files.lines(file)) {
+                return lines
+                        .filter(s -> !s.isBlank())
+                        .filter(s -> !s.startsWith("#"))
+                        .map(String::trim)
+                        .map(String::toLowerCase)
+                        .toArray(String[]::new);
+            }
+        }
+
+        return strings;
     }
 
     private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
-        var ds = new DatabaseModule().provideConnection();
-        var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
-
         if (targetDomains.length > 0) {
-            return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);
-        }
-        else {
+
+            try {
+                var dataSource = new DatabaseModule().provideConnection();
+                var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(dataSource), dataSource);
+                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractKnownDomain);
+            }
+            catch (ServiceHomeNotConfiguredException ex) {
+                System.err.println("""
+                        Could not connect to database, running crawl job creation in bootstrap mode.
+                        This means that the crawl job will be created without any knowledge of the domains in the database.
+
+                        If this is not desirable, ensure that WMSA_HOME is configured and that the database is running.
+                        """);
+
+                var domainExtractor = new CrawlJobDomainExtractor(domain -> false, null);
+                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractNewDomain);
+            }
+        } else {
+            var ds = new DatabaseModule().provideConnection();
+            var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
             return domainExtractor.extractDomainsFromQueue();
         }
     }
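As a rough smoke test of the new bootstrap path (a sketch, assuming the `crawl-job-extractor`
wrapper script from the readme, `WMSA_HOME` deliberately left unconfigured so that
`ServiceHomeNotConfiguredException` is thrown, and each spec record serialized as a plain
JSON object):

```shell
# stderr should warn about bootstrap mode; stdout should end with "All done! Wrote bootstrap.out"
$ ./crawl-job-extractor bootstrap.out www.example.com

# inspect the result; should print the domain passed on the command line
$ zstdcat bootstrap.out | jq .domain
"www.example.com"
```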