Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
Improved crawl-job-extractor.
Let crawl-job-extractor run offline and allow it to read domains from file. Improved docs.
parent 9455100907
commit 4fc0ddbc45
@ -24,6 +24,7 @@ dependencies {
    implementation project(':code:common:db')
    implementation project(':code:common:model')
    implementation project(':code:common:service')
    implementation project(':code:common:service-discovery')
    implementation project(':code:process-models:crawling-model')

    implementation libs.lombok
@ -4,3 +4,48 @@ The crawl job extractor creates a file containing a list of domains
along with known URLs.

This is consumed by [processes/crawling-process](../../processes/crawling-process).

## Usage

The crawl job extractor has three modes of operation:

```
# 1 grab domains from the database
./crawl-job-extractor file.out

# 2 grab domains from a file
./crawl-job-extractor file.out -f domains.txt

# 3 grab domains from the command line
./crawl-job-extractor file.out domain1 domain2 ...
```

* When only a single argument is passed (the name of the file to write to), it will create a complete
  list of domains and URLs known to the system, drawn from the already indexed domains as well as
  the domains in the CRAWL_QUEUE table in the database.
* When invoked as `./crawl-job-extractor output-file -f domains.txt`, domains will be read from the
  non-blank, non-comment lines of the file (see the example below).
* In other cases, the arguments from the second onward will be interpreted as domain names.
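
For reference, a domains file is plain text with one domain per line; blank lines and lines starting
with `#` are skipped, and each entry is trimmed and lower-cased. A minimal example (the domain names
here are just placeholders):

```
# lines starting with # are comments, blank lines are ignored
www.example.com
blog.example.com
search.example.com
```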

In the last two modes, if the crawl-job-extractor is able to connect to the database, it will use
information from the link database to populate the list of URLs for each domain; otherwise it will
create a spec containing only the domain name and its index address (`https://<domain>/`), so the
crawler will have to figure out the rest.

The crawl-specification is zstd-compressed JSON.
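
For orientation, a single entry in the specification might look roughly like the sketch below. This is
only an illustration based on the `CrawlingSpecification` fields touched by this change (`id`, `domain`,
`urls`, `crawlDepth`); the exact serialized field names, the id format, and the crawl depth value are
assumptions:

```json
{
  "id": "6a1c...",
  "domain": "www.example.com",
  "crawlDepth": 100,
  "urls": [
    "https://www.example.com/"
  ]
}
```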

## Tricks

### Joining two specifications

Two or more specifications can be joined with a shell command of the form

```shell
$ zstdcat file1 file2 | zstd -o new-file
```

### Inspection

The file can also be inspected with `zstdless`,
or with combinations like `zstdcat file | jq`.
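
For example, assuming the spec file is a zstd-compressed stream of JSON objects, one per domain (which
is what makes the concatenation trick above work), the domains it covers could be listed with something
like:

```shell
$ zstdcat file.out | jq .domain
```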
@ -4,8 +4,8 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklistImpl;

import java.sql.ResultSet;
import java.sql.SQLException;
@ -68,11 +68,11 @@ public class CrawlJobDomainExtractor {
            """;

    private final DomainBlacklistImpl blacklist;
    private final DomainBlacklist blacklist;
    private final HikariDataSource dataSource;
    private static final HashFunction hasher = Hashing.murmur3_128(0);

    public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
    public CrawlJobDomainExtractor(DomainBlacklist blacklist, HikariDataSource dataSource) {
        this.blacklist = blacklist;
        this.dataSource = dataSource;
    }
@ -107,7 +107,20 @@ public class CrawlJobDomainExtractor {
                .map(this::createCrawlJobForDomain);
    }

    public CrawlingSpecification extractDomain(EdgeDomain domain) {
    public CrawlingSpecification extractNewDomain(EdgeDomain domain) {
        CrawlingSpecification spec = new CrawlingSpecification();

        spec.domain = domain.toString();
        spec.id = createId(domain);
        spec.urls = new ArrayList<>(1000);

        spec.urls.add("https://"+domain+"/");
        spec.crawlDepth = MIN_VISIT_COUNT;

        return spec;
    }

    public CrawlingSpecification extractKnownDomain(EdgeDomain domain) {
        CrawlingSpecification spec = new CrawlingSpecification();

        spec.domain = domain.toString();
@ -143,6 +156,7 @@ public class CrawlJobDomainExtractor {

        return spec;
    }

    private record DomainWithId(String domainName, int id) {
@ -1,13 +1,16 @@
package nu.marginalia.crawl;

import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklistImpl;
import nu.marginalia.service.ServiceHomeNotConfiguredException;
import nu.marginalia.service.module.DatabaseModule;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Stream;
@ -15,7 +18,7 @@ public class CrawlJobExtractorMain {

    public static void main(String... args) throws IOException {
        if (args.length == 0) {
            System.out.println("Parameters: outputfile.spec [domain1, domain2, ...]");
            System.out.println("Parameters: outputfile.spec [-f domains.txt] | [domain1, domain2, ...]");
            System.out.println();
            System.out.println("If no domains are provided, a full crawl spec is created from the database");
            return;
@ -27,21 +30,61 @@ public class CrawlJobExtractorMain {
            return;
        }

        String[] targetDomains = Arrays.copyOfRange(args, 1, args.length);
        String[] targetDomains = getTargetDomains(Arrays.copyOfRange(args, 1, args.length));

        try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile))
        {
            streamSpecs(targetDomains).forEach(out::accept);
        }

        System.out.println("All done! Wrote " + outFile);
    }

    private static String[] getTargetDomains(String[] strings) throws IOException {
        if (strings.length == 0)
            return strings;

        if (strings.length == 2 && "-f".equals(strings[0])) {
            Path file = Path.of(strings[1]);

            System.out.println("Reading domains from " + file);

            try (var lines = Files.lines(file)) {
                return lines
                        .filter(s -> !s.isBlank())
                        .filter(s -> !s.startsWith("#"))
                        .map(String::trim)
                        .map(String::toLowerCase)
                        .toArray(String[]::new);
            }
        }

        return strings;
    }

    private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
        if (targetDomains.length > 0) {

            try {
                var dataSource = new DatabaseModule().provideConnection();
                var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(dataSource), dataSource);
                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractKnownDomain);
            }
            catch (ServiceHomeNotConfiguredException ex) {
                System.err.println("""
                        Could not connect to database, running crawl job creation in bootstrap mode.
                        This means that the crawl job will be created without any knowledge of the domains in the database.

                        If this is not desirable, ensure that WMSA_HOME is configured and that the database is running.
                        """);

                var domainExtractor = new CrawlJobDomainExtractor(domain -> false, null);
                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractNewDomain);
            }

        } else {
            var ds = new DatabaseModule().provideConnection();
            var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);

            if (targetDomains.length > 0) {
                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);
            } else {
                return domainExtractor.extractDomainsFromQueue();
            }
        }