Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
Improved crawl-job-extractor.

Let crawl-job-extractor run offline and allow it to read domains from a file. Improved docs.
parent 9455100907
commit 4fc0ddbc45
@@ -24,6 +24,7 @@ dependencies {
     implementation project(':code:common:db')
     implementation project(':code:common:model')
     implementation project(':code:common:service')
+    implementation project(':code:common:service-discovery')
     implementation project(':code:process-models:crawling-model')

     implementation libs.lombok
@@ -3,4 +3,49 @@
 The crawl job extractor creates a file containing a list of domains
 along with known URLs.

 This is consumed by [processes/crawling-process](../../processes/crawling-process).
+
+## Usage
+
+The crawl job extractor has three modes of operation:
+
+```
+# 1 grab domains from the database
+./crawl-job-extractor file.out
+
+# 2 grab domains from a file
+./crawl-job-extractor file.out -f domains.txt
+
+# 3 grab domains from the command line
+./crawl-job-extractor file.out domain1 domain2 ...
+```
+
+* When only a single argument is passed (the file name to write to), it will create a complete list of domains
+  and URLs known to the system from the list of already indexed domains,
+  as well as domains from the CRAWL_QUEUE table in the database.
+* When invoked as `./crawl-job-extractor output-file -f domains.txt`,
+  domains will be read from the non-blank, non-comment lines of the file.
+* In other cases, the 2nd argument onward will be interpreted as domain names.
+
+In the last two modes, if the crawl-job-extractor is able to connect to the database, it will use
+information from the link database to populate the list of URLs for each domain; otherwise it will
+create a spec with only the domain name and the index address, so the crawler will have to figure out
+the rest.
+
+The crawl specification is zstd-compressed JSON.
+
+## Tricks
+
+### Joining two specifications
+
+Two or more specifications can be joined with a shell command of the form
+
+```shell
+$ zstdcat file1 file2 | zstd -o new-file
+```
+
+### Inspection
+
+The file can also be inspected with `zstdless`,
+or with combinations like `zstdcat file | jq`
@@ -4,8 +4,8 @@ import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
 import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
+import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.db.DomainBlacklistImpl;

 import java.sql.ResultSet;
 import java.sql.SQLException;
@@ -68,11 +68,11 @@ public class CrawlJobDomainExtractor {
            """;


-    private final DomainBlacklistImpl blacklist;
+    private final DomainBlacklist blacklist;
     private final HikariDataSource dataSource;
     private static final HashFunction hasher = Hashing.murmur3_128(0);

-    public CrawlJobDomainExtractor(DomainBlacklistImpl blacklist, HikariDataSource dataSource) {
+    public CrawlJobDomainExtractor(DomainBlacklist blacklist, HikariDataSource dataSource) {
         this.blacklist = blacklist;
         this.dataSource = dataSource;
     }
@@ -107,7 +107,20 @@ public class CrawlJobDomainExtractor {
                 .map(this::createCrawlJobForDomain);
     }

-    public CrawlingSpecification extractDomain(EdgeDomain domain) {
+    public CrawlingSpecification extractNewDomain(EdgeDomain domain) {
+        CrawlingSpecification spec = new CrawlingSpecification();
+
+        spec.domain = domain.toString();
+        spec.id = createId(domain);
+        spec.urls = new ArrayList<>(1000);
+
+        spec.urls.add("https://"+domain+"/");
+        spec.crawlDepth = MIN_VISIT_COUNT;
+
+        return spec;
+    }
+
+    public CrawlingSpecification extractKnownDomain(EdgeDomain domain) {
         CrawlingSpecification spec = new CrawlingSpecification();

         spec.domain = domain.toString();
@@ -143,6 +156,7 @@ public class CrawlJobDomainExtractor {

         return spec;
     }

     private record DomainWithId(String domainName, int id) {
+

@@ -1,13 +1,16 @@
 package nu.marginalia.crawl;

+import com.zaxxer.hikari.HikariDataSource;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.db.DomainBlacklistImpl;
+import nu.marginalia.service.ServiceHomeNotConfiguredException;
 import nu.marginalia.service.module.DatabaseModule;

 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Stream;

@@ -15,7 +18,7 @@ public class CrawlJobExtractorMain {

     public static void main(String... args) throws IOException {
         if (args.length == 0) {
-            System.out.println("Parameters: outputfile.spec [domain1, domain2, ...]");
+            System.out.println("Parameters: outputfile.spec [-f domains.txt] | [domain1, domain2, ...]");
             System.out.println();
             System.out.println("If no domains are provided, a full crawl spec is created from the database");
             return;
@@ -27,21 +30,61 @@ public class CrawlJobExtractorMain {
             return;
         }

-        String[] targetDomains = Arrays.copyOfRange(args, 1, args.length);
+        String[] targetDomains = getTargetDomains(Arrays.copyOfRange(args, 1, args.length));

         try (CrawlJobSpecWriter out = new CrawlJobSpecWriter(outFile))
         {
             streamSpecs(targetDomains).forEach(out::accept);
         }
+
+        System.out.println("All done! Wrote " + outFile);
+    }
+
+    private static String[] getTargetDomains(String[] strings) throws IOException {
+        if (strings.length == 0)
+            return strings;
+
+        if (strings.length == 2 && "-f".equals(strings[0])) {
+            Path file = Path.of(strings[1]);
+
+            System.out.println("Reading domains from " + file);
+
+            try (var lines = Files.lines(file)) {
+                return lines
+                        .filter(s -> !s.isBlank())
+                        .filter(s -> !s.startsWith("#"))
+                        .map(String::trim)
+                        .map(String::toLowerCase)
+                        .toArray(String[]::new);
+            }
+        }
+
+        return strings;
     }

     private static Stream<CrawlingSpecification> streamSpecs(String[] targetDomains) {
-        var ds = new DatabaseModule().provideConnection();
-        var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);

         if (targetDomains.length > 0) {
-            return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractDomain);
+            try {
+                var dataSource = new DatabaseModule().provideConnection();
+                var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(dataSource), dataSource);
+                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractKnownDomain);
+            }
+            catch (ServiceHomeNotConfiguredException ex) {
+                System.err.println("""
+                        Could not connect to database, running crawl job creation in bootstrap mode.
+                        This means that the crawl job will be created without any knowledge of the domains in the database.
+
+                        If this is not desirable, ensure that WMSA_HOME is configured and that the database is running.
+                        """);
+
+                var domainExtractor = new CrawlJobDomainExtractor(domain -> false, null);
+                return Arrays.stream(targetDomains).map(EdgeDomain::new).map(domainExtractor::extractNewDomain);
+            }
+
         } else {
+            var ds = new DatabaseModule().provideConnection();
+            var domainExtractor = new CrawlJobDomainExtractor(new DomainBlacklistImpl(ds), ds);
             return domainExtractor.extractDomainsFromQueue();
         }
     }
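A sketch of the new offline path introduced here: when the database is unreachable, `streamSpecs` catches `ServiceHomeNotConfiguredException`, substitutes a no-op blacklist (`domain -> false`), and builds bare specs via `extractNewDomain`, each seeded with only `https://<domain>/`. A hypothetical session, assuming WMSA_HOME is unconfigured and with made-up domain names; the output shown is approximate:

```shell
# Assumption: WMSA_HOME is unset, so DatabaseModule cannot provide a connection
# and the tool falls back to bootstrap mode for the listed domains.
$ ./crawl-job-extractor file.out www.example.com www.example.org
Could not connect to database, running crawl job creation in bootstrap mode.
...
All done! Wrote file.out
```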