Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(crawler) Clean up the crawler code a bit, removing vestigial abstractions and historical debris
Commit 7305afa0f8 (parent 481f999b70)
@@ -7,12 +7,12 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -77,7 +77,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testInvalidDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("invalid.invalid.invalid", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10);

         CrawledDomain crawlData = crawl(specs);

@@ -93,7 +93,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testRedirectingDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("memex.marginalia.nu", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10);

         CrawledDomain crawlData = crawl(specs);

@@ -112,7 +112,7 @@ public class CrawlingThenConvertingIntegrationTest {
     @Test
     public void testBlockedDomain() throws IOException {
         // Attempt to fetch an invalid domain
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("search.marginalia.nu", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10);

         CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything

@@ -128,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlSunnyDay() throws IOException {
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("www.marginalia.nu", 10);
+        var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10);

         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
@@ -161,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlContentTypes() throws IOException {
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("www.marginalia.nu", 10,
+        var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10,
                 List.of(
                         "https://www.marginalia.nu/sanic.png",
                         "https://www.marginalia.nu/invalid"
@@ -199,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
     public void crawlRobotsTxt() throws IOException {
-        var specs = new CrawlSpecProvider.CrawlSpecRecord("search.marginalia.nu", 5,
+        var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
                 List.of("https://search.marginalia.nu/search?q=hello+world")
         );

@@ -238,11 +238,11 @@ public class CrawlingThenConvertingIntegrationTest {
             return null; // unreachable
         }
     }
-    private CrawledDomain crawl(CrawlSpecProvider.CrawlSpecRecord specs) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException {
         return crawl(specs, domain -> true);
     }

-    private CrawledDomain crawl(CrawlSpecProvider.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
         List<SerializableCrawlData> data = new ArrayList<>();

         try (var recorder = new WarcRecorder(fileName)) {
@@ -1,46 +0,0 @@
-package nu.marginalia.crawl;
-
-
-import lombok.SneakyThrows;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-
-public class AbortMonitor {
-    private volatile boolean abort = false;
-    private static volatile AbortMonitor instance = null;
-    private static final Logger logger = LoggerFactory.getLogger(AbortMonitor.class);
-
-    public static AbortMonitor getInstance() {
-        if (instance == null) {
-            synchronized (AbortMonitor.class) {
-                if (instance == null) {
-                    instance = new AbortMonitor();
-                    new Thread(instance::run, "AbortMon").start();
-                }
-            }
-        }
-        return instance;
-    }
-
-    private AbortMonitor() {
-    }
-
-    @SneakyThrows
-    public void run() {
-        for (;;) {
-            Thread.sleep(1000);
-            if (Files.exists(Path.of("/tmp/stop"))) {
-                logger.warn("Abort file found");
-                abort = true;
-                Files.delete(Path.of("/tmp/stop"));
-            }
-        }
-    }
-
-    public boolean isAlive() {
-        return !abort;
-    }
-}
@@ -4,10 +4,13 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import com.zaxxer.hikari.HikariDataSource;
+import lombok.Builder;
 import nu.marginalia.ProcessConfiguration;
 import nu.marginalia.ProcessConfigurationModule;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -16,9 +19,9 @@ import nu.marginalia.crawl.logic.DomainLocks;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
+import nu.marginalia.db.DomainBlacklist;
 import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.CrawlerOutputFile;
 import nu.marginalia.model.EdgeDomain;
@@ -35,6 +38,7 @@ import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
+import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -44,10 +48,7 @@ import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.security.Security;
 import java.sql.SQLException;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.UUID;
+import java.util.*;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -62,22 +63,28 @@ public class CrawlerMain extends ProcessMainClass {
     private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
     private final FileStorageService fileStorageService;
-    private final CrawlSpecProvider crawlSpecProvider;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final WarcArchiverFactory warcArchiverFactory;
+    private final HikariDataSource dataSource;
+    private final DomainBlacklist blacklist;
     private final Gson gson;
     private final int node;
     private final SimpleBlockingThreadPool pool;

     private final DomainLocks domainLocks = new DomainLocks();

-    private final Map<String, String> processingIds = new ConcurrentHashMap<>();
+    private final Map<String, CrawlTask> pendingCrawlTasks = new ConcurrentHashMap<>();

-    private final AbortMonitor abortMonitor = AbortMonitor.getInstance();
     private final AtomicInteger tasksDone = new AtomicInteger(0);
     private final HttpFetcherImpl fetcher;

-    private volatile int totalTasks;
+    private int totalTasks = 1;
+
+    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
+    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
+    private static final int MID_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 2_000);
+    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);

     @Inject
     public CrawlerMain(UserAgent userAgent,
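The crawl-budget tuning knobs added above are ordinary JVM system properties with hard-coded fallbacks, so they can be overridden per run with -D flags; note that, as shown in the hunk, MID_URLS_PER_DOMAIN reads the same crawler.minUrlsPerDomain key as MIN_URLS_PER_DOMAIN (carried over verbatim from the deleted CrawlSpecProvider). A minimal, hypothetical stand-alone sketch of the same pattern — the class and field names below are illustrative, not part of the commit:

    public class CrawlTuningSketch {
        // Run with e.g.: java -Dcrawler.crawlSetGrowthFactor=1.5 -Dcrawler.maxUrlsPerDomain=20000 CrawlTuningSketch
        static final double GROWTH_FACTOR =
                Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
        static final int MAX_URLS_PER_DOMAIN =
                Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);

        public static void main(String[] args) {
            // Prints the effective values, falling back to the defaults when no -D flag is given
            System.out.printf("growth=%.2f, maxUrlsPerDomain=%d%n", GROWTH_FACTOR, MAX_URLS_PER_DOMAIN);
        }
    }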
@@ -85,18 +92,20 @@ public class CrawlerMain extends ProcessMainClass {
                        MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,
                        ProcessConfiguration processConfiguration,
-                       CrawlSpecProvider crawlSpecProvider,
                        AnchorTagsSourceFactory anchorTagsSourceFactory,
                        WarcArchiverFactory warcArchiverFactory,
-                       Gson gson) {
+                       HikariDataSource dataSource,
+                       DomainBlacklist blacklist,
+                       Gson gson) throws InterruptedException {
         this.userAgent = userAgent;
         this.heartbeat = heartbeat;
         this.messageQueueFactory = messageQueueFactory;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
-        this.crawlSpecProvider = crawlSpecProvider;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.warcArchiverFactory = warcArchiverFactory;
+        this.dataSource = dataSource;
+        this.blacklist = blacklist;
         this.gson = gson;
         this.node = processConfiguration.node();

@@ -108,15 +117,13 @@ public class CrawlerMain extends ProcessMainClass {
                 new Dispatcher(),
                 new ConnectionPool(5, 10, TimeUnit.SECONDS)
         );
+
+        // Wait for the blacklist to be loaded before starting the crawl
+        blacklist.waitUntilLoaded();
     }

     public static void main(String... args) throws Exception {

-        if (!AbortMonitor.getInstance().isAlive()) {
-            System.err.println("Remove abort file first");
-            return;
-        }
-
         // Prevent Java from caching DNS lookups forever (filling up the system RAM as a result)
         Security.setProperty("networkaddress.cache.ttl" , "3600");

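The constructor now blocks on blacklist.waitUntilLoaded() before any crawling starts, which is why it gains the throws InterruptedException clause. The DomainBlacklist implementation is not part of this diff; the sketch below is only a hypothetical illustration of the contract such a method typically provides — an asynchronously loaded set guarded by a latch:

    import java.util.Set;
    import java.util.concurrent.CountDownLatch;

    // Hypothetical stand-in for DomainBlacklist; not the project's actual implementation.
    class AsyncBlacklistSketch {
        private final CountDownLatch loaded = new CountDownLatch(1);
        private volatile Set<Integer> blacklistedIds = Set.of();

        // Called by a background loader thread once the data is in memory
        void finishLoading(Set<Integer> ids) {
            blacklistedIds = ids;
            loaded.countDown();
        }

        // What the constructor relies on: block until the data is usable
        void waitUntilLoaded() throws InterruptedException {
            loaded.await();
        }

        boolean isBlacklisted(int domainId) {
            return blacklistedIds.contains(domainId);
        }
    }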
@@ -144,7 +151,7 @@ public class CrawlerMain extends ProcessMainClass {
                 crawler.runForSingleDomain(instructions.targetDomainName, instructions.outputDir);
             }
             else {
-                crawler.run(instructions.outputDir);
+                crawler.runForDatabaseDomains(instructions.outputDir);
             }
             instructions.ok();
         } catch (Exception ex) {
@@ -160,34 +167,99 @@ public class CrawlerMain extends ProcessMainClass {
         System.exit(0);
     }

-    public void run(Path outputDir) throws Exception {
+    public void runForDatabaseDomains(Path outputDir) throws Exception {

         heartbeat.start();

+        logger.info("Loading domains to be crawled");
+
+        final List<CrawlSpecRecord> crawlSpecRecords = new ArrayList<>();
+        final List<EdgeDomain> domainsToCrawl = new ArrayList<>();
+
+        // Assign any domains with node_affinity=0 to this node, and then fetch all domains assigned to this node
+        // to be crawled.
+
+        try (var conn = dataSource.getConnection()) {
+            try (var assignFreeDomains = conn.prepareStatement(
+                    """
+                    UPDATE EC_DOMAIN
+                    SET NODE_AFFINITY=?
+                    WHERE NODE_AFFINITY=0
+                    """))
+            {
+                // Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
+                // to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
+                assignFreeDomains.setInt(1, node);
+                assignFreeDomains.executeUpdate();
+            }
+
+            try (var query = conn.prepareStatement("""
+                    SELECT DOMAIN_NAME, COALESCE(VISITED_URLS, 0), EC_DOMAIN.ID
+                    FROM EC_DOMAIN
+                    LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
+                    WHERE NODE_AFFINITY=?
+                    """)) {
+                // Fetch the domains to be crawled
+                query.setInt(1, node);
+                query.setFetchSize(10_000);
+                var rs = query.executeQuery();
+
+                while (rs.next()) {
+                    // Skip blacklisted domains
+                    int domainId = rs.getInt(3);
+                    if (blacklist.isBlacklisted(domainId))
+                        continue;
+
+                    int existingUrls = rs.getInt(2);
+                    String domainName = rs.getString(1);
+
+                    domainsToCrawl.add(new EdgeDomain(domainName));
+                    crawlSpecRecords.add(CrawlSpecRecord.growExistingDomain(domainName, existingUrls));
+                    totalTasks++;
+                }
+            }
+        }
+
+        logger.info("Loaded {} domains", crawlSpecRecords.size());
+
+        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
+        // so that e.g. the big domains don't get all crawled at once, or we end up
+        // crawling the same server in parallel from different subdomains...
+        Collections.shuffle(crawlSpecRecords);
+
         // First a validation run to ensure the file is all good to parse
-        totalTasks = crawlSpecProvider.totalCount();
-        if (totalTasks == 0) {
+        if (crawlSpecRecords.isEmpty()) {
             // This is an error state, and we should make noise about it
             throw new IllegalStateException("No crawl tasks found, refusing to continue");
         }
-        logger.info("Queued {} crawl tasks, let's go", totalTasks);
+        else {
+            logger.info("Queued {} crawl tasks, let's go", crawlSpecRecords.size());
+        }
+
+        // Set up the work log and the warc archiver so we can keep track of what we've done
         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
-             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(crawlSpecProvider.getDomains())
+             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl)
        ) {
            // Set the number of tasks done to the number of tasks that are already finished,
            // (this happens when the process is restarted after a crash or a shutdown)
            tasksDone.set(workLog.countFinishedJobs());

-           // Process the crawl tasks
-           try (var specStream = crawlSpecProvider.stream()) {
-               specStream
-                   .takeWhile((e) -> abortMonitor.isAlive())
-                   .filter(e -> !workLog.isJobFinished(e.domain()))
-                   .filter(e -> processingIds.put(e.domain(), "") == null)
-                   .map(e -> new CrawlTask(e, anchorTagsSource, outputDir, warcArchiver, workLog))
-                   .forEach(pool::submitQuietly);
+           // Create crawl tasks and submit them to the pool for execution
+           for (CrawlSpecRecord crawlSpec : crawlSpecRecords) {
+               if (workLog.isJobFinished(crawlSpec.domain()))
+                   continue;
+
+               var task = new CrawlTask(
+                       crawlSpec,
+                       anchorTagsSource,
+                       outputDir,
+                       warcArchiver,
+                       workLog);
+
+               if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
+                   pool.submitQuietly(task);
+               }
           }

           logger.info("Shutting down the pool, waiting for tasks to complete...");
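The stream pipeline that used processingIds.put(domain, "") == null as a claim check is replaced above by an explicit loop that registers tasks in pendingCrawlTasks with putIfAbsent. A small self-contained sketch of that submit-once idiom, with hypothetical names and a plain Runnable standing in for CrawlTask:

    import java.util.List;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    public class SubmitOnceSketch {
        public static void main(String[] args) {
            Map<String, Runnable> pending = new ConcurrentHashMap<>();

            for (String domain : List.of("a.example", "b.example", "a.example")) {
                Runnable task = () -> System.out.println("crawl " + domain);

                // putIfAbsent returns null only for the first registration of a key,
                // so the duplicate "a.example" entry is silently skipped.
                if (pending.putIfAbsent(domain, task) == null) {
                    task.run(); // the real code hands the task to pool.submitQuietly(task)
                }
            }
        }
    }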
@@ -222,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
             WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
             AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
        ) {
-            var spec = new CrawlSpecProvider.CrawlSpecRecord(targetDomainName, 1000, List.of());
+            var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
             var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
             task.run();
         }
@@ -234,9 +306,9 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }

-    class CrawlTask implements SimpleBlockingThreadPool.Task {
+    private class CrawlTask implements SimpleBlockingThreadPool.Task {

-        private final CrawlSpecProvider.CrawlSpecRecord specification;
+        private final CrawlSpecRecord specification;

         private final String domain;
         private final String id;
@@ -246,7 +318,7 @@ public class CrawlerMain extends ProcessMainClass {
         private final WarcArchiverIf warcArchiver;
         private final WorkLog workLog;

-        CrawlTask(CrawlSpecProvider.CrawlSpecRecord specification,
+        CrawlTask(CrawlSpecRecord specification,
                   AnchorTagsSource anchorTagsSource,
                   Path outputDir,
                   WarcArchiverIf warcArchiver,
@@ -269,6 +341,8 @@ public class CrawlerMain extends ProcessMainClass {
             Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);
             Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain);

+            // Move the WARC file to a temp file if it exists, so we can resume the crawl using the old data
+            // while writing to the same file name as before
             if (Files.exists(newWarcFile)) {
                 Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
             }
@@ -276,31 +350,29 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }

-            var domainLock = domainLocks.getSemaphore(new EdgeDomain(specification.domain()));
-
             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
-                 CrawlDataReference reference = getReference())
+                 CrawlDataReference reference = getReference();
+            )
             {
-                // acquire the domain lock to prevent other threads from crawling the same domain,
-                // we release it at the end of the task to let them go ahead
-                Thread.currentThread().setName("crawling:" + domain + " [await domain lock]");
-                domainLock.acquire();
-                Thread.currentThread().setName("crawling:" + domain);
-
-                var domainLinks = anchorTagsSource.getAnchorTags(domain);
-
+                // Resume the crawl if it was aborted
                 if (Files.exists(tempFile)) {
                     retriever.syncAbortedRun(tempFile);
                     Files.delete(tempFile);
                 }

-                int size = retriever.crawlDomain(domainLinks, reference);
+                DomainLinks domainLinks = anchorTagsSource.getAnchorTags(domain);
+
+                int size;
+                try (var lock = domainLocks.lockDomain(new EdgeDomain(domain))) {
+                    size = retriever.crawlDomain(domainLinks, reference);
+                }

                 // Delete the reference crawl data if it's not the same as the new one
                 // (mostly a case when migrating from legacy->warc)
                 reference.delete();

+                // Convert the WARC file to Parquet
                 CrawledDocumentParquetRecordFileWriter
                         .convertWarc(domain, userAgent, newWarcFile, parquetFile);

@@ -308,7 +380,10 @@ public class CrawlerMain extends ProcessMainClass {
                 // otherwise delete it:
                 warcArchiver.consumeWarc(newWarcFile, domain);

+                // Mark the domain as finished in the work log
                 workLog.setJobToFinished(domain, parquetFile.toString(), size);
+
+                // Update the progress bar
                 heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);

                 logger.info("Fetched {}", domain);
@@ -316,11 +391,8 @@ public class CrawlerMain extends ProcessMainClass {
                 logger.error("Error fetching domain " + domain, e);
             }
             finally {
-                // release the domain lock to permit other threads to crawl subdomains of this domain
-                domainLock.release();
-
                 // We don't need to double-count these; it's also kept int he workLog
-                processingIds.remove(domain);
+                pendingCrawlTasks.remove(domain);
                 Thread.currentThread().setName("[idle]");

                 Files.deleteIfExists(newWarcFile);
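One small detail worth calling out: totalTasks is now a plain int initialized to 1 (see the field hunk earlier) and incremented as domains are loaded, which — presumably — keeps the progress division above well-defined even before loading has finished. A tiny sketch of that calculation, with the hypothetical value shown only for illustration:

    import java.util.concurrent.atomic.AtomicInteger;

    public class ProgressSketch {
        public static void main(String[] args) {
            AtomicInteger tasksDone = new AtomicInteger(0);
            int totalTasks = 1; // matches the new field initializer; grows while domains are loaded

            // heartbeat.setProgress(...) receives a finite ratio instead of dividing by zero
            double progress = tasksDone.incrementAndGet() / (double) totalTasks;
            System.out.println(progress);
        }
    }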
@@ -379,12 +451,11 @@ public class CrawlerMain extends ProcessMainClass {
             var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));

             var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class);
-            var crawlData = fileStorageService.getStorage(request.crawlStorage);
+            var crawlStorage = fileStorageService.getStorage(request.crawlStorage);

             return new CrawlRequest(
                     request.targetDomainName,
-                    crawlData.asPath(),
+                    crawlStorage.asPath(),
                     msg,
                     inbox);
         }
@@ -404,4 +475,25 @@ public class CrawlerMain extends ProcessMainClass {
             }
         }

+    @Builder
+    public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List<String> urls) {
+
+        public CrawlSpecRecord(String domain, int crawlDepth) {
+            this(domain, crawlDepth, List.of());
+        }
+
+        public static CrawlSpecRecord growExistingDomain(String domain, int visitedUrls) {
+            // Calculate the number of URLs to fetch for this domain, based on the number of URLs
+            // already fetched, and a growth factor that gets a bonus for small domains
+            return new CrawlSpecRecord(domain,
+                    (int) Math.clamp(
+                            (visitedUrls * (visitedUrls < MID_URLS_PER_DOMAIN
+                                    ? Math.max(2.5, URL_GROWTH_FACTOR)
+                                    : URL_GROWTH_FACTOR)
+                            ),
+                            MIN_URLS_PER_DOMAIN,
+                            MAX_URLS_PER_DOMAIN));
+        }
+
+    }
 }
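With the default property values (growth factor 1.25, floor 100, knee 2 000, ceiling 10 000), growExistingDomain gives small domains a 2.5x budget and large ones 1.25x, clamped at both ends. A stand-alone re-statement of that formula with a few worked values; the constants are inlined here for clarity, whereas the real code reads them from system properties:

    public class GrowthFormulaSketch {
        static final double GROWTH = 1.25;
        static final int MIN = 100, MID = 2_000, MAX = 10_000;

        // Mirrors CrawlSpecRecord.growExistingDomain with the default settings
        static int urlsToFetch(int visitedUrls) {
            double factor = visitedUrls < MID ? Math.max(2.5, GROWTH) : GROWTH;
            return (int) Math.clamp(visitedUrls * factor, MIN, MAX);
        }

        public static void main(String[] args) {
            System.out.println(urlsToFetch(30));     // 75     -> clamped up to 100
            System.out.println(urlsToFetch(800));    // 800 * 2.5  = 2000
            System.out.println(urlsToFetch(5_000));  // 5000 * 1.25 = 6250
            System.out.println(urlsToFetch(20_000)); // 25000  -> clamped down to 10000
        }
    }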
@@ -18,8 +18,9 @@ public class DomainLocks {
     /** Returns a lock object corresponding to the given domain. The object is returned as-is,
      * and may be held by another thread. The caller is responsible for locking and releasing the lock.
      */
-    public Semaphore getSemaphore(EdgeDomain domain) {
-        return locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits);
+    public DomainLock lockDomain(EdgeDomain domain) throws InterruptedException {
+        return new DomainLock(domain.toString(),
+                locks.computeIfAbsent(domain.topDomain.toLowerCase(), this::defaultPermits));
     }

     private Semaphore defaultPermits(String topDomain) {
@@ -42,4 +43,24 @@ public class DomainLocks {

         return new Semaphore(2);
     }
+
+    public static class DomainLock implements AutoCloseable {
+        private final String domainName;
+        private final Semaphore semaphore;
+
+        DomainLock(String domainName, Semaphore semaphore) throws InterruptedException {
+            this.domainName = domainName;
+            this.semaphore = semaphore;
+
+            Thread.currentThread().setName("crawling:" + domainName + " [await domain lock]");
+            semaphore.acquire();
+            Thread.currentThread().setName("crawling:" + domainName);
+        }
+
+        @Override
+        public void close() throws Exception {
+            semaphore.release();
+            Thread.currentThread().setName("crawling:" + domainName + " [wrapping up]");
+        }
+    }
 }
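The net effect of the new DomainLock class is that the per-top-domain semaphore is acquired and released through try-with-resources rather than through a manual acquire()/release() pair spread across a try block and a finally clause (compare the CrawlTask hunks above). A short usage sketch, assuming the DomainLocks/DomainLock classes from this commit and nu.marginalia.model.EdgeDomain; crawlBody stands in for the real fetch logic:

    // Hedged usage sketch, not code from the commit itself.
    void crawlWithLock(DomainLocks domainLocks, EdgeDomain domain, Runnable crawlBody) throws Exception {
        try (DomainLocks.DomainLock lock = domainLocks.lockDomain(domain)) {
            // The permit for the domain's top domain is held here; other threads trying to
            // crawl sibling subdomains of the same top domain block inside lockDomain().
            crawlBody.run();
        }
        // DomainLock.close() releases the permit even if crawlBody throws.
    }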
@@ -36,6 +36,10 @@ public class CrawlDataReference implements AutoCloseable {
         }
     }

+    /** Get the next document from the crawl data,
+     * returning null when there are no more documents
+     * available
+     */
     @Nullable
     public CrawledDocument nextDocument() {
         try {
@@ -52,7 +56,7 @@ public class CrawlDataReference implements AutoCloseable {
             return null;
         }

-    public boolean isContentBodySame(String one, String other) {
+    public static boolean isContentBodySame(String one, String other) {

         final long contentHashOne = contentHash(one);
         final long contentHashOther = contentHash(other);
@@ -60,7 +64,7 @@ public class CrawlDataReference implements AutoCloseable {
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }

-    private long contentHash(String content) {
+    private static long contentHash(String content) {
         EasyLSH hash = new EasyLSH();
         int next = 0;

@@ -83,8 +87,8 @@ public class CrawlDataReference implements AutoCloseable {
         return hash.get();
     }

-    private final HashFunction hashFunction = Hashing.murmur3_128();
-    private int hashInt(int v) {
+    private static final HashFunction hashFunction = Hashing.murmur3_128();
+    private static int hashInt(int v) {
         return hashFunction.hashInt(v).asInt();
     }

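Making isContentBodySame (and the hashing helpers behind it) static means callers can compare two document bodies without holding a CrawlDataReference instance, which is exactly what the DocumentWithReference hunk further down relies on. A one-line usage sketch, with hypothetical body variables:

    // true when the two bodies hash to locality-sensitive fingerprints within
    // Hamming distance 4 of each other, i.e. the document is effectively unchanged
    boolean unchanged = CrawlDataReference.isContentBodySame(previousBody, freshlyFetchedBody);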
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -11,7 +12,6 @@ import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -54,7 +54,7 @@ public class CrawlerRetreiver implements AutoCloseable {

     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
-                            CrawlSpecProvider.CrawlSpecRecord specs,
+                            CrawlerMain.CrawlSpecRecord specs,
                             WarcRecorder warcRecorder)
     {
         this.warcRecorder = warcRecorder;
@@ -117,9 +117,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         sniffRootDocument(rootUrl, delayTimer);

         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
-
-        if (fetchedCount > 0) {
+        if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
             crawlFrontier.increaseDepth(1.5, 2500);
         }
@@ -162,9 +160,7 @@ public class CrawlerRetreiver implements AutoCloseable {
                 continue;

             try {
-                if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
-                    fetchedCount++;
-                }
+                fetchContentWithReference(top, delayTimer, DocumentWithReference.empty());
             }
             catch (InterruptedException ex) {
                 Thread.currentThread().interrupt();
@@ -172,7 +168,7 @@ public class CrawlerRetreiver implements AutoCloseable {
             }
         }

-        return fetchedCount;
+        return crawlFrontier.visitedSize();
     }

     public void syncAbortedRun(Path warcFile) {
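crawlDomain no longer maintains its own fetchedCount; it reports the size of the frontier's visited set instead (see the DomainCrawlFrontier hunk that follows). A minimal sketch of that accounting idea, using a plain HashSet in place of the project's frontier:

    import java.util.HashSet;
    import java.util.Set;

    public class VisitedCountSketch {
        private final Set<String> visited = new HashSet<>();

        boolean addVisited(String url) {
            return visited.add(url); // false if we had already seen this URL
        }

        // counterpart of DomainCrawlFrontier.visitedSize() in the hunk below
        int visitedSize() {
            return visited.size();
        }

        public static void main(String[] args) {
            var sketch = new VisitedCountSketch();
            sketch.addVisited("https://www.marginalia.nu/");
            sketch.addVisited("https://www.marginalia.nu/log/");
            sketch.addVisited("https://www.marginalia.nu/"); // duplicate, not counted twice
            System.out.println(sketch.visitedSize()); // 2
        }
    }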
@@ -165,7 +165,7 @@ public class DomainCrawlFrontier {
     public int queueSize() {
         return queue.size();
     }
+    public int visitedSize() { return visited.size(); }

     public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) {
         baseUrl = linkParser.getBaseLink(parsed, baseUrl);
@@ -42,7 +42,7 @@ public record DocumentWithReference(
             return false;
         }

-        return reference.isContentBodySame(doc.documentBody, bodyOk.body());
+        return CrawlDataReference.isContentBodySame(doc.documentBody, bodyOk.body());
     }

     public ContentTags getContentTags() {
@@ -1,137 +0,0 @@
-package nu.marginalia.crawl.spec;
-
-import com.google.inject.Inject;
-import com.zaxxer.hikari.HikariDataSource;
-import lombok.Builder;
-import lombok.SneakyThrows;
-import nu.marginalia.ProcessConfiguration;
-import nu.marginalia.db.DomainBlacklist;
-import nu.marginalia.model.EdgeDomain;
-import org.jetbrains.annotations.NotNull;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import java.util.stream.Stream;
-
-// FIXME: This design is a vestige from when there were multiple sources of crawl data. It should be simplified and probably merged with CrawlerMain.
-public class CrawlSpecProvider {
-    private final HikariDataSource dataSource;
-    private final ProcessConfiguration processConfiguration;
-    private final DomainBlacklist blacklist;
-
-    private List<CrawlSpecRecord> domains;
-
-    private static final Logger logger = LoggerFactory.getLogger(CrawlSpecProvider.class);
-
-    private static final double URL_GROWTH_FACTOR = Double.parseDouble(System.getProperty("crawler.crawlSetGrowthFactor", "1.25"));
-    private static final int MIN_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 100);
-    private static final int MID_URLS_PER_DOMAIN = Integer.getInteger("crawler.minUrlsPerDomain", 2_000);
-    private static final int MAX_URLS_PER_DOMAIN = Integer.getInteger("crawler.maxUrlsPerDomain", 10_000);
-
-    @Inject
-    public CrawlSpecProvider(HikariDataSource dataSource,
-                             ProcessConfiguration processConfiguration,
-                             DomainBlacklist blacklist
-                             ) {
-        this.dataSource = dataSource;
-        this.processConfiguration = processConfiguration;
-        this.blacklist = blacklist;
-    }
-
-    // Load the domains into memory to ensure the crawler is resilient to database blips
-    private List<CrawlSpecRecord> loadData() throws Exception {
-        var domains = new ArrayList<CrawlSpecRecord>();
-
-        logger.info("Loading domains to be crawled");
-
-        blacklist.waitUntilLoaded();
-
-        try (var conn = dataSource.getConnection();
-             var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
-             var query = conn.prepareStatement("""
                     SELECT DOMAIN_NAME, COALESCE(VISITED_URLS, 0), EC_DOMAIN.ID
                     FROM EC_DOMAIN
                     LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                     WHERE NODE_AFFINITY=?
                     """)
-        )
-        {
-
-            // Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
-            // to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
-            assignFreeDomains.setInt(1, processConfiguration.node());
-            assignFreeDomains.executeUpdate();
-
-            // Fetch the domains to be crawled
-            query.setInt(1, processConfiguration.node());
-            query.setFetchSize(10_000);
-            var rs = query.executeQuery();
-
-            while (rs.next()) {
-                // Skip blacklisted domains
-                int id = rs.getInt(3);
-                if (blacklist.isBlacklisted(id))
-                    continue;
-
-                int urls = rs.getInt(2);
-
-                double growthFactor = urls < MID_URLS_PER_DOMAIN
-                        ? Math.max(2.5, URL_GROWTH_FACTOR)
-                        : URL_GROWTH_FACTOR;
-
-                int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
-
-                var record = new CrawlSpecRecord(
-                        rs.getString(1),
-                        urlsToFetch,
-                        List.of()
-                );
-
-                domains.add(record);
-            }
-
-        }
-
-        logger.info("Loaded {} domains", domains.size());
-
-        // Shuffle the domains to ensure we get a good mix of domains in each crawl,
-        // so that e.g. the big domains don't get all crawled at once, or we end up
-        // crawling the same server in parallel from different subdomains...
-        Collections.shuffle(domains);
-
-        return domains;
-    }
-
-    public List<EdgeDomain> getDomains() {
-        return stream().map(CrawlSpecRecord::domain).map(EdgeDomain::new).toList();
-    }
-
-    public int totalCount() throws Exception {
-        if (domains == null) {
-            domains = loadData();
-        }
-        return domains.size();
-    }
-
-    @SneakyThrows
-    public Stream<CrawlSpecRecord> stream() {
-        if (domains == null) {
-            domains = loadData();
-        }
-
-        return domains.stream();
-    }
-
-
-    @Builder
-    public record CrawlSpecRecord(@NotNull String domain,
-                                  int crawlDepth,
-                                  @NotNull List<String> urls) {
-        public CrawlSpecRecord(String domain, int crawlDepth) {
-            this(domain, crawlDepth, List.of());
-        }
-    }
-}
@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;

 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -9,7 +10,6 @@ import nu.marginalia.crawl.fetcher.SitemapRetriever;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -68,7 +68,7 @@ public class CrawlerMockFetcherTest {

     }

-    void crawl(CrawlSpecProvider.CrawlSpecRecord spec) throws IOException {
+    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException {
         try (var recorder = new WarcRecorder()) {
             new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
                     .crawlDomain();
@@ -83,7 +83,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

-        crawl(new CrawlSpecProvider.CrawlSpecRecord("startrek.website", 10, new ArrayList<>()));
+        crawl(new CrawlerMain.CrawlSpecRecord("startrek.website", 10, new ArrayList<>()));
     }

     @Test
@@ -92,7 +92,7 @@ public class CrawlerMockFetcherTest {

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

-        crawl(new CrawlSpecProvider.CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()));
+        crawl(new CrawlerMain.CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()));
     }

     @Test
@@ -103,7 +103,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

-        crawl(new CrawlSpecProvider.CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()));
+        crawl(new CrawlerMain.CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()));
     }

     class MockFetcher implements HttpFetcher {
@@ -4,11 +4,11 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.*;
-import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.io.CrawledDomainReader;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
@@ -76,7 +76,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testWarcOutput() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -118,7 +118,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testWarcOutputNoKnownUrls() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -161,7 +161,7 @@ class CrawlerRetreiverTest {
     @SneakyThrows
     @Test
     public void testResync() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -210,7 +210,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testWithKnownDomains() throws IOException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -254,7 +254,7 @@ class CrawlerRetreiverTest {

     @Test
     public void testRedirect() throws IOException, URISyntaxException {
-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(3)
                 .domain("www.marginalia.nu")
@@ -312,7 +312,7 @@ class CrawlerRetreiverTest {
     @Test
     public void testEmptySet() throws IOException {

-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(5)
                 .domain("www.marginalia.nu")
@@ -360,7 +360,7 @@ class CrawlerRetreiverTest {
     @Test
     public void testRecrawl() throws IOException {

-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(12)
                 .domain("www.marginalia.nu")
@@ -420,7 +420,7 @@ class CrawlerRetreiverTest {
     @Test
     public void testRecrawlWithResync() throws IOException {

-        var specs = CrawlSpecProvider.CrawlSpecRecord
+        var specs = CrawlerMain.CrawlSpecRecord
                 .builder()
                 .crawlDepth(12)
                 .domain("www.marginalia.nu")
@@ -508,7 +508,7 @@ class CrawlerRetreiverTest {
         }
     }

-    private void doCrawlWithReferenceStream(CrawlSpecProvider.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
+    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
         try (var recorder = new WarcRecorder(tempFileWarc2)) {
             new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
                     new CrawlDataReference(stream));
@@ -519,7 +519,7 @@ class CrawlerRetreiverTest {
     }

     @NotNull
-    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecProvider.CrawlSpecRecord specs) {
+    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
         try (var recorder = new WarcRecorder(tempFileWarc1)) {
             var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
             crawler.crawlDomain();