package nu.marginalia.livecrawler;

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;

/** A simple link scraper that fetches URLs and stores them in a database,
 *  with no concept of a crawl frontier, WARC output, or other advanced features.
 */
public class SimpleLinkScraper implements AutoCloseable {
    private static final Logger logger = LoggerFactory.getLogger(SimpleLinkScraper.class);

    private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("LiveCrawler", 32, 10);
    private final LinkParser lp = new LinkParser();
    private final LiveCrawlDataSet dataSet;
    private final DbDomainQueries domainQueries;
    private final DomainBlacklist domainBlacklist;

    private final Duration connectTimeout = Duration.ofSeconds(10);
    private final Duration readTimeout = Duration.ofSeconds(10);

    public SimpleLinkScraper(LiveCrawlDataSet dataSet,
                             DbDomainQueries domainQueries,
                             DomainBlacklist domainBlacklist) {
        this.dataSet = dataSet;
        this.domainQueries = domainQueries;
        this.domainBlacklist = domainBlacklist;
    }
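
    // Intended usage, as a rough sketch: the dependencies (dataSet, domainQueries,
    // domainBlacklist) are normally constructed elsewhere and are not shown in this
    // class, so the snippet below only illustrates the calling pattern. The domain
    // and URL are made-up examples, and it is assumed that EdgeDomain can be
    // constructed from a host name string.
    //
    //   try (var scraper = new SimpleLinkScraper(dataSet, domainQueries, domainBlacklist)) {
    //       scraper.scheduleRetrieval(new EdgeDomain("example.com"),
    //                                 List.of("https://example.com/some-page"));
    //   }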

    /** Queue a batch of URLs from a domain for retrieval, unless the domain is
     *  unknown to the database or blacklisted.
     */
    public void scheduleRetrieval(EdgeDomain domain, List<String> urls) {

        var id = domainQueries.tryGetDomainId(domain);
        if (id.isEmpty() || domainBlacklist.isBlacklisted(id.getAsInt())) {
            return;
        }

        pool.submitQuietly(() -> retrieveNow(domain, id.getAsInt(), urls));
    }
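
    // A note on resource handling in retrieveNow() below: the HttpClient is opened in a
    // try-with-resources block, which relies on java.net.http.HttpClient implementing
    // AutoCloseable. That is the case from JDK 21 onward, so this class assumes a recent
    // JDK. A fresh client is created per domain batch rather than shared across calls.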

    public void retrieveNow(EdgeDomain domain, int domainId, List<String> urls) throws Exception {
        try (HttpClient client = HttpClient
                .newBuilder()
                .connectTimeout(connectTimeout)
                .followRedirects(HttpClient.Redirect.NEVER)
                .version(HttpClient.Version.HTTP_2)
                .build()) {

            EdgeUrl rootUrl = domain.toRootUrlHttps();

            SimpleRobotRules rules = fetchRobotsRules(rootUrl, client);
            CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay());

            for (var url : urls) {
                Optional<EdgeUrl> optParsedUrl = lp.parseLink(rootUrl, url);
                if (optParsedUrl.isEmpty()) {
                    continue;
                }
                if (dataSet.hasUrl(optParsedUrl.get())) {
                    continue;
                }

                EdgeUrl parsedUrl = optParsedUrl.get();
                if (!rules.isAllowed(url)) {
                    continue;
                }

                switch (fetchUrl(domainId, parsedUrl, timer, client)) {
                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers)
                            -> dataSet.saveDocument(id, docUrl, body, headers, "");
                    case FetchResult.Error(EdgeUrl docUrl) -> {
                        // To give bad URLs a chance to be re-fetched, we only flag them as bad
                        // with a 20% probability.  This will prevent the same bad URL being
                        // re-fetched over and over again for several months, but still allow
                        // us to *mostly* re-fetch it if it was just a transient error.

                        // There's of course the chance we immediately flag it as bad on an
                        // unlucky roll, but you know, that's xcom baby
                        if (ThreadLocalRandom.current().nextDouble(0, 1) < 0.2) {
                            dataSet.flagAsBad(docUrl);
                        }
                    }
                }
            }
        }
    }

    /** Fetch and parse the domain's robots.txt.  If it can't be retrieved with a
     *  200 response, fall back to permitting everything.
     */
    private SimpleRobotRules fetchRobotsRules(EdgeUrl rootUrl, HttpClient client) throws IOException, InterruptedException, URISyntaxException {
        var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
                .GET()
                .header("User-Agent", WmsaHome.getUserAgent().uaString())
                .timeout(readTimeout);

        // Fetch the robots.txt, defaulting to allow-all rules unless we get a 200 response
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        SimpleRobotRules rules = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);

        HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
        if (robotsTxt.statusCode() == 200) {
            rules = parser.parseContent(rootUrl.toString(),
                    robotsTxt.body(),
                    robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
                    WmsaHome.getUserAgent().uaIdentifier());
        }

        return rules;
    }

    /** Fetch a single URL, returning the document body and headers on success.
     *  Persisting the result is the caller's responsibility.
     */
    private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {

        timer.waitFetchDelay();

        HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
                .GET()
                .header("User-Agent", WmsaHome.getUserAgent().uaString())
                .header("Accept", "text/html")
                .timeout(readTimeout)
                .build();

        try {
            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

            // Handle rate limiting by waiting and retrying once
            // (see the Retry-After parsing sketch after this method)
            if (response.statusCode() == 429) {
                timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
                        response.headers().firstValue("Retry-After").orElse("5")
                ));
                response = client.send(request, HttpResponse.BodyHandlers.ofString());
            }

            String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();

            if (response.statusCode() == 200) {
                if (!contentType.startsWith("text/html")) {
                    return new FetchResult.Error(parsedUrl);
                }

                String body = response.body();
                if (body.length() > 1024 * 1024) {
                    return new FetchResult.Error(parsedUrl);
                }

                return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
            }
        }
        catch (IOException ex) {
            // We don't want a full stack trace on every error, as it's quite common and very noisy
            logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
        }

        return new FetchResult.Error(parsedUrl);
    }
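
    // The Retry-After header above is passed through verbatim.  As a rough, self-contained
    // sketch of what interpreting it entails (not used anywhere in this class; the method
    // name and the 5-second fallback are illustrative assumptions), the header may carry
    // either a number of seconds or an HTTP-date:
    private static Duration parseRetryAfterSketch(String retryAfter) {
        try {
            // Most servers send a plain delay in seconds, e.g. "Retry-After: 120"
            return Duration.ofSeconds(Long.parseLong(retryAfter.trim()));
        }
        catch (NumberFormatException e) {
            try {
                // The header may also be an HTTP-date, e.g. "Retry-After: Fri, 31 Dec 1999 23:59:59 GMT"
                var date = java.time.ZonedDateTime.parse(retryAfter,
                        java.time.format.DateTimeFormatter.RFC_1123_DATE_TIME);
                return Duration.between(java.time.Instant.now(), date.toInstant());
            }
            catch (java.time.format.DateTimeParseException e2) {
                // Fall back to the same default the request code above uses
                return Duration.ofSeconds(5);
            }
        }
    }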

    sealed interface FetchResult {
        record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
        record Error(EdgeUrl url) implements FetchResult {}
    }
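
    // Because FetchResult is sealed and both implementations are records, the switch in
    // retrieveNow() can destructure them with record patterns and needs no default branch:
    // the compiler verifies that every permitted subtype is handled.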

    private String headersToString(HttpHeaders headers) {
        StringBuilder headersStr = new StringBuilder();
        headers.map().forEach((k, v) -> {
            // HttpHeaders maps each name to a list of values; join them rather than
            // relying on List.toString(), which would add brackets to the stored value
            headersStr.append(k).append(": ").append(String.join(", ", v)).append("\n");
        });
        return headersStr.toString();
    }
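
    // Illustrative output of headersToString(), with made-up header names and values:
    //
    //   Content-Type: text/html; charset=utf-8
    //   Cache-Control: max-age=60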

    @Override
    public void close() throws Exception {
        pool.shutDown();

        // Give in-flight tasks a generous window (up to 4 hours) to finish
        // before forcibly stopping whatever remains
        for (int i = 0; i < 4; i++) {
            pool.awaitTermination(1, TimeUnit.HOURS);
        }
        pool.shutDownNow();
    }
}