Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)
(crawler) Don't read all the data into RAM when doing a refresh-crawl
Commit 58f2f86ea8 (parent 7bc1cff286)
CrawledDomainReader.java
@@ -2,8 +2,10 @@ package nu.marginalia.crawling.io;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -14,6 +16,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ForkJoinPool;
@@ -27,6 +30,44 @@ public class CrawledDomainReader {
    public CrawledDomainReader() {
    }

    public Iterator<SerializableCrawlData> createIterator(Path path) throws IOException {
        BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));

        return new Iterator<>() {
            SerializableCrawlData next;

            @Override
            @SneakyThrows
            public boolean hasNext() {
                String identifier = br.readLine();
                if (identifier == null) {
                    br.close();
                    return false;
                }
                String data = br.readLine();
                if (data == null) {
                    br.close();
                    return false;
                }

                if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
                    next = gson.fromJson(data, CrawledDomain.class);
                } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
                    next = gson.fromJson(data, CrawledDocument.class);
                }
                else {
                    throw new IllegalStateException("Unknown identifier: " + identifier);
                }
                return true;
            }

            @Override
            public SerializableCrawlData next() {
                return next;
            }
        };
    }

    public CrawledDomain read(Path path) throws IOException {
        DomainDataAssembler domainData = new DomainDataAssembler();
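A minimal usage sketch (not part of the commit; the file path and the loop are invented for illustration) of how a caller can consume the new streaming reader without holding a whole domain's crawl data in RAM:

    // Hypothetical caller: records are deserialized one at a time and can be
    // garbage-collected as soon as they have been processed.
    var reader = new CrawledDomainReader();
    Iterator<SerializableCrawlData> it = reader.createIterator(Path.of("crawl-data/example.zstd")); // invented path
    while (it.hasNext()) {
        SerializableCrawlData record = it.next();
        if (record instanceof CrawledDocument doc) {
            System.out.println(doc.url);
        }
    }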
CrawlLimiter.java
@@ -8,65 +8,22 @@ import java.util.concurrent.Semaphore;
public class CrawlLimiter {
    public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);

    // We'll round up to this size when we're crawling a new domain to prevent
    // too many concurrent connections
    public static final int minCrawlDataSizeKb = 128; // 128 Kb

    // The largest size on disk where we'll permit a refresh crawl
    // (these files easily grow into the gigabytes, we don't want that in RAM)
    public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb

    // This limits how many concurrent crawl tasks we can have running at once
    // based on their size on disk. The on-disk size is compressed, and the
    // in-ram size is partially compressed (i.e. only the document body); so
    // maybe a fair estimate is something like 2-4x this figure for RAM usage
    //
    public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb

    static {
        // Sanity check; if this is false we'll get a deadlock on taskSemRAM
        assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes
                : "maxConcurrentCrawlTaskSizeKb must not be smaller than maxRefreshableCrawlDataSizeKBytes";
    }

    public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}

    // We use two semaphores to keep track of the number of concurrent crawls;
    // first a RAM semaphore to limit the amount of RAM used by refresh crawls,
    // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable)
    private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb);
    private final Semaphore taskSemCount = new Semaphore(maxPoolSize);

    public CrawlTaskLimits getTaskLimits(Path fileName) {
        long size;

        try {
            size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024);
        } catch (IOException ex) {
            // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either
            return new CrawlTaskLimits(null, false, minCrawlDataSizeKb);
        }

        // We'll only permit refresh crawls if the file is small enough
        boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes;

        // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure
        // it's possible to acquire the RAM semaphore
        int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size);

        return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize);
        return new CrawlTaskLimits(fileName, true, 1);
    }

    public void acquire(CrawlTaskLimits properties) throws InterruptedException {
        // It's very important that we acquire the RAM semaphore first to avoid a deadlock
        taskSemRAM.acquire(properties.taskSize);
        taskSemCount.acquire(1);
    }

    public void release(CrawlTaskLimits properties) {
        taskSemCount.release(1);
        taskSemRAM.release(properties.taskSize);
    }
}
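A sketch of the intended calling pattern (the surrounding method and the crawl-data path are invented for illustration): permits are acquired before a crawl task starts and released in a finally block so they cannot leak.

    // Hypothetical caller of CrawlLimiter.
    void runCrawlTask(CrawlLimiter limiter, Path crawlDataFile) throws InterruptedException {
        var limits = limiter.getTaskLimits(crawlDataFile);

        limiter.acquire(limits);   // blocks until RAM and thread-count permits are available
        try {
            // ... crawl the domain, streaming results to disk as they arrive ...
        } finally {
            limiter.release(limits);  // released in the reverse order of acquisition
        }
    }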
CrawlerMain.java
@@ -10,6 +10,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
@@ -32,10 +33,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;

@@ -201,19 +199,23 @@ public class CrawlerMain implements AutoCloseable {

            HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);

            // Read the previous crawl's data for this domain, if it exists and has a reasonable size
            Optional<CrawledDomain> domain;
            if (limits.isRefreshable()) {
                domain = reader.readOptionally(limits.refreshPath());
                if (domain.isPresent()) {
                    specification = specification.withOldData(domain.get());
            Iterator<SerializableCrawlData> iterator;
            try {
                if (limits.isRefreshable()) {
                    iterator = reader.createIterator(limits.refreshPath());
                }
                else {
                    iterator = Collections.emptyIterator();
                }
            } catch (IOException e) {
                logger.warn("Failed to read previous crawl data for {}", specification.domain);
                iterator = Collections.emptyIterator();
            }

            try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
                var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);

                int size = retreiver.fetch();
                int size = retreiver.fetch(iterator);

                workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);
CrawlerRetreiver.java
@@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
@@ -18,6 +19,7 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
@@ -58,15 +60,13 @@ public class CrawlerRetreiver {
    private final SitemapRetriever sitemapRetriever;
    private final DomainCrawlFrontier crawlFrontier;

    private final CrawlDataReference oldCrawlData;

    int errorCount = 0;
    private String retainedTag = "RETAINED/304";

    public CrawlerRetreiver(HttpFetcher fetcher,
                            CrawlingSpecification specs,
                            Consumer<SerializableCrawlData> writer) {
        this.fetcher = fetcher;
        this.oldCrawlData = new CrawlDataReference(specs.oldData);

        id = specs.id;
        domain = specs.domain;
@@ -97,10 +97,14 @@ public class CrawlerRetreiver {
    }

    public int fetch() {
        return fetch(Collections.emptyIterator());
    }

    public int fetch(Iterator<SerializableCrawlData> oldCrawlData) {
        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());

        if (probeResult instanceof DomainProber.ProbeResultOk) {
            return crawlDomain();
            return crawlDomain(oldCrawlData);
        }

        // handle error cases for probe
@@ -137,44 +141,29 @@ public class CrawlerRetreiver {
        throw new IllegalStateException("Unknown probe result: " + probeResult);
    };

    private int crawlDomain() {
    private int crawlDomain(Iterator<SerializableCrawlData> oldCrawlData) {
        String ip = findIp(domain);

        assert !crawlFrontier.isEmpty();

        var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
        long crawlDelay = robotsRules.getCrawlDelay();

        CrawlDataComparison comparison = compareWithOldData(robotsRules);
        logger.info("Comparison result for {} : {}", domain, comparison);
        sniffRootDocument();

        // If we have reference data, we will always grow the crawl depth a bit
        if (oldCrawlData.size() > 0) {
        // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
        int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);

        if (recrawled > 0) {
            // If we have reference data, we will always grow the crawl depth a bit
            crawlFrontier.increaseDepth(1.5);
        }

        // When the reference data doesn't appear to have changed, we'll forego
        // re-fetching it and just use the old data
        if (comparison == CrawlDataComparison.NO_CHANGES) {
            oldCrawlData.allDocuments().forEach((url, doc) -> {
                if (crawlFrontier.addVisited(url)) {
                    doc.recrawlState = "RETAINED";
                    crawledDomainWriter.accept(doc);
                }
            });

            // We don't need to hold onto this in RAM anymore
            oldCrawlData.evict();
        }

        downloadSitemaps(robotsRules);
        sniffRootDocument();

        long crawlDelay = robotsRules.getCrawlDelay();

        CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);

        int fetchedCount = 0;
        int fetchedCount = recrawled;

        while (!crawlFrontier.isEmpty()
            && !crawlFrontier.isCrawlDepthReached()
@@ -187,11 +176,6 @@ public class CrawlerRetreiver {
                continue;
            }

            // Don't re-fetch links that were previously found dead as it's very unlikely that a
            // 404:ing link will suddenly start working at a later point
            if (oldCrawlData.isPreviouslyDead(top))
                continue;

            // Check the link filter if the endpoint should be fetched based on site-type
            if (!crawlFrontier.filterLink(top))
                continue;
@@ -211,7 +195,7 @@ public class CrawlerRetreiver {
                continue;

            if (fetchDocument(top, crawlDelay).isPresent()) {
            if (fetchDocument(top, null, crawlDelay).isPresent()) {
                fetchedCount++;
            }
        }
@@ -223,63 +207,69 @@ public class CrawlerRetreiver {
        return fetchedCount;
    }

    private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) {
    private int recrawl(Iterator<SerializableCrawlData> oldCrawlData,
                        SimpleRobotRules robotsRules,
                        long crawlDelay) {
        int recrawled = 0;
        int retained = 0;

        int numGoodDocuments = oldCrawlData.size();
        while (oldCrawlData.hasNext()) {
            if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue;

        if (numGoodDocuments == 0)
            return CrawlDataComparison.NO_OLD_DATA;
            // This Shouldn't Happen (TM)
            var urlMaybe = EdgeUrl.parse(doc.url);
            if (urlMaybe.isEmpty()) continue;
            var url = urlMaybe.get();

        if (numGoodDocuments < 10)
            return CrawlDataComparison.SMALL_SAMPLE;

        // We fetch a sample of the data to assess how much it has changed
        int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments);
        Map<EdgeUrl, CrawledDocument> referenceUrls = oldCrawlData.sample(sampleSize);

        int differences = 0;

        long crawlDelay = robotsRules.getCrawlDelay();
        for (var url : referenceUrls.keySet()) {

            var docMaybe = fetchDocument(url, crawlDelay);
            if (docMaybe.isEmpty()) {
                differences++;
            // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
            if (doc.httpStatus == 404) {
                crawlFrontier.addVisited(url);
                continue;
            }

            var newDoc = docMaybe.get();
            var referenceDoc = referenceUrls.get(url);
            if (doc.httpStatus != 200) continue;

            // This looks like a bug but it is not, we want to compare references
            // to detect if the page has bounced off etag or last-modified headers
            // to avoid having to do a full content comparison
            if (newDoc == referenceDoc)
            if (!robotsRules.isAllowed(url.toString())) {
                crawledDomainWriter.accept(createRobotsError(url));
                continue;
            }
            if (!crawlFrontier.filterLink(url))
                continue;
            if (!crawlFrontier.addVisited(url))
                continue;

            if (newDoc.httpStatus != referenceDoc.httpStatus) {
                differences++;

            if (recrawled > 10
                && retained > 0.9 * recrawled
                && Math.random() < 0.75)
            {
                logger.info("Direct-loading {}", url);

                // Since it looks like most of these documents haven't changed,
                // we'll load the documents directly; but we do this in a random
                // fashion to make sure we eventually catch changes over time

                crawledDomainWriter.accept(doc);
                crawlFrontier.addVisited(url);
                continue;
            }

            if (newDoc.documentBody == null) {
                differences++;
                continue;

            // GET the document with the stored document as a reference
            // providing etag and last-modified headers, so we can recycle the
            // document if it hasn't changed without actually downloading it

            var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
            if (fetchedDocOpt.isEmpty()) continue;

            if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
                retained ++;
            }

            long referenceLsh = hashDoc(referenceDoc);
            long newLsh = hashDoc(newDoc);

            if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) {
                differences++;
            }
        }
        if (differences > sampleSize/4) {
            return CrawlDataComparison.CHANGES_FOUND;
        }
        else {
            return CrawlDataComparison.NO_CHANGES;
            recrawled ++;
        }

        return recrawled;
    }

    private static final HashFunction hasher = Hashing.murmur3_128(0);
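The direct-load branch in recrawl() above is probabilistic. Restated as a standalone predicate (a hypothetical helper, not part of the commit), the decision reads:

    // Once more than ten documents have been re-checked and ~90% of them came
    // back unchanged (304), copy roughly three out of four remaining old
    // documents forward directly, and still re-fetch the rest so that changes
    // are eventually noticed.
    static boolean shouldDirectLoad(int recrawled, int retained) {
        return recrawled > 10
            && retained > 0.9 * recrawled
            && Math.random() < 0.75;
    }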
@@ -346,7 +336,7 @@ public class CrawlerRetreiver {

        var url = crawlFrontier.peek().withPathAndParam("/", null);

        var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200);
        var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
        if (maybeSample.isEmpty())
            return;
        var sample = maybeSample.get();
@@ -382,23 +372,21 @@ public class CrawlerRetreiver {
        }
    }

    private Optional<CrawledDocument> fetchDocument(EdgeUrl top, long crawlDelay) {
    private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
                                                    @Nullable CrawledDocument reference,
                                                    long crawlDelay) {
        logger.debug("Fetching {}", top);

        long startTime = System.currentTimeMillis();

        var doc = fetchUrl(top);
        var doc = fetchUrl(top, reference);
        if (doc.isPresent()) {
            var d = doc.get();
            crawledDomainWriter.accept(d);
            oldCrawlData.dispose(top);

            if (d.url != null) {
                // We may have redirected to a different path
                EdgeUrl.parse(d.url).ifPresent(url -> {
                    crawlFrontier.addVisited(url);
                    oldCrawlData.dispose(url);
                });
                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
            }

            if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
@@ -418,14 +406,31 @@ public class CrawlerRetreiver {
            || proto.equalsIgnoreCase("https");
    }

    private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
    private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
        try {
            var doc = fetchContent(top);
            var contentTags = getContentTags(reference);
            var fetchedDoc = fetchContent(top, contentTags);
            CrawledDocument doc;

            // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
            // we fetched it last time. We can recycle the reference document.
            if (reference != null
                && fetchedDoc.httpStatus == 304)
            {
                doc = reference;
                doc.recrawlState = retainedTag;
                doc.timestamp = LocalDateTime.now().toString();
            }
            else {
                doc = fetchedDoc;
            }

            if (doc.documentBody != null) {
                doc.documentBodyHash = createHash(doc.documentBody.decode());
                var decoded = doc.documentBody.decode();

                Optional<Document> parsedDoc = parseDoc(doc);
                doc.documentBodyHash = createHash(decoded);

                Optional<Document> parsedDoc = parseDoc(decoded);
                EdgeUrl url = new EdgeUrl(doc.url);

                parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
@@ -443,23 +448,37 @@ public class CrawlerRetreiver {

    }

    private ContentTags getContentTags(@Nullable CrawledDocument reference) {
        if (null == reference)
            return ContentTags.empty();

        String headers = reference.headers;
        if (headers == null)
            return ContentTags.empty();

        String[] headersLines = headers.split("\n");

        String lastmod = null;
        String etag = null;

        for (String line : headersLines) {
            if (line.toLowerCase().startsWith("etag:")) {
                etag = line.substring(5).trim();
            }
            if (line.toLowerCase().startsWith("last-modified:")) {
                lastmod = line.substring(14).trim();
            }
        }

        return new ContentTags(etag, lastmod);
    }

    @SneakyThrows
    private CrawledDocument fetchContent(EdgeUrl top) {
    private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
        for (int i = 0; i < 2; i++) {
            try {
                var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top));

                var doc = fetcher.fetchContent(top, tags);
                doc.recrawlState = "NEW";

                if (doc.httpStatus == 304) {
                    var referenceData = oldCrawlData.getDoc(top);
                    if (referenceData != null) {
                        referenceData.recrawlState = "304/UNCHANGED";
                        return referenceData;
                    }
                }

                return doc;
            }
            catch (RateLimitException ex) {
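To make the behaviour of getContentTags() above concrete, a hypothetical input/output example (the header block below is invented sample data): header names are matched case-insensitively and values are trimmed as-is, so an ETag keeps its surrounding quotes.

    // Invented stored header blob, as it might appear on a CrawledDocument:
    String headers = """
            Content-Type: text/html
            ETag: "abc123"
            Last-Modified: Mon, 01 May 2023 00:00:00 GMT
            """;
    // Per the parsing loop above, getContentTags would yield:
    //   etag    -> "abc123"   (quotes retained, ready to echo back in If-None-Match)
    //   lastMod -> Mon, 01 May 2023 00:00:00 GMT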
@@ -478,10 +497,8 @@ public class CrawlerRetreiver {
        return hashMethod.hashUnencodedChars(documentBodyHash).toString();
    }

    private Optional<Document> parseDoc(CrawledDocument doc) {
        if (doc.documentBody == null)
            return Optional.empty();
        return Optional.of(Jsoup.parse(doc.documentBody.decode()));
    private Optional<Document> parseDoc(String decoded) {
        return Optional.of(Jsoup.parse(decoded));
    }

    private void findLinks(EdgeUrl baseUrl, Document parsed) {
ContentTags.java (new file)
@@ -0,0 +1,24 @@
package nu.marginalia.crawl.retreival.fetcher;

import okhttp3.Request;

/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {
    public static ContentTags empty() {
        return new ContentTags(null, null);
    }

    public boolean isPresent() {
        return etag != null || lastMod != null;
    }

    public boolean isEmpty() {
        return etag == null && lastMod == null;
    }

    /** Paints the tags onto the request builder. */
    public void paint(Request.Builder getBuilder) {
        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
    }
}
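A short usage sketch (not part of the commit; the tag values and URL are invented) showing how ContentTags turns a plain GET into a conditional request that can come back as 304 Not Modified:

    // Hypothetical conditional GET built with OkHttp's Request.Builder.
    var tags = new ContentTags("\"abc123\"", "Mon, 01 May 2023 00:00:00 GMT");

    var getBuilder = new okhttp3.Request.Builder()
            .get()
            .url("https://www.example.com/");

    tags.paint(getBuilder);   // adds If-None-Match and If-Modified-Since when present

    okhttp3.Request request = getBuilder.build();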
HttpFetcher.java
@@ -18,7 +18,7 @@ public interface HttpFetcher {

    FetchResult probeDomain(EdgeUrl url);

    CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException;
    CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;

    SimpleRobotRules fetchRobotRules(EdgeDomain domain);
HttpFetcherImpl.java
@@ -128,9 +128,15 @@ public class HttpFetcherImpl implements HttpFetcher {

    @Override
    @SneakyThrows
    public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException {
    public CrawledDocument fetchContent(EdgeUrl url,
                                        ContentTags contentTags)
            throws RateLimitException
    {

        if (contentTypeLogic.isUrlLikeBinary(url)) {
        // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
        // looks like it might be something else, we perform a HEAD first to check the content type
        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
        {
            logger.debug("Probing suspected binary {}", url);

            var headBuilder = new Request.Builder().head()
@@ -146,6 +152,21 @@ public class HttpFetcherImpl implements HttpFetcher {
                if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
                    return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
                }

                // Update the URL to the final URL of the HEAD request, otherwise we might end up doing

                // HEAD 301 url1 -> url2
                // HEAD 200 url2
                // GET 301 url1 -> url2
                // GET 200 url2

                // which is not what we want. We want to do as few requests as possible to not raise
                // too many eyebrows when looking at the logs on the target server. It's probably desirable
                // that the traffic looks like it makes sense, as opposed to looking like a broken bot.

                var redirectUrl = new EdgeUrl(rsp.request().url().toString());
                if (Objects.equals(redirectUrl.domain, url.domain))
                    url = redirectUrl;
            }
            catch (SocketTimeoutException ex) {
                return createTimeoutErrorRsp(url, ex);
@@ -157,12 +178,12 @@ public class HttpFetcherImpl implements HttpFetcher {
        }

        var getBuilder = new Request.Builder().get();

        getBuilder.addHeader("User-agent", userAgent)
                .url(url.toString())
                .addHeader("Accept-Encoding", "gzip");

        if (etag != null) getBuilder.addHeader("If-None-Match", etag);
        if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
        contentTags.paint(getBuilder);

        var get = getBuilder.build();
        var call = client.newCall(get);
@@ -314,7 +335,7 @@ public class HttpFetcherImpl implements HttpFetcher {
    private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
        try {
            var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
            return Optional.of(parseRobotsTxt(fetchContent(url, null, null)));
            return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
        }
        catch (Exception ex) {
            return Optional.empty();
HttpFetcherTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawling;

import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl;
@@ -29,14 +30,14 @@ class HttpFetcherTest {
    @Test
    void fetchUTF8() throws URISyntaxException, RateLimitException {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null);
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
        System.out.println(str.contentType);
    }

    @Test
    void fetchText() throws URISyntaxException, RateLimitException {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null);
        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
        System.out.println(str);
    }
}
CrawlerMockFetcherTest.java
@@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
@@ -126,7 +123,7 @@ public class CrawlerMockFetcherTest {
        }

        @Override
        public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) {
        public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
            logger.info("Fetching {}", url);
            if (mockData.containsKey(url)) {
                return mockData.get(url);
CrawlerRetreiverTest.java
@@ -5,12 +5,18 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawledDomainWriter;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.*;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -99,7 +105,7 @@ class CrawlerRetreiverTest {
    }

    @Test
    public void testRecrawl() {
    public void testRecrawl() throws IOException {

        var specs = CrawlingSpecification
                .builder()
@@ -110,6 +116,8 @@ class CrawlerRetreiverTest {
                .build();

        Path out = Files.createTempDirectory("crawling-process");
        var writer = new CrawledDomainWriter(out, "test", "123456");
        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

        new CrawlerRetreiver(httpFetcher, specs, d -> {
@@ -117,7 +125,12 @@ class CrawlerRetreiverTest {
            if (d instanceof CrawledDocument doc) {
                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
            }
            writer.accept(d);
        }).fetch();
        writer.close();

        var reader = new CrawledDomainReader();
        var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test"));

        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
@@ -128,6 +141,7 @@ class CrawlerRetreiverTest {
            if (d instanceof CrawledDocument doc) {
                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
            }
        }).fetch();
        }).fetch(iter);

    }
}