(crawler) Don't read all the data into RAM when doing a refresh-crawl

Viktor Lofgren 2023-07-21 19:47:52 +02:00
parent 7bc1cff286
commit 58f2f86ea8
10 changed files with 249 additions and 175 deletions

View File

@ -2,8 +2,10 @@ package nu.marginalia.crawling.io;
import com.github.luben.zstd.ZstdInputStream;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -14,6 +16,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ForkJoinPool;
@ -27,6 +30,44 @@ public class CrawledDomainReader {
public CrawledDomainReader() {
}
public Iterator<SerializableCrawlData> createIterator(Path path) throws IOException {
BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));
return new Iterator<>() {
SerializableCrawlData next;
@Override
@SneakyThrows
public boolean hasNext() {
String identifier = br.readLine();
if (identifier == null) {
br.close();
return false;
}
String data = br.readLine();
if (data == null) {
br.close();
return false;
}
if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDomain.class);
} else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) {
next = gson.fromJson(data, CrawledDocument.class);
}
else {
throw new IllegalStateException("Unknown identifier: " + identifier);
}
return true;
}
@Override
public SerializableCrawlData next() {
return next;
}
};
}
public CrawledDomain read(Path path) throws IOException {
DomainDataAssembler domainData = new DomainDataAssembler();

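The new reader exposes the crawl data as a stream: each record on disk is two lines, a serial identifier followed by a JSON payload, and the iterator deserializes one record at a time instead of assembling a whole CrawledDomain in memory. A minimal consumption sketch (the crawl data path is a placeholder); note that each hasNext() call advances the underlying stream, so it should be paired with exactly one next() call:

var reader = new CrawledDomainReader();
Iterator<SerializableCrawlData> iter = reader.createIterator(crawlDataPath);

while (iter.hasNext()) {
    SerializableCrawlData record = iter.next();

    if (record instanceof CrawledDomain domain) {
        // domain-level metadata (crawler status, ip, ...)
    }
    else if (record instanceof CrawledDocument doc) {
        // per-document data; process it and let it go out of scope,
        // nothing is accumulated in RAM
    }
}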
View File

@ -8,65 +8,22 @@ import java.util.concurrent.Semaphore;
public class CrawlLimiter {
public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);
// We'll round up to this size when we're crawling a new domain to prevent
// too many concurrent connections
public static final int minCrawlDataSizeKb = 128; // 128 KB
// The largest size on disk where we'll permit a refresh crawl
// (these files easily grow into the gigabytes, we don't want that in RAM)
public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb
// This limits how many concurrent crawl tasks we can have running at once
// based on their size on disk. The on-disk size is compressed, and the
// in-ram size is partially compressed (i.e. only the document body); so
// maybe a fair estimate is something like 2-4x this figure for RAM usage
//
public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb
static {
// Sanity check; if this is false we'll get a deadlock on taskSemRAM
assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes
: "maxConcurrentCrawlTaskSizeKb must be larger than maxRefreshableCrawlDataSizeKBytes";
}
public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}
// We use two semaphores to keep track of the number of concurrent crawls;
// first a RAM semaphore to limit the amount of RAM used by refresh crawls,
// then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable)
private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb);
private final Semaphore taskSemCount = new Semaphore(maxPoolSize);
public CrawlTaskLimits getTaskLimits(Path fileName) {
long size;
try {
size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024);
} catch (IOException ex) {
// If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either
return new CrawlTaskLimits(null,false, minCrawlDataSizeKb);
}
// We'll only permit refresh crawls if the file is small enough
boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes;
// We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure
// it's possible to acquire the RAM semaphore
int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size);
return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize);
return new CrawlTaskLimits(fileName, true, 1);
}
public void acquire(CrawlTaskLimits properties) throws InterruptedException {
// It's very important that we acquire the RAM semaphore first to avoid a deadlock
taskSemRAM.acquire(properties.taskSize);
taskSemCount.acquire(1);
}
public void release(CrawlTaskLimits properties) {
taskSemCount.release(1);
taskSemRAM.release(properties.taskSize);
}
}

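Most of the sizing machinery above is removed by this commit; the new getTaskLimits appears to return a constant task size of 1, so the limiter now mostly bounds how many crawl tasks run concurrently. A usage sketch of how a crawl task is expected to bracket its work with the limiter (the method, path and task body are hypothetical):

void runTask(CrawlLimiter limiter, Path crawlDataFile) throws InterruptedException {
    var limits = limiter.getTaskLimits(crawlDataFile);

    limiter.acquire(limits);
    try {
        // run the (re)crawl for this domain, streaming old data from
        // limits.refreshPath() when limits.isRefreshable()
    } finally {
        limiter.release(limits);
    }
}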
View File

@ -10,6 +10,7 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.SerializableCrawlData;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage;
@ -32,10 +33,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
@ -201,19 +199,23 @@ public class CrawlerMain implements AutoCloseable {
HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
// Read the previous crawl's data for this domain, if it exists and has a reasonable size
Optional<CrawledDomain> domain;
if (limits.isRefreshable()) {
domain = reader.readOptionally(limits.refreshPath());
if (domain.isPresent()) {
specification = specification.withOldData(domain.get());
Iterator<SerializableCrawlData> iterator;
try {
if (limits.isRefreshable()) {
iterator = reader.createIterator(limits.refreshPath());
}
else {
iterator = Collections.emptyIterator();
}
} catch (IOException e) {
logger.warn("Failed to read previous crawl data for {}", specification.domain);
iterator = Collections.emptyIterator();
}
try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
int size = retreiver.fetch();
int size = retreiver.fetch(iterator);
workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);

View File

@ -4,6 +4,7 @@ import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
@ -18,6 +19,7 @@ import org.jsoup.nodes.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.time.LocalDateTime;
@ -58,15 +60,13 @@ public class CrawlerRetreiver {
private final SitemapRetriever sitemapRetriever;
private final DomainCrawlFrontier crawlFrontier;
private final CrawlDataReference oldCrawlData;
int errorCount = 0;
private String retainedTag = "RETAINED/304";
public CrawlerRetreiver(HttpFetcher fetcher,
CrawlingSpecification specs,
Consumer<SerializableCrawlData> writer) {
this.fetcher = fetcher;
this.oldCrawlData = new CrawlDataReference(specs.oldData);
id = specs.id;
domain = specs.domain;
@ -97,10 +97,14 @@ public class CrawlerRetreiver {
}
public int fetch() {
return fetch(Collections.emptyIterator());
}
public int fetch(Iterator<SerializableCrawlData> oldCrawlData) {
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());
if (probeResult instanceof DomainProber.ProbeResultOk) {
return crawlDomain();
return crawlDomain(oldCrawlData);
}
// handle error cases for probe
@ -137,44 +141,29 @@ public class CrawlerRetreiver {
throw new IllegalStateException("Unknown probe result: " + probeResult);
};
private int crawlDomain() {
private int crawlDomain(Iterator<SerializableCrawlData> oldCrawlData) {
String ip = findIp(domain);
assert !crawlFrontier.isEmpty();
var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
long crawlDelay = robotsRules.getCrawlDelay();
CrawlDataComparison comparison = compareWithOldData(robotsRules);
logger.info("Comparison result for {} : {}", domain, comparison);
sniffRootDocument();
// If we have reference data, we will always grow the crawl depth a bit
if (oldCrawlData.size() > 0) {
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = recrawl(oldCrawlData, robotsRules, crawlDelay);
if (recrawled > 0) {
// If we have reference data, we will always grow the crawl depth a bit
crawlFrontier.increaseDepth(1.5);
}
// When the reference data doesn't appear to have changed, we'll forego
// re-fetching it and just use the old data
if (comparison == CrawlDataComparison.NO_CHANGES) {
oldCrawlData.allDocuments().forEach((url, doc) -> {
if (crawlFrontier.addVisited(url)) {
doc.recrawlState = "RETAINED";
crawledDomainWriter.accept(doc);
}
});
// We don't need to hold onto this in RAM anymore
oldCrawlData.evict();
}
downloadSitemaps(robotsRules);
sniffRootDocument();
long crawlDelay = robotsRules.getCrawlDelay();
CrawledDomain ret = new CrawledDomain(id, domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null);
int fetchedCount = 0;
int fetchedCount = recrawled;
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()
@ -187,11 +176,6 @@ public class CrawlerRetreiver {
continue;
}
// Don't re-fetch links that were previously found dead as it's very unlikely that a
// 404:ing link will suddenly start working at a later point
if (oldCrawlData.isPreviouslyDead(top))
continue;
// Check the link filter if the endpoint should be fetched based on site-type
if (!crawlFrontier.filterLink(top))
continue;
@ -211,7 +195,7 @@ public class CrawlerRetreiver {
continue;
if (fetchDocument(top, crawlDelay).isPresent()) {
if (fetchDocument(top, null, crawlDelay).isPresent()) {
fetchedCount++;
}
}
@ -223,63 +207,69 @@ public class CrawlerRetreiver {
return fetchedCount;
}
private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) {
private int recrawl(Iterator<SerializableCrawlData> oldCrawlData,
SimpleRobotRules robotsRules,
long crawlDelay) {
int recrawled = 0;
int retained = 0;
int numGoodDocuments = oldCrawlData.size();
while (oldCrawlData.hasNext()) {
if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue;
if (numGoodDocuments == 0)
return CrawlDataComparison.NO_OLD_DATA;
// This Shouldn't Happen (TM)
var urlMaybe = EdgeUrl.parse(doc.url);
if (urlMaybe.isEmpty()) continue;
var url = urlMaybe.get();
if (numGoodDocuments < 10)
return CrawlDataComparison.SMALL_SAMPLE;
// We fetch a sample of the data to assess how much it has changed
int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments);
Map<EdgeUrl, CrawledDocument> referenceUrls = oldCrawlData.sample(sampleSize);
int differences = 0;
long crawlDelay = robotsRules.getCrawlDelay();
for (var url : referenceUrls.keySet()) {
var docMaybe = fetchDocument(url, crawlDelay);
if (docMaybe.isEmpty()) {
differences++;
// If we've previously 404:d on this URL, we'll refrain from trying to fetch it again
if (doc.httpStatus == 404) {
crawlFrontier.addVisited(url);
continue;
}
var newDoc = docMaybe.get();
var referenceDoc = referenceUrls.get(url);
if (doc.httpStatus != 200) continue;
// This looks like a bug but it is not, we want to compare references
// to detect if the page has bounced off etag or last-modified headers
// to avoid having to do a full content comparison
if (newDoc == referenceDoc)
if (!robotsRules.isAllowed(url.toString())) {
crawledDomainWriter.accept(createRobotsError(url));
continue;
}
if (!crawlFrontier.filterLink(url))
continue;
if (!crawlFrontier.addVisited(url))
continue;
if (newDoc.httpStatus != referenceDoc.httpStatus) {
differences++;
if (recrawled > 10
&& retained > 0.9 * recrawled
&& Math.random() < 0.75)
{
logger.info("Direct-loading {}", url);
// Since it looks like most of these documents haven't changed,
// we'll load the documents directly; but we do this in a random
// fashion to make sure we eventually catch changes over time
crawledDomainWriter.accept(doc);
crawlFrontier.addVisited(url);
continue;
}
if (newDoc.documentBody == null) {
differences++;
continue;
// GET the document with the stored document as a reference
// providing etag and last-modified headers, so we can recycle the
// document if it hasn't changed without actually downloading it
var fetchedDocOpt = fetchDocument(url, doc, crawlDelay);
if (fetchedDocOpt.isEmpty()) continue;
if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
retained ++;
}
long referenceLsh = hashDoc(referenceDoc);
long newLsh = hashDoc(newDoc);
if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) {
differences++;
}
}
if (differences > sampleSize/4) {
return CrawlDataComparison.CHANGES_FOUND;
}
else {
return CrawlDataComparison.NO_CHANGES;
recrawled ++;
}
return recrawled;
}
private static final HashFunction hasher = Hashing.murmur3_128(0);
@ -346,7 +336,7 @@ public class CrawlerRetreiver {
var url = crawlFrontier.peek().withPathAndParam("/", null);
var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200);
var maybeSample = fetchUrl(url, null).filter(sample -> sample.httpStatus == 200);
if (maybeSample.isEmpty())
return;
var sample = maybeSample.get();
@ -382,23 +372,21 @@ public class CrawlerRetreiver {
}
}
private Optional<CrawledDocument> fetchDocument(EdgeUrl top, long crawlDelay) {
private Optional<CrawledDocument> fetchDocument(EdgeUrl top,
@Nullable CrawledDocument reference,
long crawlDelay) {
logger.debug("Fetching {}", top);
long startTime = System.currentTimeMillis();
var doc = fetchUrl(top);
var doc = fetchUrl(top, reference);
if (doc.isPresent()) {
var d = doc.get();
crawledDomainWriter.accept(d);
oldCrawlData.dispose(top);
if (d.url != null) {
// We may have redirected to a different path
EdgeUrl.parse(d.url).ifPresent(url -> {
crawlFrontier.addVisited(url);
oldCrawlData.dispose(url);
});
EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
}
if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {
@ -418,14 +406,31 @@ public class CrawlerRetreiver {
|| proto.equalsIgnoreCase("https");
}
private Optional<CrawledDocument> fetchUrl(EdgeUrl top) {
private Optional<CrawledDocument> fetchUrl(EdgeUrl top, @Nullable CrawledDocument reference) {
try {
var doc = fetchContent(top);
var contentTags = getContentTags(reference);
var fetchedDoc = fetchContent(top, contentTags);
CrawledDocument doc;
// HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when
// we fetched it last time. We can recycle the reference document.
if (reference != null
&& fetchedDoc.httpStatus == 304)
{
doc = reference;
doc.recrawlState = retainedTag;
doc.timestamp = LocalDateTime.now().toString();
}
else {
doc = fetchedDoc;
}
if (doc.documentBody != null) {
doc.documentBodyHash = createHash(doc.documentBody.decode());
var decoded = doc.documentBody.decode();
Optional<Document> parsedDoc = parseDoc(doc);
doc.documentBodyHash = createHash(decoded);
Optional<Document> parsedDoc = parseDoc(decoded);
EdgeUrl url = new EdgeUrl(doc.url);
parsedDoc.ifPresent(parsed -> findLinks(url, parsed));
@ -443,23 +448,37 @@ public class CrawlerRetreiver {
}
private ContentTags getContentTags(@Nullable CrawledDocument reference) {
if (null == reference)
return ContentTags.empty();
String headers = reference.headers;
if (headers == null)
return ContentTags.empty();
String[] headersLines = headers.split("\n");
String lastmod = null;
String etag = null;
for (String line : headersLines) {
if (line.toLowerCase().startsWith("etag:")) {
etag = line.substring(5).trim();
}
if (line.toLowerCase().startsWith("last-modified:")) {
lastmod = line.substring(14).trim();
}
}
return new ContentTags(etag, lastmod);
}
@SneakyThrows
private CrawledDocument fetchContent(EdgeUrl top) {
private CrawledDocument fetchContent(EdgeUrl top, ContentTags tags) {
for (int i = 0; i < 2; i++) {
try {
var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top));
var doc = fetcher.fetchContent(top, tags);
doc.recrawlState = "NEW";
if (doc.httpStatus == 304) {
var referenceData = oldCrawlData.getDoc(top);
if (referenceData != null) {
referenceData.recrawlState = "304/UNCHANGED";
return referenceData;
}
}
return doc;
}
catch (RateLimitException ex) {
@ -478,10 +497,8 @@ public class CrawlerRetreiver {
return hashMethod.hashUnencodedChars(documentBodyHash).toString();
}
private Optional<Document> parseDoc(CrawledDocument doc) {
if (doc.documentBody == null)
return Optional.empty();
return Optional.of(Jsoup.parse(doc.documentBody.decode()));
private Optional<Document> parseDoc(String decoded) {
return Optional.of(Jsoup.parse(decoded));
}
private void findLinks(EdgeUrl baseUrl, Document parsed) {

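getContentTags recovers the stored validators from the reference document's raw header block using case-insensitive prefix matching. A worked example with hypothetical header values:

String headers = "Content-Type: text/html\n"
               + "ETag: \"33a64df5\"\n"
               + "Last-Modified: Wed, 21 Oct 2015 07:28:00 GMT";

// The parsing loop in getContentTags would yield the equivalent of:
ContentTags tags = new ContentTags("\"33a64df5\"", "Wed, 21 Oct 2015 07:28:00 GMT");

// fetchContent(top, tags) sends these as If-None-Match / If-Modified-Since,
// and a 304 reply lets fetchUrl recycle the stored document as RETAINED/304.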
View File

@ -0,0 +1,24 @@
package nu.marginalia.crawl.retreival.fetcher;
import okhttp3.Request;
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {
public static ContentTags empty() {
return new ContentTags(null, null);
}
public boolean isPresent() {
return etag != null || lastMod != null;
}
public boolean isEmpty() {
return etag == null && lastMod == null;
}
/** Paints the tags onto the request builder. */
public void paint(Request.Builder getBuilder) {
if (etag != null) getBuilder.addHeader("If-None-Match", etag);
if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
}
}

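A usage sketch of paint() on an okhttp request builder, similar to what HttpFetcherImpl does before issuing its GET (URL and tag values are placeholders):

var tags = new ContentTags("\"33a64df5\"", "Wed, 21 Oct 2015 07:28:00 GMT");

var getBuilder = new Request.Builder().get()
        .url("https://www.example.com/some-page")
        .addHeader("Accept-Encoding", "gzip");

tags.paint(getBuilder);   // adds If-None-Match and If-Modified-Since

Request request = getBuilder.build();
// A 304 Not Modified response to this request means the previously stored
// document can be recycled instead of re-downloading the body.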
View File

@ -18,7 +18,7 @@ public interface HttpFetcher {
FetchResult probeDomain(EdgeUrl url);
CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException;
CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain);

View File

@ -128,9 +128,15 @@ public class HttpFetcherImpl implements HttpFetcher {
@Override
@SneakyThrows
public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException {
public CrawledDocument fetchContent(EdgeUrl url,
ContentTags contentTags)
throws RateLimitException
{
if (contentTypeLogic.isUrlLikeBinary(url)) {
// We don't want to waste time and resources on URLs that are not HTML, so if the file ending
// looks like it might be something else, we perform a HEAD first to check the content type
if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
{
logger.debug("Probing suspected binary {}", url);
var headBuilder = new Request.Builder().head()
@ -146,6 +152,21 @@ public class HttpFetcherImpl implements HttpFetcher {
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed");
}
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
// HEAD 301 url1 -> url2
// HEAD 200 url2
// GET 301 url1 -> url2
// GET 200 url2
// which is not what we want. Overall we want to do as few requests as possible, so as not to raise
// too many eyebrows when someone looks at the logs on the target server; it's desirable
// that the traffic looks like it makes sense, as opposed to looking like a broken bot.
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
if (Objects.equals(redirectUrl.domain, url.domain))
url = redirectUrl;
}
catch (SocketTimeoutException ex) {
return createTimeoutErrorRsp(url, ex);
@ -157,12 +178,12 @@ public class HttpFetcherImpl implements HttpFetcher {
}
var getBuilder = new Request.Builder().get();
getBuilder.addHeader("User-agent", userAgent)
.url(url.toString())
.addHeader("Accept-Encoding", "gzip");
if (etag != null) getBuilder.addHeader("If-None-Match", etag);
if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
contentTags.paint(getBuilder);
var get = getBuilder.build();
var call = client.newCall(get);
@ -314,7 +335,7 @@ public class HttpFetcherImpl implements HttpFetcher {
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
try {
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
return Optional.of(parseRobotsTxt(fetchContent(url, null, null)));
return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty())));
}
catch (Exception ex) {
return Optional.empty();

View File

@ -2,6 +2,7 @@ package nu.marginalia.crawling;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.logic.ContentTypeLogic;
import nu.marginalia.model.EdgeUrl;
@ -29,14 +30,14 @@ class HttpFetcherTest {
@Test
void fetchUTF8() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null);
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty());
System.out.println(str.contentType);
}
@Test
void fetchText() throws URISyntaxException, RateLimitException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null);
var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty());
System.out.println(str);
}
}

View File

@ -4,10 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.bigstring.BigString;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.FetchResult;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.crawling.model.SerializableCrawlData;
@ -126,7 +123,7 @@ public class CrawlerMockFetcherTest {
}
@Override
public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) {
public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) {
logger.info("Fetching {}", url);
if (mockData.containsKey(url)) {
return mockData.get(url);

View File

@ -5,12 +5,18 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.CrawledDomainWriter;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.crawling.model.CrawledDomain;
import nu.marginalia.crawling.model.spec.CrawlingSpecification;
import nu.marginalia.crawling.model.SerializableCrawlData;
import org.junit.jupiter.api.*;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@ -99,7 +105,7 @@ class CrawlerRetreiverTest {
}
@Test
public void testRecrawl() {
public void testRecrawl() throws IOException {
var specs = CrawlingSpecification
.builder()
@ -110,6 +116,8 @@ class CrawlerRetreiverTest {
.build();
Path out = Files.createTempDirectory("crawling-process");
var writer = new CrawledDomainWriter(out, "test", "123456");
Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
new CrawlerRetreiver(httpFetcher, specs, d -> {
@ -117,7 +125,12 @@ class CrawlerRetreiverTest {
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
}
writer.accept(d);
}).fetch();
writer.close();
var reader = new CrawledDomainReader();
var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test"));
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
@ -128,6 +141,7 @@ class CrawlerRetreiverTest {
if (d instanceof CrawledDocument doc) {
System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
}
}).fetch();
}).fetch(iter);
}
}