(crawler) Fix a bug where reference copies of crawl data were written without the ETag and Last-Modified headers

This commit also adds a band-aid to ParquetSerializableCrawlDataStream that, when reading older crawl data, recovers the ETag and Last-Modified headers from the preceding 304 record for the same URL.  This can be removed in a few months.
This commit is contained in:
Viktor Lofgren 2024-01-18 16:02:27 +01:00
parent 964419803a
commit 22c8fb3f59
4 changed files with 41 additions and 19 deletions

View File

@ -103,6 +103,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
)); ));
} }
private CrawledDocumentParquetRecord previousRecord = null;
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
String bodyString = ""; String bodyString = "";
CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
@ -130,6 +132,24 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
status = CrawlerDocumentStatus.ERROR; status = CrawlerDocumentStatus.ERROR;
} }
String etag = nextRecord.etagHeader;
String lastModified = nextRecord.lastModifiedHeader;
// If we have a previous record for the same URL, and it was a 304, and this one is a 200, we'll use the
// ETag and Last-Modified from the previous record, as it's not guaranteed that the reference copy has the
// same headers, due to a bug in the crawler. The bug is fixed, but we still need to support old crawls.
//
// This was added on 2024-01-18, so we can remove it in a few months.
if (previousRecord != null
&& previousRecord.url.equals(nextRecord.url)
&& previousRecord.httpStatus == 304
&& nextRecord.httpStatus == 200)
{
etag = previousRecord.etagHeader;
lastModified = previousRecord.lastModifiedHeader;
}
nextQ.add(new CrawledDocument("", nextQ.add(new CrawledDocument("",
nextRecord.url, nextRecord.url,
nextRecord.contentType, nextRecord.contentType,
@ -144,11 +164,14 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
null, null,
"", "",
nextRecord.cookies, nextRecord.cookies,
nextRecord.lastModifiedHeader, lastModified,
nextRecord.etagHeader)); etag));
previousRecord = nextRecord;
} }
public void close() throws IOException { public void close() throws IOException {
previousRecord = null;
} }
@Override @Override

View File

@ -2,7 +2,6 @@ package nu.marginalia.crawl.retreival.fetcher.warc;
import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawling.body.HttpFetchResult;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
@ -255,15 +254,6 @@ public class WarcRecorder implements AutoCloseable {
} }
} }
/**
* Flag the given URL as skipped by the crawler, so that it will not be retried.
* Which URLs were skipped is still important when resynchronizing on the WARC file,
* so that the crawler can avoid re-fetching them.
*/
public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty());
}
/** /**
* Write a reference copy of the given document data. This is used when the crawler provides * Write a reference copy of the given document data. This is used when the crawler provides
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this

View File

@ -5,6 +5,7 @@ import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer; import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier; import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDocument;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
@ -84,7 +85,12 @@ public class CrawlerRevisitor {
} }
// Add a WARC record so we don't repeat this // Add a WARC record so we don't repeat this
warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody); warcRecorder.writeReferenceCopy(url,
doc.contentType,
doc.httpStatus,
doc.documentBody,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
);
} }
else { else {
// GET the document with the stored document as a reference // GET the document with the stored document as a reference

View File

@ -73,10 +73,11 @@ class WarcRecorderTest {
public void flagAsSkipped() throws IOException, URISyntaxException { public void flagAsSkipped() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) { try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html", "text/html",
200, 200,
"<?doctype html><html><body>test</body></html>"); "<?doctype html><html><body>test</body></html>",
ContentTags.empty());
} }
try (var reader = new WarcReader(fileNameWarc)) { try (var reader = new WarcReader(fileNameWarc)) {
@ -95,10 +96,11 @@ class WarcRecorderTest {
public void flagAsSkippedNullBody() throws IOException, URISyntaxException { public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) { try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html", "text/html",
200, 200,
null); null,
ContentTags.empty());
} }
} }
@ -106,10 +108,11 @@ class WarcRecorderTest {
@Test @Test
public void testSaveImport() throws URISyntaxException, IOException { public void testSaveImport() throws URISyntaxException, IOException {
try (var recorder = new WarcRecorder(fileNameWarc)) { try (var recorder = new WarcRecorder(fileNameWarc)) {
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html", "text/html",
200, 200,
"<?doctype html><html><body>test</body></html>"); "<?doctype html><html><body>test</body></html>",
ContentTags.empty());
} }
try (var reader = new WarcReader(fileNameWarc)) { try (var reader = new WarcReader(fileNameWarc)) {