mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Fix a bug where reference copies of crawl data were written without ETag and Last-Modified
This commit also adds a band-aid to ParquetSerializableCrawlDataStream that recovers these headers from the preceding 304 record for the same URL. The band-aid can be removed in a few months.
This commit is contained in:
parent
964419803a
commit
22c8fb3f59
@ -103,6 +103,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private CrawledDocumentParquetRecord previousRecord = null;
|
||||||
|
|
||||||
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
|
private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) {
|
||||||
String bodyString = "";
|
String bodyString = "";
|
||||||
CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
|
CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
|
||||||
@ -130,6 +132,24 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
status = CrawlerDocumentStatus.ERROR;
|
status = CrawlerDocumentStatus.ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String etag = nextRecord.etagHeader;
|
||||||
|
String lastModified = nextRecord.lastModifiedHeader;
|
||||||
|
|
||||||
|
// If we have a previous record, and it was a 304, and this one is a 200, we'll use the ETag and Last-Modified
|
||||||
|
// from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug
|
||||||
|
// in the crawler. The bug is fixed, but we still need to support old crawls.
|
||||||
|
//
|
||||||
|
// This was added in 2024-01-18, so we can remove it in a few months.
|
||||||
|
|
||||||
|
if (previousRecord != null
|
||||||
|
&& previousRecord.url.equals(nextRecord.url)
|
||||||
|
&& previousRecord.httpStatus == 304
|
||||||
|
&& nextRecord.httpStatus == 200)
|
||||||
|
{
|
||||||
|
etag = previousRecord.etagHeader;
|
||||||
|
lastModified = previousRecord.lastModifiedHeader;
|
||||||
|
}
|
||||||
|
|
||||||
nextQ.add(new CrawledDocument("",
|
nextQ.add(new CrawledDocument("",
|
||||||
nextRecord.url,
|
nextRecord.url,
|
||||||
nextRecord.contentType,
|
nextRecord.contentType,
|
||||||
@ -144,11 +164,14 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
null,
|
null,
|
||||||
"",
|
"",
|
||||||
nextRecord.cookies,
|
nextRecord.cookies,
|
||||||
nextRecord.lastModifiedHeader,
|
lastModified,
|
||||||
nextRecord.etagHeader));
|
etag));
|
||||||
|
|
||||||
|
previousRecord = nextRecord;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
|
previousRecord = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -2,7 +2,6 @@ package nu.marginalia.crawl.retreival.fetcher.warc;
|
|||||||
|
|
||||||
import nu.marginalia.crawl.retreival.DomainProber;
|
import nu.marginalia.crawl.retreival.DomainProber;
|
||||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||||
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
|
|
||||||
import nu.marginalia.crawling.body.HttpFetchResult;
|
import nu.marginalia.crawling.body.HttpFetchResult;
|
||||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
@ -255,15 +254,6 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Flag the given URL as skipped by the crawler, so that it will not be retried.
|
|
||||||
* Which URLs were skipped is still important when resynchronizing on the WARC file,
|
|
||||||
* so that the crawler can avoid re-fetching them.
|
|
||||||
*/
|
|
||||||
public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) {
|
|
||||||
saveOldResponse(url, contentType, statusCode, documentBody, ContentTags.empty());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write a reference copy of the given document data. This is used when the crawler provides
|
* Write a reference copy of the given document data. This is used when the crawler provides
|
||||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||||
|
@ -5,6 +5,7 @@ import nu.marginalia.crawl.retreival.CrawlDataReference;
|
|||||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||||
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
||||||
|
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||||
import nu.marginalia.crawling.model.CrawledDocument;
|
import nu.marginalia.crawling.model.CrawledDocument;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
@ -84,7 +85,12 @@ public class CrawlerRevisitor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Add a WARC record so we don't repeat this
|
// Add a WARC record so we don't repeat this
|
||||||
warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody);
|
warcRecorder.writeReferenceCopy(url,
|
||||||
|
doc.contentType,
|
||||||
|
doc.httpStatus,
|
||||||
|
doc.documentBody,
|
||||||
|
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// GET the document with the stored document as a reference
|
// GET the document with the stored document as a reference
|
||||||
|
@ -73,10 +73,11 @@ class WarcRecorderTest {
|
|||||||
public void flagAsSkipped() throws IOException, URISyntaxException {
|
public void flagAsSkipped() throws IOException, URISyntaxException {
|
||||||
|
|
||||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||||
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
|
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||||
"text/html",
|
"text/html",
|
||||||
200,
|
200,
|
||||||
"<?doctype html><html><body>test</body></html>");
|
"<?doctype html><html><body>test</body></html>",
|
||||||
|
ContentTags.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var reader = new WarcReader(fileNameWarc)) {
|
try (var reader = new WarcReader(fileNameWarc)) {
|
||||||
@ -95,10 +96,11 @@ class WarcRecorderTest {
|
|||||||
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
|
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
|
||||||
|
|
||||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||||
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
|
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||||
"text/html",
|
"text/html",
|
||||||
200,
|
200,
|
||||||
null);
|
null,
|
||||||
|
ContentTags.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
@ -106,10 +108,11 @@ class WarcRecorderTest {
|
|||||||
@Test
|
@Test
|
||||||
public void testSaveImport() throws URISyntaxException, IOException {
|
public void testSaveImport() throws URISyntaxException, IOException {
|
||||||
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
try (var recorder = new WarcRecorder(fileNameWarc)) {
|
||||||
recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"),
|
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
|
||||||
"text/html",
|
"text/html",
|
||||||
200,
|
200,
|
||||||
"<?doctype html><html><body>test</body></html>");
|
"<?doctype html><html><body>test</body></html>",
|
||||||
|
ContentTags.empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var reader = new WarcReader(fileNameWarc)) {
|
try (var reader = new WarcReader(fileNameWarc)) {
|
||||||
|
Loading…
Reference in New Issue
Block a user