From 3a56a06c4fc274b878f350fce9f901a91bea7d7b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Dec 2023 17:45:54 +0100 Subject: [PATCH] (warc) Add fields for etags and last-modified headers to the new crawl data formats Make some temporary modifications to the CrawledDocument model to support both a "big string" style headers field like in the old formats, and explicit fields as in the new formats. This is a bit awkward to deal with, but it's a necessity until we migrate off the old formats entirely. The commit also adds a few tests to this logic. --- .../ParquetSerializableCrawlDataStream.java | 4 +- .../WarcSerializableCrawlDataStream.java | 10 +- .../crawling/model/CrawledDocument.java | 50 +++++++++ .../parquet/CrawledDocumentParquetRecord.java | 16 ++- ...rawledDocumentParquetRecordFileWriter.java | 20 +++- .../crawling/model/CrawledDocumentTest.java | 101 ++++++++++++++++++ ...edDocumentParquetRecordFileWriterTest.java | 3 +- .../sideload/SideloaderProcessing.java | 4 +- .../converting/ConvertingIntegrationTest.java | 4 +- .../revisit/DocumentWithReference.java | 19 +--- .../revisit/DocumentWithReferenceTest.java | 86 +++++++++++++++ .../retreival/CrawlerRetreiverTest.java | 6 +- 12 files changed, 294 insertions(+), 29 deletions(-) create mode 100644 code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/model/CrawledDocumentTest.java create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index d3e54a07..71159526 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -118,7 +118,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial nextRecord.url, null, "", - nextRecord.cookies)); + nextRecord.cookies, + nextRecord.lastModifiedHeader, + nextRecord.etagHeader)); } public void close() throws IOException { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java index 2cdb7af1..b848d7ad 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -82,6 +82,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa return; } + var httpHeaders = http.headers(); + var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response)); if (parsedBody instanceof DocumentBodyResult.Error error) { next = new CrawledDocument( @@ -98,7 +100,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa "", "", "", - WarcXCookieInformationHeader.hasCookies(response) + WarcXCookieInformationHeader.hasCookies(response), + null, + null ); } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { next = new CrawledDocument( @@ -115,7 +119,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa "", "", "", - WarcXCookieInformationHeader.hasCookies(response)); + WarcXCookieInformationHeader.hasCookies(response), + httpHeaders.first("Last-Modified").orElse(""), + httpHeaders.first("ETag").orElse("")); } else { // unreachable throw new IllegalStateException("Unknown body type: " + parsedBody); diff --git 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 39d9b56e..497f7a00 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -5,6 +5,8 @@ import lombok.Builder; import lombok.ToString; import nu.marginalia.bigstring.BigString; import nu.marginalia.model.EdgeUrl; +import org.apache.commons.lang3.StringUtils; +import org.jetbrains.annotations.Nullable; @Builder @AllArgsConstructor @@ -21,7 +23,10 @@ public class CrawledDocument implements SerializableCrawlData { public String crawlerStatus; public String crawlerStatusDesc; + @Nullable + @Deprecated // use getETag() or getLastModified() instead public String headers; + public String documentBody; @Deprecated @@ -38,6 +43,51 @@ public class CrawledDocument implements SerializableCrawlData { * information may come in CrawledDomain instead */ public Boolean hasCookies = false; + public String lastModifiedMaybe; + public String etagMaybe; + + @Nullable + private String getHeader(String header) { + if (headers == null) { + return null; + } + + String headerString = header + ":"; + + String[] headersLines = StringUtils.split(headers, '\n'); + for (String headerLine : headersLines) { + if (StringUtils.startsWithIgnoreCase(headerLine, headerString)) { + return headerLine.substring(headerString.length()).trim(); + } + } + + return null; + } + + /** Returns the ETag header, or null if not present; + *

+ * this is a compatibility shim between the old json format, which saves headers in a long string + * and the new parquet format which saves only the ETag and Last-Modified headers in separate columns + * */ + public String getEtag() { + if (etagMaybe != null) { + return etagMaybe; + } + return getHeader("ETag"); + } + + /** Returns the Last-Modified header, or null if not present + *

+ * this is a compatibility shim between the old json format, which saves headers in a long string + * and the new parquet format which saves only the ETag and Last-Modified headers in separate columns + * */ + public String getLastModified() { + if (lastModifiedMaybe != null) { + return lastModifiedMaybe; + } + return getHeader("Last-Modified"); + } + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; @Override public String getSerialIdentifier() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java index c96aeb25..55deafdb 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java @@ -29,6 +29,9 @@ public class CrawledDocumentParquetRecord { public String contentType; public byte[] body; + public String etagHeader; + public String lastModifiedHeader; + public static Hydrator newHydrator() { return new CrawledDocumentParquetRecordHydrator(); } @@ -46,7 +49,9 @@ public class CrawledDocumentParquetRecord { Types.required(INT32).named("httpStatus"), Types.required(INT64).named("epochSeconds"), Types.required(BINARY).as(stringType()).named("contentType"), - Types.required(BINARY).named("body") + Types.required(BINARY).named("body"), + Types.optional(BINARY).as(stringType()).named("etagHeader"), + Types.optional(BINARY).as(stringType()).named("lastModifiedHeader") ); @@ -60,6 +65,9 @@ public class CrawledDocumentParquetRecord { case "contentType" -> contentType = (String) value; case "body" -> body = (byte[]) value; case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value); + case "etagHeader" -> etagHeader = (String) value; + case "lastModifiedHeader" -> 
lastModifiedHeader = (String) value; + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '\''); } return this; @@ -74,6 +82,12 @@ public class CrawledDocumentParquetRecord { valueWriter.write("cookies", cookies); valueWriter.write("contentType", contentType); valueWriter.write("body", body); + if (etagHeader != null) { + valueWriter.write("etagHeader", etagHeader); + } + if (lastModifiedHeader != null) { + valueWriter.write("lastModifiedHeader", lastModifiedHeader); + } } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 9245156f..831399b9 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -131,11 +131,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return; } + + byte[] bodyBytes; String contentType; var body = DocumentBodyExtractor.asBytes(result); + var headers = fetchOk.headers(); + if (body instanceof DocumentBodyResult.Ok bodyOk) { bodyBytes = bodyOk.body(); contentType = bodyOk.contentType().toString(); @@ -153,7 +157,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { fetchOk.statusCode(), response.date(), contentType, - bodyBytes) + bodyBytes, + headers.get("ETag"), + headers.get("Last-Modified")) ); } @@ -170,7 +176,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { 0, date, "x-marginalia/advisory;state=redirect", - new byte[0] + new byte[0], + null, + null ); } private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) { @@ -181,7 +189,9 @@ 
public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { 0, date, "x-marginalia/advisory;state=error", - errorStatus.getBytes() + errorStatus.getBytes(), + null, + null ); } @@ -193,7 +203,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { 0, date, errorStatus, - new byte[0] + new byte[0], + null, + null ); } diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/model/CrawledDocumentTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/model/CrawledDocumentTest.java new file mode 100644 index 00000000..8612fd39 --- /dev/null +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/model/CrawledDocumentTest.java @@ -0,0 +1,101 @@ +package nu.marginalia.crawling.model; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawledDocumentTest { + + /** These tests are AI-generated hence have kinda inconsistent naming */ + + @Test + void getEtagShouldReturnEtagIfPresent() { + CrawledDocument crawledDocument = CrawledDocument.builder() + .etagMaybe("12345") + .build(); + + // Etag is present, method should return it. + String etag = crawledDocument.getEtag(); + assertEquals("12345", etag); + } + + @Test + void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersAreNull() { + CrawledDocument crawledDocument = CrawledDocument.builder() + .etagMaybe(null) + .headers(null) + .build(); + + // Etag and headers are absent, method should return null. + String etag = crawledDocument.getEtag(); + assertNull(etag); + } + + @Test + void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersDoNotContainEtag() { + CrawledDocument crawledDocument = CrawledDocument.builder() + .etagMaybe(null) + .headers("Some irrelevant headers") + .build(); + + // Headers do not contain an ETag, method should return null. 
+ String etag = crawledDocument.getEtag(); + assertNull(etag); + } + + @Test + void getEtagShouldReturnEtagFromHeadersIfPresent() { + CrawledDocument crawledDocument = CrawledDocument.builder() + .etagMaybe(null) + .headers("ETag: 67890") + .build(); + + // Headers contain an ETag, method should return it. + String etag = crawledDocument.getEtag(); + assertEquals("67890", etag); + } + + @Test + public void testGetLastModified_withLastModifiedDateInHeaders() { + // Arrange + String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT"; + CrawledDocument crawledDocument = CrawledDocument.builder() + .headers("Last-Modified: " + lastModifiedDate) + .build(); + + // Act + String actualLastModifiedDate = crawledDocument.getLastModified(); + + // Assert + assertEquals(lastModifiedDate, actualLastModifiedDate); + } + + @Test + public void testGetLastModified_withoutLastModifiedDateInHeaders() { + // Arrange + CrawledDocument crawledDocument = CrawledDocument.builder() + .headers("Some-Other-Header: Some value") + .build(); + + // Act + String actualLastModifiedDate = crawledDocument.getLastModified(); + + // Assert + assertNull(actualLastModifiedDate); + } + + @Test + public void testGetLastModified_withLastModifiedDateInField() { + // Arrange + String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT"; + CrawledDocument crawledDocument = CrawledDocument.builder() + .lastModifiedMaybe(lastModifiedDate) + .build(); + + // Act + String actualLastModifiedDate = crawledDocument.getLastModified(); + + // Assert + assertEquals(lastModifiedDate, actualLastModifiedDate); + } +} \ No newline at end of file diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index c79154a4..17a8ad73 100644 --- 
a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -38,7 +38,8 @@ class CrawledDocumentParquetRecordFileWriterTest { 200, Instant.now(), "text/html", - "hello world".getBytes()); + "hello world".getBytes(), + null, null); try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) { writer.write(original); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 1a034ac2..3e871f9a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -51,7 +51,9 @@ public class SideloaderProcessing { url, "", "SIDELOAD", - false + false, + null, + null ); var ret = new ProcessedDocument(); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index eaa9d813..b7963bdf 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -116,7 +116,9 @@ public class ConvertingIntegrationTest { "https://memex.marginalia.nu/" + file, null, "", - false + false, + null, + null ); docs.add(doc); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index a0559aec..a1533480 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -49,22 +49,11 @@ public record DocumentWithReference( if (null == doc) return ContentTags.empty(); - String headers = doc.headers; - if (headers == null) + String lastmod = doc.getLastModified(); + String etag = doc.getEtag(); + + if (lastmod == null && etag == null) { return ContentTags.empty(); - - String[] headersLines = headers.split("\n"); - - String lastmod = null; - String etag = null; - - for (String line : headersLines) { - if (line.toLowerCase().startsWith("etag:")) { - etag = line.substring(5).trim(); - } - if (line.toLowerCase().startsWith("last-modified:")) { - lastmod = line.substring(14).trim(); - } } return new ContentTags(etag, lastmod); diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java new file mode 100644 index 00000000..384c7d11 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java @@ -0,0 +1,86 @@ +package nu.marginalia.crawl.retreival.revisit; + +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; +import nu.marginalia.crawling.model.CrawledDocument; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +public class DocumentWithReferenceTest { + + // test case for when doc is null + @Test + public void getContentTags_docIsNull() { + // set up test data + CrawledDocument doc = null; + 
CrawlDataReference reference = new CrawlDataReference(); + + DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference); + + // execute method under test + ContentTags contentTags = documentWithReference.getContentTags(); + + // verify that returned content tags is empty + assertTrue(contentTags.isEmpty()); + } + + // test case for when doc is not null, and lastModified and eTag are null + @Test + public void getContentTags_lastModifiedAndETagIsNull() { + // set up test data + CrawledDocument doc = CrawledDocument.builder().build(); // both lastModified and eTag are null + CrawlDataReference reference = new CrawlDataReference(); + + DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference); + + // execute method under test + ContentTags contentTags = documentWithReference.getContentTags(); + + // verify that returned content tags is empty + assertTrue(contentTags.isEmpty()); + } + + // test case for when doc is not null, and lastModified and eTag are not null + @Test + public void getContentTags_lastModifiedAndETagAreNotNull_NewCrawlData() { + // set up test data + CrawledDocument doc = CrawledDocument.builder() + .etagMaybe("12345") + .lastModifiedMaybe("67890") + .build(); // assume lastModified and eTag are not null + CrawlDataReference reference = new CrawlDataReference(); + + DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference); + + // execute method under test + ContentTags contentTags = documentWithReference.getContentTags(); + + // verify that returned content tags is present + assertFalse(contentTags.isEmpty()); + assertEquals("12345", contentTags.etag()); + assertEquals("67890", contentTags.lastMod()); + } + + @Test + public void getContentTags_lastModifiedAndETagAreNotNull_LegacyCrawlData() { + // set up test data + CrawledDocument doc = CrawledDocument.builder() + .headers(""" + Etag: 12345 + Last-Modified: 67890 + """) + .build(); // assume lastModified and 
eTag are not null + CrawlDataReference reference = new CrawlDataReference(); + + DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference); + + // execute method under test + ContentTags contentTags = documentWithReference.getContentTags(); + + // verify that returned content tags is present + assertFalse(contentTags.isEmpty()); + assertEquals("12345", contentTags.etag()); + assertEquals("67890", contentTags.lastMod()); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 286f15f5..bfcc1617 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -234,6 +234,8 @@ class CrawlerRetreiverTest { } var stream = CrawledDomainReader.createDataStream(tempFile); + System.out.println("---"); + CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); try (var recorder = new WarcRecorder(tempFile2)) { @@ -244,8 +246,6 @@ class CrawlerRetreiverTest { Assertions.fail(ex); } - new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out); - try (var reader = new WarcReader(tempFile2)) { WarcXResponseReference.register(reader); @@ -270,7 +270,7 @@ class CrawlerRetreiverTest { System.out.println(dr.domain + "/" + dr.crawlerStatus); } else if (doc instanceof CrawledDocument dc) { - System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus); + System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp); } } } catch (Exception e) {