(warc) Add fields for ETag and Last-Modified headers to the new crawl data formats

Make some temporary modifications to the CrawledDocument model to support both a "big string" style headers field like in the old formats, and explicit fields as in the new formats.  This is a bit awkward to deal with, but it's a necessity until we migrate off the old formats entirely.

The commit also adds a few tests to this logic.
This commit is contained in:
Viktor Lofgren 2023-12-18 17:45:54 +01:00
parent 126ac3816f
commit 3a56a06c4f
12 changed files with 294 additions and 29 deletions

View File

@ -118,7 +118,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.url, nextRecord.url,
null, null,
"", "",
nextRecord.cookies)); nextRecord.cookies,
nextRecord.lastModifiedHeader,
nextRecord.etagHeader));
} }
public void close() throws IOException { public void close() throws IOException {

View File

@ -82,6 +82,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
return; return;
} }
var httpHeaders = http.headers();
var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response)); var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));
if (parsedBody instanceof DocumentBodyResult.Error<String> error) { if (parsedBody instanceof DocumentBodyResult.Error<String> error) {
next = new CrawledDocument( next = new CrawledDocument(
@ -98,7 +100,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
"", "",
"", "",
"", "",
WarcXCookieInformationHeader.hasCookies(response) WarcXCookieInformationHeader.hasCookies(response),
null,
null
); );
} else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) { } else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
next = new CrawledDocument( next = new CrawledDocument(
@ -115,7 +119,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
"", "",
"", "",
"", "",
WarcXCookieInformationHeader.hasCookies(response)); WarcXCookieInformationHeader.hasCookies(response),
httpHeaders.first("Last-Modified").orElse(""),
httpHeaders.first("ETag").orElse(""));
} else { } else {
// unreachable // unreachable
throw new IllegalStateException("Unknown body type: " + parsedBody); throw new IllegalStateException("Unknown body type: " + parsedBody);

View File

@ -5,6 +5,8 @@ import lombok.Builder;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.bigstring.BigString; import nu.marginalia.bigstring.BigString;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
@Builder @Builder
@AllArgsConstructor @AllArgsConstructor
@ -21,7 +23,10 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatus; public String crawlerStatus;
public String crawlerStatusDesc; public String crawlerStatusDesc;
@Nullable
@Deprecated // use getETag() or getLastModified() instead
public String headers; public String headers;
public String documentBody; public String documentBody;
@Deprecated @Deprecated
@ -38,6 +43,51 @@ public class CrawledDocument implements SerializableCrawlData {
* information may come in CrawledDomain instead */ * information may come in CrawledDomain instead */
public Boolean hasCookies = false; public Boolean hasCookies = false;
public String lastModifiedMaybe;
public String etagMaybe;
@Nullable
private String getHeader(String header) {
if (headers == null) {
return null;
}
String headerString = header + ":";
String[] headersLines = StringUtils.split(headers, '\n');
for (String headerLine : headersLines) {
if (StringUtils.startsWithIgnoreCase(headerLine, headerString)) {
return headerLine.substring(headerString.length()).trim();
}
}
return null;
}
/** Returns the ETag header, or null if not present;
* <p>
* this is a compatibility shim between the old json format, which saves headers in a long string
* and the new parquet format which saves only the ETag and Last-Modified headers in separate columns
* */
public String getEtag() {
if (etagMaybe != null) {
return etagMaybe;
}
return getHeader("ETag");
}
/** Returns the Last-Modified header, or null if not present
* <p>
* this is a compatibility shim between the old json format, which saves headers in a long string
* * and the new parquet format which saves only the ETag and Last-Modified headers in separate columns
* */
public String getLastModified() {
if (lastModifiedMaybe != null) {
return lastModifiedMaybe;
}
return getHeader("Last-Modified");
}
public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
@Override @Override
public String getSerialIdentifier() { public String getSerialIdentifier() {

View File

@ -29,6 +29,9 @@ public class CrawledDocumentParquetRecord {
public String contentType; public String contentType;
public byte[] body; public byte[] body;
public String etagHeader;
public String lastModifiedHeader;
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() { public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
return new CrawledDocumentParquetRecordHydrator(); return new CrawledDocumentParquetRecordHydrator();
} }
@ -46,7 +49,9 @@ public class CrawledDocumentParquetRecord {
Types.required(INT32).named("httpStatus"), Types.required(INT32).named("httpStatus"),
Types.required(INT64).named("epochSeconds"), Types.required(INT64).named("epochSeconds"),
Types.required(BINARY).as(stringType()).named("contentType"), Types.required(BINARY).as(stringType()).named("contentType"),
Types.required(BINARY).named("body") Types.required(BINARY).named("body"),
Types.optional(BINARY).as(stringType()).named("etagHeader"),
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
); );
@ -60,6 +65,9 @@ public class CrawledDocumentParquetRecord {
case "contentType" -> contentType = (String) value; case "contentType" -> contentType = (String) value;
case "body" -> body = (byte[]) value; case "body" -> body = (byte[]) value;
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value); case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
case "etagHeader" -> etagHeader = (String) value;
case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
} }
return this; return this;
@ -74,6 +82,12 @@ public class CrawledDocumentParquetRecord {
valueWriter.write("cookies", cookies); valueWriter.write("cookies", cookies);
valueWriter.write("contentType", contentType); valueWriter.write("contentType", contentType);
valueWriter.write("body", body); valueWriter.write("body", body);
if (etagHeader != null) {
valueWriter.write("etagHeader", etagHeader);
}
if (lastModifiedHeader != null) {
valueWriter.write("lastModifiedHeader", lastModifiedHeader);
}
} }
} }

View File

@ -131,11 +131,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
return; return;
} }
byte[] bodyBytes; byte[] bodyBytes;
String contentType; String contentType;
var body = DocumentBodyExtractor.asBytes(result); var body = DocumentBodyExtractor.asBytes(result);
var headers = fetchOk.headers();
if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) { if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) {
bodyBytes = bodyOk.body(); bodyBytes = bodyOk.body();
contentType = bodyOk.contentType().toString(); contentType = bodyOk.contentType().toString();
@ -153,7 +157,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
fetchOk.statusCode(), fetchOk.statusCode(),
response.date(), response.date(),
contentType, contentType,
bodyBytes) bodyBytes,
headers.get("ETag"),
headers.get("Last-Modified"))
); );
} }
@ -170,7 +176,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
0, 0,
date, date,
"x-marginalia/advisory;state=redirect", "x-marginalia/advisory;state=redirect",
new byte[0] new byte[0],
null,
null
); );
} }
private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) { private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) {
@ -181,7 +189,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
0, 0,
date, date,
"x-marginalia/advisory;state=error", "x-marginalia/advisory;state=error",
errorStatus.getBytes() errorStatus.getBytes(),
null,
null
); );
} }
@ -193,7 +203,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
0, 0,
date, date,
errorStatus, errorStatus,
new byte[0] new byte[0],
null,
null
); );
} }

View File

@ -0,0 +1,101 @@
package nu.marginalia.crawling.model;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
class CrawledDocumentTest {
/** These tests are AI-generated hence have kinda inconsistent naming */
@Test
void getEtagShouldReturnEtagIfPresent() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe("12345")
.build();
// Etag is present, method should return it.
String etag = crawledDocument.getEtag();
assertEquals("12345", etag);
}
@Test
void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersAreNull() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe(null)
.headers(null)
.build();
// Etag and headers are absent, method should return null.
String etag = crawledDocument.getEtag();
assertNull(etag);
}
@Test
void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersDoNotContainEtag() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe(null)
.headers("Some irrelevant headers")
.build();
// Headers do not contain an ETag, method should return null.
String etag = crawledDocument.getEtag();
assertNull(etag);
}
@Test
void getEtagShouldReturnEtagFromHeadersIfPresent() {
CrawledDocument crawledDocument = CrawledDocument.builder()
.etagMaybe(null)
.headers("ETag: 67890")
.build();
// Headers contain an ETag, method should return it.
String etag = crawledDocument.getEtag();
assertEquals("67890", etag);
}
@Test
public void testGetLastModified_withLastModifiedDateInHeaders() {
// Arrange
String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
CrawledDocument crawledDocument = CrawledDocument.builder()
.headers("Last-Modified: " + lastModifiedDate)
.build();
// Act
String actualLastModifiedDate = crawledDocument.getLastModified();
// Assert
assertEquals(lastModifiedDate, actualLastModifiedDate);
}
@Test
public void testGetLastModified_withoutLastModifiedDateInHeaders() {
// Arrange
CrawledDocument crawledDocument = CrawledDocument.builder()
.headers("Some-Other-Header: Some value")
.build();
// Act
String actualLastModifiedDate = crawledDocument.getLastModified();
// Assert
assertNull(actualLastModifiedDate);
}
@Test
public void testGetLastModified_withLastModifiedDateInField() {
// Arrange
String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
CrawledDocument crawledDocument = CrawledDocument.builder()
.lastModifiedMaybe(lastModifiedDate)
.build();
// Act
String actualLastModifiedDate = crawledDocument.getLastModified();
// Assert
assertEquals(lastModifiedDate, actualLastModifiedDate);
}
}

View File

@ -38,7 +38,8 @@ class CrawledDocumentParquetRecordFileWriterTest {
200, 200,
Instant.now(), Instant.now(),
"text/html", "text/html",
"hello world".getBytes()); "hello world".getBytes(),
null, null);
try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) { try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
writer.write(original); writer.write(original);

View File

@ -51,7 +51,9 @@ public class SideloaderProcessing {
url, url,
"", "",
"SIDELOAD", "SIDELOAD",
false false,
null,
null
); );
var ret = new ProcessedDocument(); var ret = new ProcessedDocument();

View File

@ -116,7 +116,9 @@ public class ConvertingIntegrationTest {
"https://memex.marginalia.nu/" + file, "https://memex.marginalia.nu/" + file,
null, null,
"", "",
false false,
null,
null
); );
docs.add(doc); docs.add(doc);
} }

View File

@ -49,22 +49,11 @@ public record DocumentWithReference(
if (null == doc) if (null == doc)
return ContentTags.empty(); return ContentTags.empty();
String headers = doc.headers; String lastmod = doc.getLastModified();
if (headers == null) String etag = doc.getEtag();
if (lastmod == null && etag == null) {
return ContentTags.empty(); return ContentTags.empty();
String[] headersLines = headers.split("\n");
String lastmod = null;
String etag = null;
for (String line : headersLines) {
if (line.toLowerCase().startsWith("etag:")) {
etag = line.substring(5).trim();
}
if (line.toLowerCase().startsWith("last-modified:")) {
lastmod = line.substring(14).trim();
}
} }
return new ContentTags(etag, lastmod); return new ContentTags(etag, lastmod);

View File

@ -0,0 +1,86 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawling.model.CrawledDocument;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.*;
public class DocumentWithReferenceTest {
// test case for when doc is null
@Test
public void getContentTags_docIsNull() {
// set up test data
CrawledDocument doc = null;
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is empty
assertTrue(contentTags.isEmpty());
}
// test case for when doc is not null, and lastModified and eTag are null
@Test
public void getContentTags_lastModifiedAndETagIsNull() {
// set up test data
CrawledDocument doc = CrawledDocument.builder().build(); // both lastModified and eTag are null
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is empty
assertTrue(contentTags.isEmpty());
}
// test case for when doc is not null, and lastModified and eTag are not null
@Test
public void getContentTags_lastModifiedAndETagAreNotNull_NewCrawlData() {
// set up test data
CrawledDocument doc = CrawledDocument.builder()
.etagMaybe("12345")
.lastModifiedMaybe("67890")
.build(); // assume lastModified and eTag are not null
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is present
assertFalse(contentTags.isEmpty());
assertEquals("12345", contentTags.etag());
assertEquals("67890", contentTags.lastMod());
}
@Test
public void getContentTags_lastModifiedAndETagAreNotNull_LegacyCrawlData() {
// set up test data
CrawledDocument doc = CrawledDocument.builder()
.headers("""
Etag: 12345
Last-Modified: 67890
""")
.build(); // assume lastModified and eTag are not null
CrawlDataReference reference = new CrawlDataReference();
DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
// execute method under test
ContentTags contentTags = documentWithReference.getContentTags();
// verify that returned content tags is present
assertFalse(contentTags.isEmpty());
assertEquals("12345", contentTags.etag());
assertEquals("67890", contentTags.lastMod());
}
}

View File

@ -234,6 +234,8 @@ class CrawlerRetreiverTest {
} }
var stream = CrawledDomainReader.createDataStream(tempFile); var stream = CrawledDomainReader.createDataStream(tempFile);
System.out.println("---");
CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
try (var recorder = new WarcRecorder(tempFile2)) { try (var recorder = new WarcRecorder(tempFile2)) {
@ -244,8 +246,6 @@ class CrawlerRetreiverTest {
Assertions.fail(ex); Assertions.fail(ex);
} }
new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out);
try (var reader = new WarcReader(tempFile2)) { try (var reader = new WarcReader(tempFile2)) {
WarcXResponseReference.register(reader); WarcXResponseReference.register(reader);
@ -270,7 +270,7 @@ class CrawlerRetreiverTest {
System.out.println(dr.domain + "/" + dr.crawlerStatus); System.out.println(dr.domain + "/" + dr.crawlerStatus);
} }
else if (doc instanceof CrawledDocument dc) { else if (doc instanceof CrawledDocument dc) {
System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus); System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
} }
} }
} catch (Exception e) { } catch (Exception e) {