(warc) Add fields for etags and last-modified headers to the new crawl data formats
Make some temporary modifications to the CrawledDocument model to support both a "big string" style headers field, as in the old formats, and explicit fields, as in the new formats. This is a bit awkward to deal with, but necessary until we migrate off the old formats entirely. The commit also adds a few tests for this logic.
parent 126ac3816f
commit 3a56a06c4f
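The shim prefers the explicit field when a record came from the new parquet format, and otherwise falls back to scanning the legacy header blob. Below is a minimal, self-contained sketch of that lookup order; the names mirror the diff further down, but this demo class is not part of the codebase:

// Illustrative only: a stripped-down model of the fallback added to CrawledDocument.
public class HeaderShimDemo {
    // Legacy json-era representation: all headers in one "big string".
    static String headers = "Content-Type: text/html\nETag: abc123";
    // New parquet-era representation: the header gets its own field/column.
    static String etagMaybe = null;

    static String getEtag() {
        if (etagMaybe != null) return etagMaybe; // new format wins when present
        if (headers == null) return null;        // old format: parse the blob
        for (String line : headers.split("\n")) {
            if (line.regionMatches(true, 0, "ETag:", 0, 5)) { // case-insensitive prefix match
                return line.substring(5).trim();
            }
        }
        return null;
    }

    public static void main(String[] args) {
        System.out.println(getEtag()); // prints "abc123", parsed from the legacy blob
    }
}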
@@ -118,7 +118,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
                 nextRecord.url,
                 null,
                 "",
-                nextRecord.cookies));
+                nextRecord.cookies,
+                nextRecord.lastModifiedHeader,
+                nextRecord.etagHeader));
     }

     public void close() throws IOException {
@@ -82,6 +82,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
             return;
         }

+        var httpHeaders = http.headers();
+
         var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));
         if (parsedBody instanceof DocumentBodyResult.Error<String> error) {
             next = new CrawledDocument(
@@ -98,7 +100,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
                     "",
                     "",
                     "",
-                    WarcXCookieInformationHeader.hasCookies(response)
+                    WarcXCookieInformationHeader.hasCookies(response),
+                    null,
+                    null
             );
         } else if (parsedBody instanceof DocumentBodyResult.Ok<String> ok) {
             next = new CrawledDocument(
@@ -115,7 +119,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa
                     "",
                     "",
                     "",
-                    WarcXCookieInformationHeader.hasCookies(response));
+                    WarcXCookieInformationHeader.hasCookies(response),
+                    httpHeaders.first("Last-Modified").orElse(""),
+                    httpHeaders.first("ETag").orElse(""));
         } else {
             // unreachable
             throw new IllegalStateException("Unknown body type: " + parsedBody);
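Note the asymmetry with the error branch above: here `first(...)` on the parsed WARC response headers returns an Optional, so a missing header is recorded as an empty string rather than a null. A small stand-alone illustration of that orElse pattern (the map and values are made up for the demo):

import java.util.Map;
import java.util.Optional;

public class OrElseDemo {
    public static void main(String[] args) {
        Map<String, String> headers = Map.of("Content-Type", "text/html");

        // Mirrors httpHeaders.first("ETag").orElse("") above: absent headers
        // collapse to "" instead of null in the WARC-derived stream.
        String etag = Optional.ofNullable(headers.get("ETag")).orElse("");
        System.out.println("etag=[" + etag + "]"); // prints: etag=[]
    }
}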
@@ -5,6 +5,8 @@ import lombok.Builder;
 import lombok.ToString;
 import nu.marginalia.bigstring.BigString;
 import nu.marginalia.model.EdgeUrl;
+import org.apache.commons.lang3.StringUtils;
+import org.jetbrains.annotations.Nullable;

 @Builder
 @AllArgsConstructor
@@ -21,7 +23,10 @@ public class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatus;
     public String crawlerStatusDesc;

+    @Nullable
+    @Deprecated // use getETag() or getLastModified() instead
     public String headers;
+
     public String documentBody;

     @Deprecated
@@ -38,6 +43,51 @@ public class CrawledDocument implements SerializableCrawlData {
      * information may come in CrawledDomain instead */
     public Boolean hasCookies = false;

+    public String lastModifiedMaybe;
+    public String etagMaybe;
+
+    @Nullable
+    private String getHeader(String header) {
+        if (headers == null) {
+            return null;
+        }
+
+        String headerString = header + ":";
+
+        String[] headersLines = StringUtils.split(headers, '\n');
+        for (String headerLine : headersLines) {
+            if (StringUtils.startsWithIgnoreCase(headerLine, headerString)) {
+                return headerLine.substring(headerString.length()).trim();
+            }
+        }
+
+        return null;
+    }
+
+    /** Returns the ETag header, or null if not present;
+     * <p>
+     * this is a compatibility shim between the old json format, which saves headers in a long string,
+     * and the new parquet format, which saves only the ETag and Last-Modified headers in separate columns
+     * */
+    public String getEtag() {
+        if (etagMaybe != null) {
+            return etagMaybe;
+        }
+        return getHeader("ETag");
+    }
+
+    /** Returns the Last-Modified header, or null if not present
+     * <p>
+     * this is a compatibility shim between the old json format, which saves headers in a long string,
+     * and the new parquet format, which saves only the ETag and Last-Modified headers in separate columns
+     * */
+    public String getLastModified() {
+        if (lastModifiedMaybe != null) {
+            return lastModifiedMaybe;
+        }
+        return getHeader("Last-Modified");
+    }
+
     public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
     @Override
     public String getSerialIdentifier() {
@@ -29,6 +29,9 @@ public class CrawledDocumentParquetRecord {
     public String contentType;
     public byte[] body;

+    public String etagHeader;
+    public String lastModifiedHeader;
+
     public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
         return new CrawledDocumentParquetRecordHydrator();
     }
@@ -46,7 +49,9 @@ public class CrawledDocumentParquetRecord {
             Types.required(INT32).named("httpStatus"),
             Types.required(INT64).named("epochSeconds"),
             Types.required(BINARY).as(stringType()).named("contentType"),
-            Types.required(BINARY).named("body")
+            Types.required(BINARY).named("body"),
+            Types.optional(BINARY).as(stringType()).named("etagHeader"),
+            Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
     );

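The new columns are declared optional rather than required, presumably so that parquet files written before this change, which lack the columns entirely, remain readable, with the corresponding fields simply staying null. A minimal sketch using the same parquet-mr Types API as the diff; the message name and column subset here are illustrative, not the actual schema:

import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Types;

public class SchemaDemo {
    public static void main(String[] args) {
        MessageType schema = new MessageType("demoDocument",
                // required: every record must carry a value
                Types.required(BINARY).as(stringType()).named("url"),
                // optional: older files without these columns still validate,
                // and absent values hydrate as null
                Types.optional(BINARY).as(stringType()).named("etagHeader"),
                Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"));
        System.out.println(schema);
    }
}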
@@ -60,6 +65,9 @@ public class CrawledDocumentParquetRecord {
             case "contentType" -> contentType = (String) value;
             case "body" -> body = (byte[]) value;
             case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
+            case "etagHeader" -> etagHeader = (String) value;
+            case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
+
             default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
         }
         return this;
@@ -74,6 +82,12 @@ public class CrawledDocumentParquetRecord {
         valueWriter.write("cookies", cookies);
         valueWriter.write("contentType", contentType);
         valueWriter.write("body", body);
+        if (etagHeader != null) {
+            valueWriter.write("etagHeader", etagHeader);
+        }
+        if (lastModifiedHeader != null) {
+            valueWriter.write("lastModifiedHeader", lastModifiedHeader);
+        }
     }
 }

@@ -131,11 +131,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return;
         }


+
         byte[] bodyBytes;
         String contentType;

         var body = DocumentBodyExtractor.asBytes(result);
+
+        var headers = fetchOk.headers();
+
         if (body instanceof DocumentBodyResult.Ok<byte[]> bodyOk) {
             bodyBytes = bodyOk.body();
             contentType = bodyOk.contentType().toString();
@@ -153,7 +157,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 fetchOk.statusCode(),
                 response.date(),
                 contentType,
-                bodyBytes)
+                bodyBytes,
+                headers.get("ETag"),
+                headers.get("Last-Modified"))
         );
     }

@@ -170,7 +176,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 0,
                 date,
                 "x-marginalia/advisory;state=redirect",
-                new byte[0]
+                new byte[0],
+                null,
+                null
         );
     }
     private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) {
@@ -181,7 +189,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 0,
                 date,
                 "x-marginalia/advisory;state=error",
-                errorStatus.getBytes()
+                errorStatus.getBytes(),
+                null,
+                null
         );
     }

@@ -193,7 +203,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 0,
                 date,
                 errorStatus,
-                new byte[0]
+                new byte[0],
+                null,
+                null
         );
     }

@@ -0,0 +1,101 @@
+package nu.marginalia.crawling.model;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class CrawledDocumentTest {
+
+    /** These tests are AI-generated, hence the somewhat inconsistent naming */
+
+    @Test
+    void getEtagShouldReturnEtagIfPresent() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe("12345")
+                .build();
+
+        // Etag is present, method should return it.
+        String etag = crawledDocument.getEtag();
+        assertEquals("12345", etag);
+    }
+
+    @Test
+    void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersAreNull() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe(null)
+                .headers(null)
+                .build();
+
+        // Etag and headers are absent, method should return null.
+        String etag = crawledDocument.getEtag();
+        assertNull(etag);
+    }
+
+    @Test
+    void getEtagShouldReturnNullIfEtagIsAbsentAndHeadersDoNotContainEtag() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe(null)
+                .headers("Some irrelevant headers")
+                .build();
+
+        // Headers do not contain an ETag, method should return null.
+        String etag = crawledDocument.getEtag();
+        assertNull(etag);
+    }
+
+    @Test
+    void getEtagShouldReturnEtagFromHeadersIfPresent() {
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .etagMaybe(null)
+                .headers("ETag: 67890")
+                .build();
+
+        // Headers contain an ETag, method should return it.
+        String etag = crawledDocument.getEtag();
+        assertEquals("67890", etag);
+    }
+
+    @Test
+    public void testGetLastModified_withLastModifiedDateInHeaders() {
+        // Arrange
+        String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .headers("Last-Modified: " + lastModifiedDate)
+                .build();
+
+        // Act
+        String actualLastModifiedDate = crawledDocument.getLastModified();
+
+        // Assert
+        assertEquals(lastModifiedDate, actualLastModifiedDate);
+    }
+
+    @Test
+    public void testGetLastModified_withoutLastModifiedDateInHeaders() {
+        // Arrange
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .headers("Some-Other-Header: Some value")
+                .build();
+
+        // Act
+        String actualLastModifiedDate = crawledDocument.getLastModified();
+
+        // Assert
+        assertNull(actualLastModifiedDate);
+    }
+
+    @Test
+    public void testGetLastModified_withLastModifiedDateInField() {
+        // Arrange
+        String lastModifiedDate = "Wed, 21 Oct 2015 07:28:00 GMT";
+        CrawledDocument crawledDocument = CrawledDocument.builder()
+                .lastModifiedMaybe(lastModifiedDate)
+                .build();
+
+        // Act
+        String actualLastModifiedDate = crawledDocument.getLastModified();
+
+        // Assert
+        assertEquals(lastModifiedDate, actualLastModifiedDate);
+    }
+}
@@ -38,7 +38,8 @@ class CrawledDocumentParquetRecordFileWriterTest {
                 200,
                 Instant.now(),
                 "text/html",
-                "hello world".getBytes());
+                "hello world".getBytes(),
+                null, null);

         try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) {
             writer.write(original);
@@ -51,7 +51,9 @@ public class SideloaderProcessing {
                 url,
                 "",
                 "SIDELOAD",
-                false
+                false,
+                null,
+                null
         );

         var ret = new ProcessedDocument();
@@ -116,7 +116,9 @@ public class ConvertingIntegrationTest {
                     "https://memex.marginalia.nu/" + file,
                     null,
                     "",
-                    false
+                    false,
+                    null,
+                    null
             );
             docs.add(doc);
         }
@@ -49,22 +49,11 @@ public record DocumentWithReference(
         if (null == doc)
             return ContentTags.empty();

-        String headers = doc.headers;
-        if (headers == null)
-            return ContentTags.empty();
-
-        String[] headersLines = headers.split("\n");
-
-        String lastmod = null;
-        String etag = null;
-
-        for (String line : headersLines) {
-            if (line.toLowerCase().startsWith("etag:")) {
-                etag = line.substring(5).trim();
-            }
-            if (line.toLowerCase().startsWith("last-modified:")) {
-                lastmod = line.substring(14).trim();
-            }
-        }
+        String lastmod = doc.getLastModified();
+        String etag = doc.getEtag();
+
+        if (lastmod == null && etag == null) {
+            return ContentTags.empty();
+        }

         return new ContentTags(etag, lastmod);
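With this change, getContentTags() delegates all header parsing to the new shim instead of re-scanning the blob itself. Downstream, the etag/last-modified pair in ContentTags presumably ends up as conditional request headers when a URL is revisited; a rough stand-alone sketch of that use, where the URL and values are illustrative and java.net.http stands in for whatever HTTP client the crawler actually uses:

import java.net.URI;
import java.net.http.HttpRequest;

public class ConditionalFetchDemo {
    public static void main(String[] args) {
        String etag = "\"abc123\"";                       // e.g. from doc.getEtag()
        String lastmod = "Wed, 21 Oct 2015 07:28:00 GMT"; // e.g. from doc.getLastModified()

        // A server that still holds the same representation can answer
        // 304 Not Modified, letting the crawler skip re-downloading the body.
        var builder = HttpRequest.newBuilder(URI.create("https://example.com/"));
        if (etag != null)    builder.header("If-None-Match", etag);
        if (lastmod != null) builder.header("If-Modified-Since", lastmod);

        System.out.println(builder.build().headers().map());
    }
}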
@@ -0,0 +1,86 @@
+package nu.marginalia.crawl.retreival.revisit;
+
+import nu.marginalia.crawl.retreival.CrawlDataReference;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
+import nu.marginalia.crawling.model.CrawledDocument;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class DocumentWithReferenceTest {
+
+    // test case for when doc is null
+    @Test
+    public void getContentTags_docIsNull() {
+        // set up test data
+        CrawledDocument doc = null;
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that the returned content tags are empty
+        assertTrue(contentTags.isEmpty());
+    }
+
+    // test case for when doc is not null, and lastModified and eTag are null
+    @Test
+    public void getContentTags_lastModifiedAndETagIsNull() {
+        // set up test data
+        CrawledDocument doc = CrawledDocument.builder().build(); // both lastModified and eTag are null
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that the returned content tags are empty
+        assertTrue(contentTags.isEmpty());
+    }
+
+    // test case for when doc is not null, and lastModified and eTag are not null
+    @Test
+    public void getContentTags_lastModifiedAndETagAreNotNull_NewCrawlData() {
+        // set up test data
+        CrawledDocument doc = CrawledDocument.builder()
+                .etagMaybe("12345")
+                .lastModifiedMaybe("67890")
+                .build(); // assume lastModified and eTag are not null
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that the returned content tags are present
+        assertFalse(contentTags.isEmpty());
+        assertEquals("12345", contentTags.etag());
+        assertEquals("67890", contentTags.lastMod());
+    }
+
+    @Test
+    public void getContentTags_lastModifiedAndETagAreNotNull_LegacyCrawlData() {
+        // set up test data
+        CrawledDocument doc = CrawledDocument.builder()
+                .headers("""
+                        Etag: 12345
+                        Last-Modified: 67890
+                        """)
+                .build(); // assume lastModified and eTag are not null
+        CrawlDataReference reference = new CrawlDataReference();
+
+        DocumentWithReference documentWithReference = new DocumentWithReference(doc, reference);
+
+        // execute method under test
+        ContentTags contentTags = documentWithReference.getContentTags();
+
+        // verify that the returned content tags are present
+        assertFalse(contentTags.isEmpty());
+        assertEquals("12345", contentTags.etag());
+        assertEquals("67890", contentTags.lastMod());
+    }
+}
@@ -234,6 +234,8 @@ class CrawlerRetreiverTest {
         }
         var stream = CrawledDomainReader.createDataStream(tempFile);

+        System.out.println("---");
+
         CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
         domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
         try (var recorder = new WarcRecorder(tempFile2)) {
@@ -244,8 +246,6 @@ class CrawlerRetreiverTest {
             Assertions.fail(ex);
         }

-        new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out);
-
         try (var reader = new WarcReader(tempFile2)) {
             WarcXResponseReference.register(reader);

@@ -270,7 +270,7 @@ class CrawlerRetreiverTest {
                     System.out.println(dr.domain + "/" + dr.crawlerStatus);
                 }
                 else if (doc instanceof CrawledDocument dc) {
-                    System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus);
+                    System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus + "/" + dc.timestamp);
                }
            }
        } catch (Exception e) {