From cf935a533110adda000a74064052e66fbc4558bc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 18:09:53 +0100 Subject: [PATCH] (converter) Read cookie information Add an optional new field to CrawledDocument containing information about whether the domain has cookies. This was previously on the CrawledDomain object, but since the WarcFormat requires us to write a WarcInfo object at the start of a crawl rather than at the end, this information is unobtainable when creating the CrawledDomain object. Also fix a bug in the deduplication logic in the DomainProcessor class that caused a test to break. --- .../io/format/ParquetSerializableCrawlDataStream.java | 3 ++- .../io/format/WarcSerializableCrawlDataStream.java | 8 +++++--- .../nu/marginalia/crawling/model/CrawledDocument.java | 4 ++++ .../nu/marginalia/crawling/model/CrawledDomain.java | 3 +++ .../converting/processor/DomainProcessor.java | 6 +++++- .../converting/sideload/SideloaderProcessing.java | 3 ++- .../converting/ConvertingIntegrationTest.java | 4 +++- .../CrawlingThenConvertingIntegrationTest.java | 10 +++++++++- 8 files changed, 33 insertions(+), 8 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 0b852e01..e31913fd 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -101,7 +101,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial "", nextRecord.url, null, - "")); + "", + nextRecord.cookies)); } public void close() throws IOException { diff --git 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java index 02aefb6d..2cdb7af1 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -69,7 +69,6 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa redirectDomain = statusReason; } - // TODO: Fix cookies info somehow next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, new ArrayList<>(), new ArrayList<>() @@ -98,7 +97,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa response.payloadDigest().map(WarcDigest::base64).orElse(""), "", "", - ""); + "", + WarcXCookieInformationHeader.hasCookies(response) + ); } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { next = new CrawledDocument( "", @@ -113,7 +114,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa response.payloadDigest().map(WarcDigest::base64).orElse(""), "", "", - ""); + "", + WarcXCookieInformationHeader.hasCookies(response)); } else { // unreachable throw new IllegalStateException("Unknown body type: " + parsedBody); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 143c775b..7d85bdfd 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -30,6 +30,10 @@ public class CrawledDocument implements 
SerializableCrawlData { public String recrawlState; + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDomain instead */ + public Boolean hasCookies = false; + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; @Override public String getSerialIdentifier() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java index 55ec27a6..3add3b8d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData { public String ip; public List<CrawledDocument> doc; + + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDocument instead */ public List<String> cookies; public int size() { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 2f0fc690..f86b6bfe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -97,11 +97,15 @@ public class DomainProcessor { } else if (data instanceof CrawledDocument doc) { try { - if (doc.url == null || processedUrls.add(doc.url)) + if (doc.url == null || !processedUrls.add(doc.url)) continue; fixBadCanonicalTag(doc); + if (Boolean.TRUE.equals(doc.hasCookies)) { + cookies = true; + } + // This case should never be reachable, as we should have initiated // the externalDomainLinks variable above if we made it past the // doc.url == null 
check; but we'll leave it here just in case diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 65f0bd41..16a1ae7c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -50,7 +50,8 @@ public class SideloaderProcessing { Integer.toHexString(url.hashCode()), url, "", - "SIDELOAD" + "SIDELOAD", + false ); var ret = new ProcessedDocument(); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index ce0d8f4a..eaa9d813 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -65,6 +65,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); + assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -114,7 +115,8 @@ public class ConvertingIntegrationTest { Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, null, - "" + "", + false ); docs.add(doc); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 844062bb..51667b3a 100644 --- 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -11,10 +11,13 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; @@ -31,6 +34,7 @@ public class CrawlingThenConvertingIntegrationTest { private HttpFetcher httpFetcher; private Path fileName; + private Path fileName2; @SneakyThrows @BeforeAll @@ -49,11 +53,13 @@ public class CrawlingThenConvertingIntegrationTest { domainProcessor = injector.getInstance(DomainProcessor.class); httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); + this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz"); } @AfterEach public void tearDown() throws IOException { Files.deleteIfExists(fileName); + Files.deleteIfExists(fileName2); } @Test @@ -90,7 +96,9 @@ public class CrawlingThenConvertingIntegrationTest { new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } - try (var reader = new WarcSerializableCrawlDataStream(fileName)) { + 
CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2); + + try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) { while (reader.hasNext()) { data.add(reader.next()); }