From fabffa80f0522c9c4fee7111c63ebdba8723fcc6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 7 Dec 2023 15:26:01 +0100 Subject: [PATCH] (warc) Integrate the crawler's content type parsing and charset logic into the WarcSideloader --- .../processes/converting-process/build.gradle | 1 + .../sideload/warc/WarcSideloader.java | 41 ++++++++++++++----- .../sideload/warc/WarcSideloaderTest.java | 24 +++++++++-- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 58b0ecdd..4a3f2290 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -58,6 +58,7 @@ dependencies { implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:process-models:crawl-spec') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java index 73d29a30..2d8c1bda 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -2,6 +2,8 @@ package nu.marginalia.converting.sideload.warc; import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentTypeParser; +import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; @@ -11,31 +13,32 @@ import 
nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.Optional; -import java.util.stream.StreamSupport; public class WarcSideloader implements SideloadSource, AutoCloseable { - private final Path warcFile; + private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class); + private final SideloaderProcessing sideloaderProcessing; private final WarcReader reader; private final EdgeDomain domain; + public WarcSideloader(Path warcFile, SideloaderProcessing sideloaderProcessing) throws IOException { - this.warcFile = warcFile; this.sideloaderProcessing = sideloaderProcessing; this.reader = new WarcReader(warcFile); this.domain = sniffDomainFromWarc() @@ -82,6 +85,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { .map(WarcResponse.class::cast) .filter(this::isRelevantResponse) .map(this::process) + .filter(Optional::isPresent) + .map(Optional::get) .iterator(); } @@ -109,8 +114,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { } @SneakyThrows - private ProcessedDocument process(WarcResponse response) { - String body = getBody(response); + private Optional<ProcessedDocument> process(WarcResponse response) { + Optional<String> body = getBody(response); String url = response.target(); // We trim "/index.html"-suffixes from the index if they are present, @@ -119,18 +124,32 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { url = url.substring(0, url.length() - "index.html".length()); } - return sideloaderProcessing - .processDocument(url, body, List.of(), new DomainLinks(), + if (body.isEmpty()) { + return Optional.empty(); + } +
+ return Optional.of(sideloaderProcessing + .processDocument(url, body.get(), List.of(), new DomainLinks(), GeneratorType.DOCS, - 10_000); + 10_000)); } @SneakyThrows - private String getBody(WarcResponse response) { + private Optional<String> getBody(WarcResponse response) { var http = response.http(); // TODO: We should support additional encodings here - return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8); + try (var body = http.body()) { + String contentType = http.headers().first("Content-Type").orElse(null); + byte[] bytes = body.stream().readAllBytes(); + + var ct = ContentTypeParser.parseContentType(contentType, bytes); + return Optional.of(DocumentBodyToString.getStringData(ct, bytes)); + } + catch (Exception ex) { + logger.info("Failed to parse body", ex); + } + return Optional.empty(); } @Override diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java index 4e9fb406..da94e3a8 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java @@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.warc; import com.google.inject.AbstractModule; import com.google.inject.Guice; import nu.marginalia.converting.ConverterModule; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.sideload.SideloaderProcessing; import org.junit.jupiter.api.AfterEach; @@ -16,7 +18,11 @@ import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List;
+import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.when; class WarcSideloaderTest extends AbstractModule { @@ -53,13 +59,23 @@ class WarcSideloaderTest extends AbstractModule { throw new RuntimeException(e); } - try (var sideloader = new WarcSideloader(warcFile, processing)) { + ProcessedDomain domain; + List<ProcessedDocument> docs = new ArrayList<>(); - var domain = sideloader.getDomain(); - System.out.println(domain); - sideloader.getDocumentsStream().forEachRemaining(System.out::println); + try (var sideloader = new WarcSideloader(warcFile, processing)) { + domain = sideloader.getDomain(); + sideloader.getDocumentsStream().forEachRemaining(docs::add); } catch (Exception e) { throw new RuntimeException(e); } + + assertNotNull(domain); + assertEquals(3, docs.size()); + List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList(); + assertEquals(List.of( + "https://www.marginalia.nu/", + "https://www.marginalia.nu/log/93_atags/", + "https://www.marginalia.nu/links/"), + fetchedUrls); } } \ No newline at end of file