diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java index 833ad3f0..17102c06 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java @@ -4,5 +4,6 @@ public enum ConvertAction { ConvertCrawlData, SideloadEncyclopedia, SideloadDirtree, + SideloadWarc, SideloadStackexchange } diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java index fffed79b..cf445e5a 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java @@ -38,6 +38,13 @@ public class ConvertRequest { destId, null); } + public static ConvertRequest forWarc(Path sourcePath, FileStorageId destId) { + return new ConvertRequest(ConvertAction.SideloadWarc, + sourcePath.toString(), + null, + destId, + null); + } public static ConvertRequest forStackexchange(Path sourcePath, FileStorageId destId) { return new ConvertRequest(ConvertAction.SideloadStackexchange, diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 9def0480..c09ed550 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -224,12 +224,19 @@ public class EdgeUrl implements Serializable { } public URL asURL() throws MalformedURLException { - int port = this.port != null ? 
this.port : switch(proto) { - case "http" -> 80; - case "https" -> 443; - default -> 0; - }; + try { + return asURI().toURL(); + } + catch (URISyntaxException e) { + throw new MalformedURLException(e.getMessage()); + } + } - return new URL(this.proto, this.domain.toString(), port, this.path); + public URI asURI() throws URISyntaxException { + if (port != null) { + return new URI(this.proto, null, this.domain.toString(), this.port, this.path, this.param, null); + } + + return new URI(this.proto, this.domain.toString(), this.path, this.param, null); } } diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle new file mode 100644 index 00000000..73a155cb --- /dev/null +++ b/code/features-crawl/content-type/build.gradle @@ -0,0 +1,29 @@ +plugins { + id 'java' + + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:model') + implementation libs.crawlercommons + implementation libs.notnull + + implementation libs.bundles.gson + implementation libs.bundles.slf4j + testImplementation libs.bundles.slf4j.test + + implementation libs.jsoup + implementation libs.commons.lang3 + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java new file mode 100644 index 00000000..095497c8 --- /dev/null +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java @@ -0,0 +1,28 @@ +package nu.marginalia.contenttype; + +import org.apache.commons.lang3.StringUtils; + +/** Content type and charset of a document + * @param contentType The content type, e.g. "text/html" + * @param charset The charset, e.g. "UTF-8" + */ +public record ContentType(String contentType, String charset) { + public static ContentType parse(String contentTypeHeader) { + String[] parts = StringUtils.split(contentTypeHeader, ";", 2); + String contentType = parts[0].trim(); + String charset = parts.length > 1 ? 
parts[1].trim() : "UTF-8"; + + return new ContentType(contentType, charset); + } + + public boolean is(String contentType) { + return this.contentType.equalsIgnoreCase(contentType); + } + + public String toString() { + if (charset == null || charset.isBlank()) + return contentType; + + return STR."\{contentType}; charset=\{charset}"; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java similarity index 60% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java rename to code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java index 604264e3..5b794246 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java @@ -1,7 +1,8 @@ -package nu.marginalia.crawl.retreival.logic; +package nu.marginalia.contenttype; import crawlercommons.mimetypes.MimeTypeDetector; -import nu.marginalia.crawling.model.ContentType; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.jsoup.Jsoup; import java.util.Arrays; @@ -11,28 +12,40 @@ public class ContentTypeParser { static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector(); - public static ContentType parse(String contentType, byte[] data) { - return getContentTypeFromContentTypeString(contentType) - .or(() -> getContentTypeStringFromTag(data)) + /** Parse the content type and charset from a content type header and/or the body of a document, + * best effort + */ + public static ContentType parseContentType( + @Nullable String contentTypeHeader, + @NotNull byte[] body) + { + return getContentTypeFromContentTypeString(contentTypeHeader) + .or(() -> getContentTypeStringFromTag(body)) .orElseGet(() -> { - Optional charset = getCharsetFromTag(data); + Optional charset = getCharsetFromTag(body); return new ContentType( - Optional.ofNullable(contentType) - .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data))) - .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1")); + Optional.ofNullable(contentTypeHeader) + .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body))) + .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1")); }); } - private static Optional getContentTypeFromContentTypeString(String contentType) { - if (contentType != null && contentType.contains(";")) { - var parts = contentType.split(";"); - var content = parts[0].trim(); - var extra = parts[1].trim(); - if (extra.startsWith("charset=")) { - return Optional.of(new ContentType(content, extra.substring("charset=".length()))); - } - } - return Optional.empty(); + /** Parse the charset from a content type string. 
+     */
+    private static Optional<ContentType> getContentTypeFromContentTypeString(@Nullable String contentType) {
+        if (contentType == null)
+            return Optional.empty();
+
+        if (!contentType.contains(";"))
+            return Optional.empty();
+
+        var parts = contentType.split(";");
+        var content = parts[0].trim();
+        var extra = parts[1].trim();
+
+        if (!extra.startsWith("charset="))
+            return Optional.empty();
+
+        return Optional.of(new ContentType(content, extra.substring("charset=".length())));
     }
 
     private static String shittyMimeSniffer(byte[] data) {
@@ -45,6 +58,7 @@ public class ContentTypeParser {
         String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase();
         if (startStr.contains("<html")) {
diff --git a/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java
new file mode 100644
--- /dev/null
+++ b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class ContentTypeParserTest {
+
+    @Test
+    public void testParseContentTypeWithHeader() {
+        byte[] body = "<html><head><title>Title</title></head><body></body></html>".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/html; charset=UTF-8";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithMetaCharset() {
+        byte[] body = "<html><head><meta charset=\"UTF-8\"></head><body>Title</body></html>".getBytes(StandardCharsets.UTF_8);
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("text/html", result.contentType());
+        assertEquals("UTF-8", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithHeaderValueAbsent() {
+        byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8);
+        String contentTypeHeader = "text/plain";
+        ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body);
+        assertNotNull(result);
+        assertEquals("text/plain", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+
+    @Test
+    public void testParseContentTypeWithBinaryData() {
+        byte[] body = new byte[128];
+        body[0] = 31; // ascii value less than 32
+        ContentType result = ContentTypeParser.parseContentType(null, body);
+        assertNotNull(result);
+        assertEquals("application/binary", result.contentType());
+        assertEquals("ISO_8859_1", result.charset());
+    }
+}
\ No newline at end of file
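How the new content-type pieces are meant to compose, as an illustrative sketch only (not part of the patch; the header value and HTML literal below are invented, and imports and the enclosing class are omitted):

    byte[] body = "<html><head><meta charset=\"ISO-8859-1\"></head><body>Hello</body></html>"
            .getBytes(StandardCharsets.ISO_8859_1);

    // Detect type and charset: an explicit Content-Type header wins; otherwise the parser
    // falls back to an embedded <meta> tag, and finally to MIME sniffing of the body bytes.
    ContentType contentType = ContentTypeParser.parseContentType("text/html; charset=ISO-8859-1", body);

    // Decode the raw bytes with the detected charset; DocumentBodyToString falls back to UTF-8
    // when the charset name is illegal or unsupported (see DocumentBodyToStringTest below).
    String text = DocumentBodyToString.getStringData(contentType, body);

    assert contentType.is("text/html");
    assert "ISO-8859-1".equalsIgnoreCase(contentType.charset());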
diff --git a/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java
new file mode 100644
index 00000000..f7cf120d
--- /dev/null
+++ b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java
@@ -0,0 +1,48 @@
+package nu.marginalia.contenttype;
+
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.*;
+
+import java.nio.charset.StandardCharsets;
+
+public class DocumentBodyToStringTest {
+    @Test
+    public void testGetStringData_onUTF8(){
+
+        ContentType type = new ContentType("text/html", "UTF-8");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string");
+    }
+
+    @Test
+    public void testGetStringData_onIllegalCharsetName(){
+
+        ContentType type = new ContentType("text/html", "unsupportedname");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should match the expected string if charset is illegal name");
+    }
+
+    @Test
+    public void testGetStringData_onUnsupportedCharset(){
+
+        ContentType type = new ContentType("text/html", "Macintosh");
+
+        String expected = "Hello, World!";
+        byte[] data = expected.getBytes(StandardCharsets.UTF_8);
+
+        String result = DocumentBodyToString.getStringData(type, data);
+
+        assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported");
+    }
+
+}
\ No newline at end of file
diff --git a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java
index 13b982f5..67dd6366 100644
--- a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java
+++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java
@@ -37,7 +37,9 @@ public class GeoIpDictionary {
                 throw new RuntimeException(e);
             }
             finally {
-                this.notifyAll();
+                synchronized (this) {
+                    this.notifyAll();
+                }
             }
         });
     }
diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle
index ebbea855..ab4e8a8a 100644
--- a/code/process-models/crawling-model/build.gradle
+++ b/code/process-models/crawling-model/build.gradle
@@ -15,18 +15,28 @@ java {
 dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
+    implementation project(':code:common:config')
     implementation project(':code:common:process')
     implementation project(':code:libraries:big-string')
     implementation project(':code:api:index-api')
     implementation project(':code:common:service-discovery')
     implementation project(':code:common:service-client')
+    implementation project(':code:features-crawl:content-type')
     implementation project(':code:libraries:language-processing')
+    implementation project(':third-party:parquet-floor')
+    implementation project(':third-party:commons-codec')
 
     implementation libs.bundles.slf4j
     implementation libs.notnull
+    implementation libs.bundles.parquet
+    implementation libs.jwarc
 
     implementation libs.gson
+    implementation libs.commons.io
+    implementation libs.commons.lang3
+    implementation libs.okhttp3
+    implementation libs.jsoup
     implementation libs.snakeyaml
     implementation libs.zstd
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java
similarity index 88%
rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java
rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java
index c5860913..d884dbe5 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java
@@ -1,5 +1,6 @@
-package nu.marginalia.crawl.retreival.logic;
+package nu.marginalia.crawling.body;
 
+import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeUrl;
 
 import java.util.List;
@@ -37,6 +38,9 @@ public class ContentTypeLogic {
         return probableBinaryPattern.test(pathLowerCase);
     }
 
+    public boolean isAllowableContentType(ContentType contentType) {
+        return isAllowableContentType(contentType.contentType());
+    }
     public boolean isAllowableContentType(String contentType) {
         if (allowAllContentTypes)
             return true;
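The ContentType overload added above lets callers hand the parsed record straight to the allow-list check instead of re-extracting the bare type string. A rough sketch, illustrative only (bodyBytes is assumed to hold a fetched document body):

    ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
    ContentType contentType = ContentTypeParser.parseContentType("text/html; charset=UTF-8", bodyBytes);

    if (!contentTypeLogic.isAllowableContentType(contentType)) {
        // reject the document before spending time decoding it
    }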
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
new file mode 100644
index 00000000..019aa761
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
@@ -0,0 +1,76 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.ContentTypeParser;
+import nu.marginalia.contenttype.DocumentBodyToString;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import org.apache.commons.io.input.BOMInputStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.zip.GZIPInputStream;
+
+public class DocumentBodyExtractor {
+    private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
+
+    private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class);
+
+    /** Extract the body from a fetch result as a byte array. */
+    public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult result) {
+        if (result instanceof HttpFetchResult.ResultOk fetchOk) {
+            return asBytes(fetchOk);
+        }
+        else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) {
+            return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes());
+        }
+
+        return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok");
+    }
+
+    /** Extract the body from a fetch result as a string.  This function performs
+     *  content-type checks to ensure that the content-type is such that this operation
+     *  makes sense.
+     *
+     * @see ContentTypeLogic#isAllowableContentType(String)
+     * */
+    public static DocumentBodyResult<String> asString(HttpFetchResult result) {
+        return asBytes(result).flatMap(DocumentBodyExtractor::toStringResult);
+    }
+
+    private static DocumentBodyResult<String> toStringResult(ContentType contentType, byte[] bytes) {
+        if (contentTypeLogic.isAllowableContentType(contentType)) {
+            try {
+                return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes));
+            }
+            catch (Exception ex) {
+                return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+            }
+        }
+        else {
+            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, "");
+        }
+    }
+
+    /** Extract the body from a fetch result as a byte array.
+     */
+    public static DocumentBodyResult<byte[]> asBytes(HttpFetchResult.ResultOk rsp) {
+        try {
+            var byteStream = rsp.getInputStream();
+
+            if ("gzip".equals(rsp.header("Content-Encoding"))) {
+                byteStream = new GZIPInputStream(byteStream);
+            }
+            byteStream = new BOMInputStream(byteStream);
+
+            var contentTypeHeader = rsp.header("Content-Type");
+
+            byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder
+            var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data);
+
+            return new DocumentBodyResult.Ok<>(contentType, data);
+        } catch (Exception ex) {
+            logger.error("Failed to extract body", ex);
+            return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "");
+        }
+    }
+
+}
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java
new file mode 100644
index 00000000..04e3fedb
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java
@@ -0,0 +1,58 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+
+import java.util.Optional;
+import java.util.function.BiFunction;
+
+public sealed interface DocumentBodyResult<T> {
+    record Ok<T>(ContentType contentType, T body) implements DocumentBodyResult<T> {
+
+        @Override
+        public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
+            return Optional.of(mapper.apply(contentType, body));
+        }
+        @Override
+        public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) {
+            return mapper.apply(contentType, body);
+        }
+
+        @Override
+        public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
+            return mapper.apply(contentType, body);
+        }
+
+        @Override
+        public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
+            consumer.accept(contentType, body);
+        }
+    }
+    record Error<T>(CrawlerDocumentStatus status, String why) implements DocumentBodyResult<T> {
+        @Override
+        public <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper) {
+            return Optional.empty();
+        }
+        public <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper) { return Optional.empty(); }
+
+        @Override
+        @SuppressWarnings("unchecked")
+        public <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper) {
+            return (DocumentBodyResult<T2>) this;
+        }
+
+        @Override
+        public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
+        }
+    }
+
+    <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
+    <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
+    <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
+
+    void ifPresent(ExConsumer<T, Exception> consumer) throws Exception;
+
+    interface ExConsumer<T, E extends Exception> {
+        void accept(ContentType contentType, T t) throws E;
+    }
+}
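A DocumentBodyResult is meant to be consumed either by pattern matching over the sealed hierarchy or through the Optional-returning helpers. A hedged sketch, assuming a jwarc WarcResponse named response, jsoup's Jsoup/Document imported, and the HttpFetchResult type introduced just below:

    DocumentBodyResult<String> result = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response));

    switch (result) {
        case DocumentBodyResult.Ok<String> ok -> System.out.println(ok.contentType() + ": " + ok.body().length() + " chars");
        case DocumentBodyResult.Error<String> error -> System.err.println(error.status() + ": " + error.why());
    }

    // Equivalent, using the functional style that ResultOk.parseDocument() also uses:
    Optional<Document> doc = result.flatMapOpt((contentType, body) ->
            contentType.is("text/html") ? Optional.of(Jsoup.parse(body)) : Optional.empty());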
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java
new file mode 100644
index 00000000..f0db28e8
--- /dev/null
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java
@@ -0,0 +1,160 @@
+package nu.marginalia.crawling.body;
+
+import nu.marginalia.contenttype.ContentType;
+import okhttp3.Headers;
+import org.jsoup.Jsoup;
+import org.netpreserve.jwarc.MessageHeaders;
+import org.netpreserve.jwarc.WarcResponse;
+import org.jsoup.nodes.Document;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.InetAddress;
+import java.net.URI;
+import java.util.Optional;
+
+/* FIXME: This interface has a very unfortunate name that is not very descriptive.
+ */
+public sealed interface HttpFetchResult {
+
+    boolean isOk();
+
+    /** Convert a WarcResponse to a HttpFetchResult */
+    static HttpFetchResult importWarc(WarcResponse response) {
+        try {
+            var http = response.http();
+
+            try (var body = http.body()) {
+                byte[] bytes = body.stream().readAllBytes();
+
+                String ipAddress = response
+                        .ipAddress()
+                        .map(InetAddress::getHostAddress)
+                        .orElse("");
+
+                return new ResultOk(
+                        response.targetURI(),
+                        http.status(),
+                        http.headers(),
+                        ipAddress,
+                        bytes,
+                        0,
+                        bytes.length
+                );
+            }
+        }
+        catch (Exception ex) {
+            return new ResultException(ex);
+        }
+    }
+
+
+    /** Corresponds to a successful retrieval of a document
+     * from the remote server.  Note that byte[] is only borrowed
+     * and subsequent calls may overwrite the contents of this buffer.
+     */
+    record ResultOk(URI uri,
+                    int statusCode,
+                    Headers headers,
+                    String ipAddress,
+                    byte[] bytesRaw,
+                    int bytesStart,
+                    int bytesLength
+    ) implements HttpFetchResult {
+
+        public boolean isOk() {
+            return statusCode >= 200 && statusCode < 300;
+        }
+
+        public ResultOk(URI uri,
+                        int statusCode,
+                        MessageHeaders headers,
+                        String ipAddress,
+                        byte[] bytesRaw,
+                        int bytesStart,
+                        int bytesLength) {
+            this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
+        }
+
+        private static Headers convertHeaders(MessageHeaders headers) {
+            var ret = new Headers.Builder();
+            for (var header : headers.map().entrySet()) {
+                for (var value : header.getValue()) {
+                    ret.add(header.getKey(), value);
+                }
+            }
+            return ret.build();
+        }
+
+        public InputStream getInputStream() {
+            return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
+        }
+
+        public Optional<Document> parseDocument() throws IOException {
+            return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
+                if (contentType.is("text/html")) {
+                    return Optional.of(Jsoup.parse(body));
+                }
+                else {
+                    return Optional.empty();
+                }
+            });
+        }
+
+        public String header(String name) {
+            return headers.get(name);
+        }
+
+    };
+
+    /** This is a special case where the document was not fetched
+     * because it was already in the database.  In this case, we
+     * replace the original data.
+     *
+     * @see Result304Raw for the case where the document has not yet been replaced with the reference data.
+     */
+    record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult {
+
+        public boolean isOk() {
+            return true;
+        }
+
+        public Optional<Document> parseDocument() {
+            try {
+                return Optional.of(Jsoup.parse(body));
+            }
+            catch (Exception ex) {
+                return Optional.empty();
+            }
+        }
+    };
+
+    /** Fetching resulted in an exception */
+    record ResultException(Exception ex) implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+
+    /** Fetching resulted in a HTTP 304, the remote content is identical to
+     * our reference copy.  This will be replaced with a Result304ReplacedWithReference
+     * at a later stage.
+     *
+     * @see Result304ReplacedWithReference
+     */
+    record Result304Raw() implements HttpFetchResult {
+        public boolean isOk() {
+            return false;
+        }
+    };
+
+    /** No result.  This is typically injected at a later stage
+     * of processing, e.g. after filtering out irrelevant responses.
+ */ + record ResultNone() implements HttpFetchResult { + public boolean isOk() { + return false; + } + }; +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index b7021ace..eb7ffd75 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,156 +1,52 @@ package nu.marginalia.crawling.io; -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.model.gson.GsonFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.*; +import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.TimeUnit; public class CrawledDomainReader { - private final Gson gson = GsonFactory.get(); - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final ForkJoinPool pool = new ForkJoinPool(6); + private static final Gson gson = GsonFactory.get(); public CrawledDomainReader() { } /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ - public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { - return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + String fileName = fullPath.getFileName().toString(); + if (fileName.endsWith(".zstd")) { + return new LegacySerializableCrawlDataStream(gson, fullPath.toFile()); + } + else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) { + return new WarcSerializableCrawlDataStream(fullPath); + } + else if (fileName.endsWith(".parquet")) { + return new ParquetSerializableCrawlDataStream(fullPath); + } + else { + throw new IllegalArgumentException("Unknown file type: " + fullPath); + } } /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */ - public SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { - return createDataStream(CrawlerOutputFile.getOutputFile(basePath, id, domain)); - } + public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { + Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); + Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); - /** Read the entirety of the domain data into memory. 
This uses a lot of RAM */ - public CrawledDomain read(Path path) throws IOException { - DomainDataAssembler domainData = new DomainDataAssembler(); - - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) { - String line; - while ((line = br.readLine()) != null) { - if (line.startsWith("//")) { - String identifier = line; - String data = br.readLine(); - - pool.execute(() -> deserializeLine(identifier, data, domainData)); - } - } + if (Files.exists(parquetPath)) { + return createDataStream(parquetPath); } - - while (!pool.awaitQuiescence(1, TimeUnit.SECONDS)); - - return domainData.assemble(); - } - - - private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) { - if (null == data) { - return; + if (Files.exists(warcPath)) { + return createDataStream(warcPath); } - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class)); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class)); + else { + return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); } } - public Optional readOptionally(Path path) { - try { - return Optional.of(read(path)); - } - catch (Exception ex) { - return Optional.empty(); - } - } - - private static class DomainDataAssembler { - private CrawledDomain domainPrototype; - private final List docs = new ArrayList<>(); - - public synchronized void acceptDomain(CrawledDomain domain) { - this.domainPrototype = domain; - } - - public synchronized void acceptDoc(CrawledDocument doc) { - docs.add(doc); - } - - public synchronized CrawledDomain assemble() { - if (!docs.isEmpty()) { - if (domainPrototype.doc == null) - domainPrototype.doc = new ArrayList<>(); - - domainPrototype.doc.addAll(docs); - } - return domainPrototype; - } - } - - private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - private SerializableCrawlData next = null; - - public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - } - - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - var ret = next; - next = null; - return ret; - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (next != null) - return true; - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDomain.class); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } - else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } - } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 0e278f09..f21715ee 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -55,7 +55,7 @@ public class CrawledDomainWriter implements AutoCloseable { } private Path getOutputFile(String id, String name) throws IOException { - return CrawlerOutputFile.createOutputPath(outputDir, id, name); + return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name); } @Override diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index a7661085..ad6b4358 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -9,20 +9,20 @@ import java.nio.file.Path; public class CrawlerOutputFile { /** Return the Path to a file for the given id and name */ - public static Path getOutputFile(Path base, String id, String name) { + public static Path getLegacyOutputFile(Path base, String id, String name) { + id = padId(id); + String first = id.substring(0, 2); String second = id.substring(2, 4); Path destDir = base.resolve(first).resolve(second); - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); } /** Return the Path to a file for the given id and name, creating the prerequisite * directory structure as necessary. 
*/ - public static Path createOutputPath(Path base, String id, String name) throws IOException { - if (id.length() < 4) { - id = Strings.repeat("0", 4 - id.length()) + id; - } + public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException { + id = padId(id); String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -31,7 +31,7 @@ public class CrawlerOutputFile { if (!Files.exists(destDir)) { Files.createDirectories(destDir); } - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); } @@ -49,4 +49,71 @@ public class CrawlerOutputFile { } + public static Path createWarcPath(Path basePath, String id, String domain, WarcFileVersion version) throws IOException { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}-\{version.suffix}.warc.gz"); + } + + public static Path createParquetPath(Path basePath, String id, String domain) throws IOException { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } + public static Path getParquetPath(Path basePath, String id, String domain) { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } + public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.warc\{version.suffix}"); + } + + /** + * Pads the given ID with leading zeros to ensure it has a length of 4 characters. + */ + private static String padId(String id) { + if (id.length() < 4) { + id = Strings.repeat("0", 4 - id.length()) + id; + } + + return id; + } + + + public enum WarcFileVersion { + LIVE("open"), + TEMP("tmp"), + FINAL("final"); + + public final String suffix; + + WarcFileVersion(String suffix) { + this.suffix = suffix; + } + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java index 3aecc0fc..9598d002 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -1,11 +1,13 @@ package nu.marginalia.crawling.io; import nu.marginalia.crawling.model.SerializableCrawlData; +import org.jetbrains.annotations.Nullable; import java.io.IOException; +import java.nio.file.Path; import java.util.Iterator; -/** Closable iterator over serialized crawl data +/** Closable iterator exceptional over serialized crawl data * The data may appear in any order, and the iterator must be closed. 
* * @see CrawledDomainReader @@ -17,6 +19,8 @@ public interface SerializableCrawlDataStream extends AutoCloseable { boolean hasNext() throws IOException; + @Nullable + default Path path() { return null; } // Dummy iterator over nothing static SerializableCrawlDataStream empty() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java new file mode 100644 index 00000000..bfd52b78 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java @@ -0,0 +1,73 @@ +package nu.marginalia.crawling.io.format; + +import com.github.luben.zstd.RecyclingBufferPool; +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; + +import java.io.*; +import java.nio.file.Path; + +/** This class is used to read the old format of crawl data, which was zstd-compressed JSON + * with type delimiters between records. + */ +public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private final Gson gson; + private final BufferedReader bufferedReader; + private SerializableCrawlData next = null; + + private final Path path; + public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { + this.gson = gson; + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); + path = file.toPath(); + } + + @Override + public Path path() { + return path; + } + @Override + public SerializableCrawlData next() throws IOException { + if (hasNext()) { + var ret = next; + next = null; + return ret; + } + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + if (next != null) + return true; + + String identifier = bufferedReader.readLine(); + if (identifier == null) { + bufferedReader.close(); + return false; + } + String data = bufferedReader.readLine(); + if (data == null) { + bufferedReader.close(); + return false; + } + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDomain.class); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public void close() throws Exception { + bufferedReader.close(); + } +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java new file mode 100644 index 00000000..d3e54a07 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -0,0 +1,135 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; +import 
nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.*; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.*; + +public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); + + private final MurmurHash3_128 hash = new MurmurHash3_128(); + private final Iterator backingIterator; + private final Deque nextQ = new ArrayDeque<>(); + private boolean wroteDomainRecord = false; + private final Path path; + + public ParquetSerializableCrawlDataStream(Path file) throws IOException { + path = file; + + backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && nextQ.isEmpty()) { + var nextRecord = backingIterator.next(); + if (!wroteDomainRecord) { + createDomainRecord(nextRecord); + wroteDomainRecord = true; + } + createDocumentRecord(nextRecord); + } + return !nextQ.isEmpty(); + } + + private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) throws URISyntaxException { + + CrawlerDomainStatus status = CrawlerDomainStatus.OK; + String statusReason = ""; + + String redirectDomain = null; + if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) { + EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); + redirectDomain = crawledUrl.getDomain().toString(); + status = CrawlerDomainStatus.REDIRECT; + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) { + status = CrawlerDomainStatus.BLOCKED; + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) { + status = CrawlerDomainStatus.ERROR; + statusReason = new String(parquetRecord.body); + } + + nextQ.add(new CrawledDomain( + parquetRecord.domain, + redirectDomain, + status.toString(), + statusReason, + parquetRecord.ip, + new ArrayList<>(), + new ArrayList<>() + )); + } + + private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { + String bodyString = ""; + CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; + + if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) { + status = CrawlerDocumentStatus.BAD_CONTENT_TYPE; + } + else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) { + status = CrawlerDocumentStatus.ROBOTS_TXT; + } + else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want + return; + } + else { + try { + bodyString = DocumentBodyToString.getStringData( + ContentType.parse(nextRecord.contentType), + nextRecord.body); + } catch (Exception ex) { + logger.error("Failed to convert body to string", ex); + status = CrawlerDocumentStatus.BAD_CHARSET; + } + } + + nextQ.add(new CrawledDocument("", + nextRecord.url, + nextRecord.contentType, + nextRecord.timestamp.toString(), + nextRecord.httpStatus, + status.toString(), + "", + "", + bodyString, + 
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? + nextRecord.url, + null, + "", + nextRecord.cookies)); + } + + public void close() throws IOException { + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + + return nextQ.poll(); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java new file mode 100644 index 00000000..2cdb7af1 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -0,0 +1,151 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.*; + +public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(WarcSerializableCrawlDataStream.class); + + private final WarcReader reader; + private final Iterator backingIterator; + private SerializableCrawlData next = null; + private final Path path; + + public WarcSerializableCrawlDataStream(Path file) throws IOException { + path = file; + reader = new WarcReader(file); + WarcXResponseReference.register(reader); + WarcXEntityRefused.register(reader); + + backingIterator = reader.iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && next == null) { + var nextRecord = backingIterator.next(); + if (nextRecord instanceof WarcResponse response) { // this also includes WarcXResponseReference + convertResponse(response); + } + else if (nextRecord instanceof Warcinfo warcinfo) { + convertWarcinfo(warcinfo); + } + } + return next != null; + } + + private void convertWarcinfo(Warcinfo warcinfo) throws IOException { + var headers = warcinfo.fields(); + String probeStatus = headers.first("X-WARC-Probe-Status").orElse(""); + String[] parts = probeStatus.split(" ", 2); + + + String domain = headers.first("domain").orElseThrow(() -> new IllegalStateException("Missing domain header")); + String status = parts[0]; + String statusReason = parts.length > 1 ? 
parts[1] : ""; + String ip = headers.first("ip").orElse(""); + + String redirectDomain = null; + if ("REDIRECT".equalsIgnoreCase(status)) { + redirectDomain = statusReason; + } + + next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, + new ArrayList<>(), + new ArrayList<>() + ); + } + + private void convertResponse(WarcResponse response) throws IOException { + var http = response.http(); + + if (http.status() != 200) { + return; + } + + var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response)); + if (parsedBody instanceof DocumentBodyResult.Error error) { + next = new CrawledDocument( + "", + response.targetURI().toString(), + http.contentType().raw(), + response.date().toString(), + http.status(), + error.status().toString(), + error.why(), + headers(http.headers()), + null, + response.payloadDigest().map(WarcDigest::base64).orElse(""), + "", + "", + "", + WarcXCookieInformationHeader.hasCookies(response) + ); + } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { + next = new CrawledDocument( + "", + response.targetURI().toString(), + ok.contentType().toString(), + response.date().toString(), + http.status(), + "OK", + "", + headers(http.headers()), + ok.body(), + response.payloadDigest().map(WarcDigest::base64).orElse(""), + "", + "", + "", + WarcXCookieInformationHeader.hasCookies(response)); + } else { + // unreachable + throw new IllegalStateException("Unknown body type: " + parsedBody); + } + } + + public String headers(MessageHeaders headers) { + StringJoiner ret = new StringJoiner("\n"); + for (var header : headers.map().entrySet()) { + for (var value : header.getValue()) { + ret.add(STR."\{header.getKey()}: \{value}"); + } + } + return ret.toString(); + } + + public void close() throws IOException { + reader.close(); + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + try { + return next; + } + finally { + next = null; + } + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java deleted file mode 100644 index e8a9fca1..00000000 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.crawling.model; - - -public record ContentType(String contentType, String charset) { -} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 143c775b..6b9ba1be 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -23,13 +23,21 @@ public class CrawledDocument implements SerializableCrawlData { public String headers; public String documentBody; + + @Deprecated public String documentBodyHash; + @Deprecated public String canonicalUrl; public String redirectUrl; + @Deprecated public String recrawlState; + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDomain instead */ + public Boolean hasCookies = false; + public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; @Override public String getSerialIdentifier() { diff --git 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java index 482311c1..3add3b8d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData { public String ip; public List doc; + + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDocument instead */ public List cookies; public int size() { @@ -24,6 +27,10 @@ public class CrawledDomain implements SerializableCrawlData { return doc.size(); } + public boolean hasCookies() { + return cookies != null && !cookies.isEmpty(); + } + public static final String SERIAL_IDENTIFIER = "// DOMAIN"; @Override public String getSerialIdentifier() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java new file mode 100644 index 00000000..c96aeb25 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java @@ -0,0 +1,97 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; + +import java.time.Instant; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.*; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString +public class CrawledDocumentParquetRecord { + public String domain; + public String url; + public String ip; + public boolean cookies; + public int httpStatus; + public Instant timestamp; + public String contentType; + public byte[] body; + + public static Hydrator newHydrator() { + return new CrawledDocumentParquetRecordHydrator(); + } + + public static Dehydrator newDehydrator() { + return CrawledDocumentParquetRecord::dehydrate; + } + + public static MessageType schema = new MessageType( + CrawledDocumentParquetRecord.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("domain"), + Types.required(BINARY).as(stringType()).named("url"), + Types.required(BINARY).as(stringType()).named("ip"), + Types.required(BOOLEAN).named("cookies"), + Types.required(INT32).named("httpStatus"), + Types.required(INT64).named("epochSeconds"), + Types.required(BINARY).as(stringType()).named("contentType"), + Types.required(BINARY).named("body") + ); + + + public CrawledDocumentParquetRecord add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "url" -> url = (String) value; + case "ip" -> ip = (String) value; + case "httpStatus" -> httpStatus = (Integer) value; + case "cookies" -> cookies = (Boolean) value; + case "contentType" -> contentType = (String) value; + case "body" -> body = (byte[]) value; + case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value); + default -> 
throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + + public void dehydrate(ValueWriter valueWriter) { + valueWriter.write("domain", domain); + valueWriter.write("url", url); + valueWriter.write("ip", ip); + valueWriter.write("epochSeconds", timestamp.getEpochSecond()); + valueWriter.write("httpStatus", httpStatus); + valueWriter.write("cookies", cookies); + valueWriter.write("contentType", contentType); + valueWriter.write("body", body); + } +} + +class CrawledDocumentParquetRecordHydrator implements Hydrator { + + @Override + public CrawledDocumentParquetRecord start() { + return new CrawledDocumentParquetRecord(); + } + + @Override + public CrawledDocumentParquetRecord add(CrawledDocumentParquetRecord target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public CrawledDocumentParquetRecord finish(CrawledDocumentParquetRecord target) { + return target; + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java new file mode 100644 index 00000000..7e8c7501 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java @@ -0,0 +1,19 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.HydratorSupplier; +import blue.strategic.parquet.ParquetReader; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.stream.Stream; + +public class CrawledDocumentParquetRecordFileReader { + + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator())); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java new file mode 100644 index 00000000..9245156f --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -0,0 +1,247 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.UserAgent; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import org.apache.commons.lang3.StringUtils; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Path; +import java.time.Instant; +import java.util.List; +import java.util.Objects; + +public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { + private final ParquetWriter writer; + private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class); + + public static void convertWarc(String domain, + UserAgent userAgent, + Path warcInputFile, + Path parquetOutputFile) { + try (var warcReader = new WarcReader(warcInputFile); + var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile) + ) { + 
+            WarcXResponseReference.register(warcReader);
+            WarcXEntityRefused.register(warcReader);
+
+            String uaString = userAgent.uaString();
+
+            for (var record : warcReader) {
+                if (record instanceof WarcResponse response) {
+                    // this also captures WarcXResponseReference, which inherits from WarcResponse
+                    // and is used to store old responses from previous crawls; in this part of the logic
+                    // we treat them the same as a normal response
+
+                    if (!filterResponse(uaString, response)) {
+                        continue;
+                    }
+
+                    parquetWriter.write(domain, response);
+                }
+                else if (record instanceof WarcXEntityRefused refused) {
+                    parquetWriter.write(domain, refused);
+                }
+                else if (record instanceof Warcinfo warcinfo) {
+                    parquetWriter.write(warcinfo);
+                }
+            }
+        }
+        catch (Exception ex) {
+            logger.error("Failed to convert WARC file to Parquet", ex);
+        }
+    }
+
+    /** Return true if the WarcResponse should be kept and converted to a parquet record,
+     *  false if it should be skipped. */
+    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+
+        // We don't want to store robots.txt files, as they are not
+        // interesting for the analysis we want to do.  This is important
+        // since txt-files in general are interesting, and we don't want to
+        // exclude them as a class.
+
+        if (response.targetURI().getPath().equals("/robots.txt")) {
+            return false;
+        }
+
+        var robotsTags = response.http().headers().all("X-Robots-Tag");
+        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
+            return false;
+        }
+
+        return true;
+    }
+
+    private void write(String domain, WarcXEntityRefused refused) throws IOException {
+        URI profile = refused.profile();
+
+        String meta;
+        if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) {
+            meta = "x-marginalia/advisory;state=robots-txt-skipped";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) {
+            meta = "x-marginalia/advisory;state=content-type-failed-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) {
+            meta = "x-marginalia/advisory;state=timeout-probe";
+        }
+        else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) {
+            meta = "x-marginalia/advisory;state=doc-error";
+        }
+        else {
+            meta = "x-marginalia/advisory;state=unknown";
+        }
+
+        write(forDocError(domain, refused.date(), refused.target(), meta));
+    }
+
+    private void write(Warcinfo warcinfo) throws IOException {
+        String selfDomain = warcinfo.fields().first("domain").orElse("");
+        String ip = warcinfo.fields().first("ip").orElse("");
+        String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse("");
+
+        if (probeStatus.startsWith("REDIRECT")) {
+            String redirectDomain = probeStatus.substring("REDIRECT;".length());
+            write(forDomainRedirect(selfDomain, warcinfo.date(), redirectDomain));
+        }
+        else if (!"OK".equals(probeStatus)) {
+            write(forDomainError(selfDomain, warcinfo.date(), ip, probeStatus));
+        }
+    }
+
+    public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException {
+        writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema,
+                file.toFile(), CrawledDocumentParquetRecord.newDehydrator());
+    }
+
+    public void write(CrawledDocumentParquetRecord domainData) throws IOException {
+        writer.write(domainData);
+    }
+
+    public void write(String domain, WarcResponse response) throws IOException {
+
+        HttpFetchResult result = HttpFetchResult.importWarc(response);
+        if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) {
+            return;
+        }
+
+        byte[] bodyBytes;
+        String contentType;
+
+        var body = DocumentBodyExtractor.asBytes(result);
+
if (body instanceof DocumentBodyResult.Ok bodyOk) { + bodyBytes = bodyOk.body(); + contentType = bodyOk.contentType().toString(); + } + else { + bodyBytes = new byte[0]; + contentType = ""; + } + + write(new CrawledDocumentParquetRecord( + domain, + response.target(), + fetchOk.ipAddress(), + WarcXCookieInformationHeader.hasCookies(response), + fetchOk.statusCode(), + response.date(), + contentType, + bodyBytes) + ); + } + + + public void close() throws IOException { + writer.close(); + } + + private CrawledDocumentParquetRecord forDomainRedirect(String domain, Instant date, String redirectDomain) { + return new CrawledDocumentParquetRecord(domain, + STR."https://\{redirectDomain}/", + "", + false, + 0, + date, + "x-marginalia/advisory;state=redirect", + new byte[0] + ); + } + private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) { + return new CrawledDocumentParquetRecord(domain, + STR."https://\{domain}/", + ip, + false, + 0, + date, + "x-marginalia/advisory;state=error", + errorStatus.getBytes() + ); + } + + private CrawledDocumentParquetRecord forDocError(String domain, Instant date, String url, String errorStatus) { + return new CrawledDocumentParquetRecord(domain, + url, + "", + false, + 0, + date, + errorStatus, + new byte[0] + ); + } + + + /** Check X-Robots-Tag header tag to see if we are allowed to index this page. + *
<p>
+ * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag + * + * @param xRobotsHeaderTags List of X-Robots-Tag values + * @param userAgent User agent string + * @return true if we are allowed to index this page + */ + // Visible for tests + public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { + boolean isPermittedGeneral = true; + boolean isPermittedMarginalia = false; + boolean isForbiddenMarginalia = false; + + for (String header : xRobotsHeaderTags) { + if (header.indexOf(':') >= 0) { + String[] parts = StringUtils.split(header, ":", 2); + + if (parts.length < 2) + continue; + + // Is this relevant to us? + if (!Objects.equals(parts[0].trim(), userAgent)) + continue; + + if (parts[1].contains("noindex")) + isForbiddenMarginalia = true; + else if (parts[1].contains("none")) + isForbiddenMarginalia = true; + else if (parts[1].contains("all")) + isPermittedMarginalia = true; + } + else { + if (header.contains("noindex")) + isPermittedGeneral = false; + if (header.contains("none")) + isPermittedGeneral = false; + } + } + + if (isPermittedMarginalia) + return true; + if (isForbiddenMarginalia) + return false; + return isPermittedGeneral; + } +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java new file mode 100644 index 00000000..7d983580 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java @@ -0,0 +1,35 @@ +package org.netpreserve.jwarc; + +import okhttp3.HttpUrl; +import okhttp3.OkHttpClient; + +/** Encapsulates out-of-band information about whether a website uses cookies, + * using a non-standard WARC header "X-Has-Cookies". + */ +public class WarcXCookieInformationHeader { + private boolean hasCookies = false; + private static final String headerName = "X-Has-Cookies"; + + public void update(OkHttpClient client, HttpUrl url) { + if (!hasCookies) { + hasCookies = !client.cookieJar().loadForRequest(url).isEmpty(); + } + } + + public boolean hasCookies() { + return hasCookies; + } + + public void paint(WarcResponse.Builder builder) { + builder.addHeader(headerName, hasCookies ? "1" : "0"); + } + public void paint(WarcXResponseReference.Builder builder) { + builder.addHeader(headerName, hasCookies ? 
"1" : "0"); + } + + public static boolean hasCookies(WarcRecord record) { + return record.headers().contains(headerName, "1"); + } + + +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java new file mode 100644 index 00000000..4480115e --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java @@ -0,0 +1,45 @@ +package org.netpreserve.jwarc; + +import java.io.IOException; +import java.net.URI; + +/** This defines a non-standard extension to WARC for storing old HTTP responses, + * essentially a 'response' with different semantics + */ +public class WarcXEntityRefused extends WarcRevisit { + private static final String TYPE_NAME = "x-entity-refused"; + + public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped"); + public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe"); + public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe"); + public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error"); + + WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) { + super(version, headers, body); + } + + public static void register(WarcReader reader) { + reader.registerType(TYPE_NAME, WarcXEntityRefused::new); + } + + public static class Builder extends AbstractBuilder { + public Builder(URI targetURI, URI profile) { + this(targetURI.toString(), profile.toString()); + } + + public Builder(String targetURI, String profileURI) { + super(TYPE_NAME); + setHeader("WARC-Target-URI", targetURI); + setHeader("WARC-Profile", profileURI); + } + + public Builder body(HttpResponse httpResponse) throws IOException { + return body(MediaType.HTTP_RESPONSE, httpResponse); + } + + @Override + public WarcXEntityRefused build() { + return build(WarcXEntityRefused::new); + } + } +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java new file mode 100644 index 00000000..19a5a00f --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java @@ -0,0 +1,42 @@ +package org.netpreserve.jwarc; + +import java.io.IOException; +import java.net.URI; + +/** This defines a non-standard extension to WARC for storing old HTTP responses, + * essentially a 'response' with different semantics.. + *
<p>
+ * An x-response-reference record is a response record with a full body, where + * the data is a reconstructed HTTP response from a previous crawl. + */ +public class WarcXResponseReference extends WarcResponse { + private static final String TYPE_NAME = "x-response-reference"; + + WarcXResponseReference(MessageVersion version, MessageHeaders headers, MessageBody body) { + super(version, headers, body); + } + + public static void register(WarcReader reader) { + reader.registerType(TYPE_NAME, WarcXResponseReference::new); + } + + public static class Builder extends AbstractBuilder { + public Builder(URI targetURI) { + this(targetURI.toString()); + } + + public Builder(String targetURI) { + super(TYPE_NAME); + setHeader("WARC-Target-URI", targetURI); + } + + public Builder body(HttpResponse httpResponse) throws IOException { + return body(MediaType.HTTP_RESPONSE, httpResponse); + } + + @Override + public WarcXResponseReference build() { + return build(WarcXResponseReference::new); + } + } +} diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index 718dea06..cbb88772 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -74,23 +74,13 @@ public class CrawlPlan { return count; } + @Deprecated public Iterable domainsIterable() { - final CrawledDomainReader reader = new CrawledDomainReader(); - - return WorkLog.iterableMap(crawl.getLogFile(), - entry -> { - var path = getCrawledFilePath(entry.path()); - if (!Files.exists(path)) { - logger.warn("File not found: {}", path); - return Optional.empty(); - } - return reader.readOptionally(path); - }); + // This is no longer supported + throw new UnsupportedOperationException(); } public Iterable crawlDataIterable(Predicate idPredicate) { - final CrawledDomainReader reader = new CrawledDomainReader(); - return WorkLog.iterableMap(crawl.getLogFile(), entry -> { if (!idPredicate.test(entry.id())) { @@ -105,7 +95,7 @@ public class CrawlPlan { } try { - return Optional.of(reader.createDataStream(path)); + return Optional.of(CrawledDomainReader.createDataStream(path)); } catch (IOException ex) { return Optional.empty(); diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java new file mode 100644 index 00000000..c79154a4 --- /dev/null +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -0,0 +1,78 @@ +package nu.marginalia.crawling.parquet; + +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.ArrayList; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawledDocumentParquetRecordFileWriterTest { + Path tempFile; + + @BeforeEach + public void setUp() throws IOException { + tempFile = Files.createTempFile("test", 
".parquet"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.delete(tempFile); + } + + @Test + void testWriteRead() throws IOException { + var original = new CrawledDocumentParquetRecord("www.marginalia.nu", + "https://www.marginalia.nu/", + "127.0.0.1", + false, + 200, + Instant.now(), + "text/html", + "hello world".getBytes()); + + try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) { + writer.write(original); + } + + var items = new ArrayList(); + + try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) { + while (stream.hasNext()) { + items.add(stream.next()); + } + } + + assertEquals(2, items.size()); + + var firstItem = items.get(0); + assertInstanceOf(CrawledDomain.class, firstItem); + var domain = (CrawledDomain) firstItem; + assertEquals("www.marginalia.nu", domain.domain); + assertNull(domain.redirectDomain); + assertEquals("OK", domain.crawlerStatus); + assertEquals("", domain.crawlerStatusDesc); + assertEquals(new ArrayList<>(), domain.doc); + assertEquals(new ArrayList<>(), domain.cookies); + + var secondItem = items.get(1); + assertInstanceOf(CrawledDocument.class, secondItem); + + var document = (CrawledDocument) secondItem; + assertEquals("https://www.marginalia.nu/", document.url); + assertEquals("text/html", document.contentType); + assertEquals("hello world", document.documentBody); + assertEquals(200, document.httpStatus); + } + + +} \ No newline at end of file diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 979260df..556f8015 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -59,6 +59,7 @@ dependencies { implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:process-models:crawl-spec') @@ -66,6 +67,7 @@ dependencies { implementation libs.bundles.slf4j implementation libs.notnull + implementation libs.jwarc implementation libs.jsoup diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index ebfb1bc2..3bada914 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -268,6 +268,14 @@ public class ConverterMain { processData.asPath(), msg, inbox); } + case SideloadWarc -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)), + processData.asPath(), + msg, inbox); + } case SideloadStackexchange -> { var processData = fileStorageService.getStorage(request.processedDataStorage); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index 8e8841a0..4b5d9173 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ 
b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -105,13 +105,6 @@ public class DocumentProcessor { private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) throws URISyntaxException { - if (crawledDocument.canonicalUrl != null) { - try { - return new EdgeUrl(crawledDocument.canonicalUrl); - } - catch (URISyntaxException ex) { /* fallthrough */ } - } - return new EdgeUrl(crawledDocument.url); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index fc824906..e9794aad 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -18,6 +18,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.model.crawl.HtmlFeature; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,9 +54,15 @@ public class DomainProcessor { } @SneakyThrows + @Nullable public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + if (!dataStream.hasNext()) { + return null; + } + var ret = new ProcessedDomain(); List docs = new ArrayList<>(); + Set processedUrls = new HashSet<>(); boolean cookies = false; String ip = ""; @@ -79,7 +86,7 @@ public class DomainProcessor { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; - cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0; + cookies = crawledDomain.hasCookies(); ip = crawledDomain.ip; if (crawledDomain.redirectDomain != null) { @@ -90,10 +97,12 @@ public class DomainProcessor { } else if (data instanceof CrawledDocument doc) { try { - if (doc.url == null) + if (doc.url == null || !processedUrls.add(doc.url)) continue; - fixBadCanonicalTag(doc); + if (Boolean.TRUE.equals(doc.hasCookies)) { + cookies = true; + } // This case should never be reachable, as we should have initiated // the externalDomainLinks variable above if we made it past the @@ -161,25 +170,6 @@ public class DomainProcessor { return false; } - private void fixBadCanonicalTag(CrawledDocument doc) { - // Some sites have a canonical tag that points to a different domain, - // but our loader can not support this, so we point these back to the - // original url. 
- - var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl); - if (canonicalOpt.isEmpty()) return; - - var urlOpt = EdgeUrl.parse(doc.url); - if (urlOpt.isEmpty()) return; - - var urlActual = urlOpt.get(); - var canonicalActual = canonicalOpt.get(); - - if (!Objects.equals(urlActual.domain, canonicalActual.domain)) { - doc.canonicalUrl = doc.url; - } - } - private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 60f81d19..808d4224 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -7,6 +7,7 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader; +import nu.marginalia.converting.sideload.warc.WarcSideloadFactory; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; @@ -24,6 +25,7 @@ public class SideloadSourceFactory { private final AnchorTextKeywords anchorTextKeywords; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; + private final WarcSideloadFactory warcSideloadFactory; @Inject public SideloadSourceFactory(Gson gson, @@ -31,7 +33,8 @@ public class SideloadSourceFactory { ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords, AnchorTagsSourceFactory anchorTagsSourceFactory, - DirtreeSideloaderFactory dirtreeSideloaderFactory) { + DirtreeSideloaderFactory dirtreeSideloaderFactory, + WarcSideloadFactory warcSideloadFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; @@ -39,6 +42,7 @@ public class SideloadSourceFactory { this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; + this.warcSideloadFactory = warcSideloadFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { @@ -49,6 +53,10 @@ public class SideloadSourceFactory { return dirtreeSideloaderFactory.createSideloaders(pathToYamlFile); } + public Collection sideloadWarc(Path pathToWarcFiles) throws IOException { + return warcSideloadFactory.createSideloaders(pathToWarcFiles); + } + /** Do not use, this code isn't finished */ public Collection sideloadStackexchange(Path pathToDbFileRoot) throws IOException { try (var dirs = Files.walk(pathToDbFileRoot)) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 65f0bd41..16a1ae7c 100644 --- 
a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -50,7 +50,8 @@ public class SideloaderProcessing { Integer.toHexString(url.hashCode()), url, "", - "SIDELOAD" + "SIDELOAD", + false ); var ret = new ProcessedDocument(); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java new file mode 100644 index 00000000..35fb6d3a --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java @@ -0,0 +1,32 @@ +package nu.marginalia.converting.sideload.warc; + +import nu.marginalia.converting.sideload.SideloadSource; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class WarcSideloadFactory { + + public Collection createSideloaders(Path pathToWarcFiles) throws IOException { + final List files = new ArrayList<>(); + + try (var stream = Files.list(pathToWarcFiles)) { + stream + .filter(Files::isRegularFile) + .filter(this::isWarcFile) + .forEach(files::add); + + } + // stub + return null; + } + + private boolean isWarcFile(Path path) { + return path.toString().endsWith(".warc") + || path.toString().endsWith(".warc.gz"); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java new file mode 100644 index 00000000..2d8c1bda --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -0,0 +1,160 @@ +package nu.marginalia.converting.sideload.warc; + +import lombok.SneakyThrows; +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentTypeParser; +import nu.marginalia.contenttype.DocumentBodyToString; +import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.sideload.SideloaderProcessing; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Optional; + +public class WarcSideloader implements SideloadSource, AutoCloseable { + + private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class); + + private final SideloaderProcessing sideloaderProcessing; + + private final WarcReader reader; + + private final EdgeDomain domain; + + + public WarcSideloader(Path warcFile, + SideloaderProcessing sideloaderProcessing) + throws IOException + { + this.sideloaderProcessing = sideloaderProcessing; + this.reader = new WarcReader(warcFile); + this.domain = sniffDomainFromWarc() + .orElseThrow(() -> 
new IOException("Could not identify domain from warc file")); + } + + @SneakyThrows + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = domain; + ret.ip = "0.0.0.0"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + private Optional sniffDomainFromWarc() throws IOException { + try { + for (var record : reader) { + if (!(record instanceof WarcRequest request)) { + continue; + } + + String target = request.target(); + if (target.startsWith("http://") || target.startsWith("https://")) { + return Optional.of(new EdgeUrl(target).getDomain()); + } + } + } catch (URISyntaxException e) { + return Optional.empty(); + } finally { + reader.position(0); + } + return Optional.empty(); + } + + @SneakyThrows + @Override + public Iterator getDocumentsStream() { + return reader.records() + .filter(record -> record instanceof WarcResponse) + .map(WarcResponse.class::cast) + .filter(this::isRelevantResponse) + .map(this::process) + .filter(Optional::isPresent) + .map(Optional::get) + .iterator(); + } + + private boolean isRelevantResponse(WarcResponse warcResponse) { + try { + HttpResponse httpResponse = warcResponse.http(); + if (httpResponse == null) + return false; + if (httpResponse.status() != 200) + return false; + if (!Objects.equals(httpResponse.contentType(), MediaType.HTML)) + return false; + + var url = new EdgeUrl(warcResponse.target()); + if (!Objects.equals(url.getDomain(), domain)) { + return false; + } + + return true; + } catch (Exception e) { + e.printStackTrace(); + } + + return false; + } + + @SneakyThrows + private Optional process(WarcResponse response) { + Optional body = getBody(response); + String url = response.target(); + + // We trim "/index.html"-suffixes from the index if they are present, + // since this is typically an artifact from document retrieval + if (url.endsWith("/index.html")) { + url = url.substring(0, url.length() - "index.html".length()); + } + + if (body.isEmpty()) { + return Optional.empty(); + } + + return Optional.of(sideloaderProcessing + .processDocument(url, body.get(), List.of(), new DomainLinks(), + GeneratorType.DOCS, + 10_000)); + } + + @SneakyThrows + private Optional getBody(WarcResponse response) { + var http = response.http(); + + // TODO: We should support additional encodings here + try (var body = http.body()) { + String contentType = http.headers().first("Content-Type").orElse(null); + byte[] bytes = body.stream().readAllBytes(); + + var ct = ContentTypeParser.parseContentType(contentType, bytes); + return Optional.of(DocumentBodyToString.getStringData(ct, bytes)); + } + catch (Exception ex) { + logger.info("Failed to parse body", ex); + } + return Optional.empty(); + } + + @Override + public void close() throws Exception { + reader.close(); + } + +} diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java index 1ca66ed6..3069c5ed 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.writer; import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.worklog.BatchingWorkLog; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; @@ -41,7 +42,10 @@ public class ConverterWriter implements AutoCloseable { } @SneakyThrows - public void accept(ProcessedDomain domain) { + public void accept(@Nullable ProcessedDomain domain) { + if (null == domain) + return; + domainData.put(domain); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index ce0d8f4a..eaa9d813 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -65,6 +65,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); + assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -114,7 +115,8 @@ public class ConvertingIntegrationTest { Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, null, - "" + "", + false ); docs.add(doc); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 7ef056d2..535eac31 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -3,31 +3,51 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; import lombok.SneakyThrows; +import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawlspec.CrawlSpecRecord; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.stream.Collectors; -/* This is mostly a debugging utility */ +import static org.junit.jupiter.api.Assertions.*; + +/** Tests for the 
crawler and converter integration. These are pretty slow and potentially + * a bit flaky, since they attempt to fetch real websites. + */ @Tag("slow") public class CrawlingThenConvertingIntegrationTest { private DomainProcessor domainProcessor; private HttpFetcher httpFetcher; + private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class); + + private Path fileName; + private Path fileName2; + @SneakyThrows @BeforeAll public static void setUpAll() { @@ -44,10 +64,80 @@ public class CrawlingThenConvertingIntegrationTest { domainProcessor = injector.getInstance(DomainProcessor.class); httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); + this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); + this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(fileName); + Files.deleteIfExists(fileName2); } @Test - public void crawlThenProcess() { + public void testInvalidDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("invalid.invalid.invalid") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs); + + assertEquals("ERROR", crawlData.crawlerStatus); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void testRedirectingDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("memex.marginalia.nu") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs); + + assertEquals("REDIRECT", crawlData.crawlerStatus); + assertEquals("www.marginalia.nu", crawlData.redirectDomain); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void testBlockedDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("search.marginalia.nu") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything + + assertEquals("ERROR", crawlData.crawlerStatus); + assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void crawlSunnyDay() throws IOException { var specs = CrawlSpecRecord.builder() .domain("www.marginalia.nu") .crawlDepth(10) @@ -55,12 +145,20 @@ public class CrawlingThenConvertingIntegrationTest { .build(); CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("www.marginalia.nu", domain.domain); - List data = new ArrayList<>(); - data.add(domain); - data.addAll(domain.doc); + boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt")); + assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); 
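The crawl() and process() helpers defined further down wire the new WARC-first flow together; stripped of the test scaffolding, the shape of that flow is roughly the following sketch, where warcFile and parquetFile are temporary paths standing in for the real storage layout:

    // 1. crawl into a WARC file
    try (var recorder = new WarcRecorder(warcFile)) {
        new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch();
    }

    // 2. convert the WARC capture into the parquet format the converter consumes
    CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, new UserAgent("test"), warcFile, parquetFile);

    // 3. read the parquet file back as a stream of crawl data and process it
    try (var stream = new ParquetSerializableCrawlDataStream(parquetFile)) {
        var processedDomain = domainProcessor.process(stream);
    }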
+ assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); - var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator())); for (var doc : output.documents) { if (doc.isOk()) { @@ -73,12 +171,122 @@ public class CrawlingThenConvertingIntegrationTest { } - private CrawledDomain crawl(CrawlSpecRecord specs) { + + + @Test + public void crawlContentTypes() throws IOException { + var specs = CrawlSpecRecord.builder() + .domain("www.marginalia.nu") + .crawlDepth(5) + .urls(List.of( + "https://www.marginalia.nu/sanic.png", + "https://www.marginalia.nu/invalid" + )) + .build(); + + CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("www.marginalia.nu", domain.domain); + + Set allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet()); + assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have record for image despite blocked content type"); + assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have have record for invalid URL"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); + + + for (var doc : output.documents) { + if (doc.isOk()) { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title); + } + else { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason); + } + } + + } + + + @Test + public void crawlRobotsTxt() throws IOException { + var specs = CrawlSpecRecord.builder() + .domain("search.marginalia.nu") + .crawlDepth(5) + .urls(List.of( + "https://search.marginalia.nu/search?q=hello+world" + )) + .build(); + + CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("search.marginalia.nu", domain.domain); + + Set allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet()); + assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); + + for (var doc : output.documents) { + if (doc.isOk()) { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title); + } + else { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason); + } + } + + } + + private ProcessedDomain process() { + try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { + return domainProcessor.process(stream); + } + catch (Exception e) { + Assertions.fail(e); + return null; // unreachable + } + } + private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException { + return crawl(specs, domain -> true); + } + + private CrawledDomain crawl(CrawlSpecRecord specs, Predicate domainBlacklist) throws IOException { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); + try (var recorder = new WarcRecorder(fileName)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch(); + } + + CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, + 
new UserAgent("test"), + fileName, fileName2); + + try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) { + while (reader.hasNext()) { + var next = reader.next(); + logger.info("{}", next); + data.add(next); + } + } + + CrawledDomain domain = data.stream() + .filter(CrawledDomain.class::isInstance) + .map(CrawledDomain.class::cast) + .findFirst() + .get(); - CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); return domain; } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java new file mode 100644 index 00000000..da94e3a8 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java @@ -0,0 +1,81 @@ +package nu.marginalia.converting.sideload.warc; + +import com.google.inject.AbstractModule; +import com.google.inject.Guice; +import nu.marginalia.converting.ConverterModule; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.ConverterDomainTypes; +import nu.marginalia.converting.sideload.SideloaderProcessing; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; +import org.netpreserve.jwarc.WarcWriter; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.mockito.Mockito.when; + +class WarcSideloaderTest extends AbstractModule { + SideloaderProcessing processing; + + Path warcFile; + @BeforeEach + public void setUp() throws IOException { + processing = Guice.createInjector(new ConverterModule(), this) + .getInstance(SideloaderProcessing.class); + warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(warcFile); + } + + public void configure() { + var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); + when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); + + bind(ConverterDomainTypes.class).toInstance(domainTypesMock); + } + + + @Test + public void test() throws IOException { + try (var writer = new WarcWriter(Files.newOutputStream(warcFile))) { + writer.fetch(new URI("https://www.marginalia.nu/")); + writer.fetch(new URI("https://www.marginalia.nu/log/93_atags/")); + writer.fetch(new URI("https://www.marginalia.nu/links/")); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + + ProcessedDomain domain; + List docs = new ArrayList<>(); + + try (var sideloader = new WarcSideloader(warcFile, processing)) { + domain = sideloader.getDomain(); + sideloader.getDocumentsStream().forEachRemaining(docs::add); + } catch (Exception e) { + throw new RuntimeException(e); + } + + assertNotNull(domain); + assertEquals(3, docs.size()); + List fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList(); + 
assertEquals(List.of( + "https://www.marginalia.nu/", + "https://www.marginalia.nu/log/93_atags/", + "https://www.marginalia.nu/links/"), + fetchedUrls); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 00f0f01b..baa02906 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:features-convert:anchor-keywords') implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') implementation libs.bundles.slf4j @@ -48,6 +49,7 @@ dependencies { implementation libs.guice implementation libs.gson implementation libs.zstd + implementation libs.jwarc implementation libs.crawlercommons implementation libs.okhttp3 implementation libs.jsoup diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java deleted file mode 100644 index 1b61cb0d..00000000 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ /dev/null @@ -1,83 +0,0 @@ -package nu.marginalia.crawl; - -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.Semaphore; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; - -public class CrawlLimiter { - public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256); - - // Thresholds for throttling task-spawning. Note there's a bit of hysteresis to this - private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4; - private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2; - - private final Semaphore taskSemCount = new Semaphore(maxPoolSize); - - // When set to true, the crawler will wait before starting additional tasks - private final AtomicBoolean throttle = new AtomicBoolean(false); - private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class); - - public CrawlLimiter() { - Thread monitorThread = new Thread(this::monitor, "Memory Monitor"); - monitorThread.setDaemon(true); - monitorThread.start(); - } - - - @SneakyThrows - public void monitor() { - for (;;) { - synchronized (throttle) { - boolean oldThrottle = throttle.get(); - boolean newThrottle = oldThrottle; - - if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) { - // According to the spec this may happen, although it seems to rarely - // be the case in practice - logger.warn("Memory based throttling disabled (set Xmx)"); - return; - } - - final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); - - if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) { - newThrottle = false; - logger.warn("Memory based throttling released"); - } - else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) { - newThrottle = true; - logger.warn("Memory based throttling triggered"); - - // Try to GC - System.gc(); - } - - - throttle.set(newThrottle); - - if (!newThrottle) { - throttle.notifyAll(); - } - if (newThrottle != oldThrottle) { - logger.warn("Memory based throttling set to {}", newThrottle); - } - } - - TimeUnit.SECONDS.sleep(1); - } - } - - @SneakyThrows - public void waitForEnoughRAM() { - while (throttle.get()) { 
- synchronized (throttle) { - throttle.wait(30000); - } - } - } - -} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index f824d815..c3864868 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -13,10 +13,13 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.CrawlerOutputFile; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; @@ -27,18 +30,17 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawl.retreival.CrawlerRetreiver; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.util.SimpleBlockingThreadPool; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; -import okhttp3.internal.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.sql.SQLException; import java.util.*; import java.util.concurrent.*; @@ -49,13 +51,8 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX; public class CrawlerMain { private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class); - private final ProcessHeartbeatImpl heartbeat; - private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); - - private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, - new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); - private final UserAgent userAgent; + private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; private final DomainProber domainProber; private final FileStorageService fileStorageService; @@ -66,13 +63,12 @@ public class CrawlerMain { private final SimpleBlockingThreadPool pool; private final Map processingIds = new ConcurrentHashMap<>(); - private final CrawledDomainReader reader = new CrawledDomainReader(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); volatile int totalTasks; final AtomicInteger tasksDone = new AtomicInteger(0); - private final CrawlLimiter limiter = new CrawlLimiter(); + private HttpFetcherImpl fetcher; @Inject public CrawlerMain(UserAgent userAgent, @@ -83,8 +79,8 @@ public class CrawlerMain { DbCrawlSpecProvider dbCrawlSpecProvider, AnchorTagsSourceFactory anchorTagsSourceFactory, Gson gson) { - this.heartbeat = heartbeat; this.userAgent = userAgent; 
+ this.heartbeat = heartbeat; this.messageQueueFactory = messageQueueFactory; this.domainProber = domainProber; this.fileStorageService = fileStorageService; @@ -93,8 +89,14 @@ public class CrawlerMain { this.gson = gson; this.node = processConfiguration.node(); - // maybe need to set -Xss for JVM to deal with this? - pool = new SimpleBlockingThreadPool("CrawlerPool", CrawlLimiter.maxPoolSize, 1); + pool = new SimpleBlockingThreadPool("CrawlerPool", + Integer.getInteger("crawler.pool-size", 256), + 1); + + fetcher = new HttpFetcherImpl(userAgent.uaString(), + new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()), + new ConnectionPool(5, 10, TimeUnit.SECONDS) + ); } public static void main(String... args) throws Exception { @@ -141,6 +143,7 @@ public class CrawlerMain { public void run(CrawlSpecProvider specProvider, Path outputDir) throws InterruptedException, IOException { heartbeat.start(); + try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log")); AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(specProvider.getDomains()) ) { @@ -175,6 +178,7 @@ public class CrawlerMain { activePoolCount = newActivePoolCount; } } + } catch (Exception ex) { logger.warn("Exception in crawler", ex); @@ -211,27 +215,48 @@ public class CrawlerMain { @Override public void run() throws Exception { - limiter.waitForEnoughRAM(); + Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); + Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); + Path finalWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain); - HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); + if (Files.exists(newWarcFile)) { + Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING); + } + else { + Files.deleteIfExists(tempFile); + } - try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); + try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now + var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder); CrawlDataReference reference = getReference()) { Thread.currentThread().setName("crawling:" + domain); var domainLinks = anchorTagsSource.getAnchorTags(domain); - var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, writer::accept); - int size = retreiver.fetch(domainLinks, reference); + if (Files.exists(tempFile)) { + retriever.syncAbortedRun(tempFile); + Files.delete(tempFile); + } - workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); + int size = retriever.fetch(domainLinks, reference); + + // Delete the reference crawl data if it's not the same as the new one + // (mostly a case when migrating from legacy->warc) + reference.delete(); + + CrawledDocumentParquetRecordFileWriter + .convertWarc(domain, userAgent, newWarcFile, parquetFile); + + workLog.setJobToFinished(domain, parquetFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); - } catch (Exception e) { logger.error("Error fetching domain " + domain, e); + Files.deleteIfExists(newWarcFile); + Files.deleteIfExists(tempFile); } finally { // We don't need to double-count these; it's also kept int he workLog @@ -242,8 +267,7 @@ public 
class CrawlerMain { private CrawlDataReference getReference() { try { - var dataStream = reader.createDataStream(outputDir, domain, id); - return new CrawlDataReference(dataStream); + return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id)); } catch (IOException e) { logger.debug("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 985bfc39..65e1529b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -5,14 +5,19 @@ import com.google.common.hash.Hashing; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.lsh.EasyLSH; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; /** A reference to a domain that has been crawled before. */ public class CrawlDataReference implements AutoCloseable { private final SerializableCrawlDataStream data; + private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class); public CrawlDataReference(SerializableCrawlDataStream data) { this.data = data; @@ -22,6 +27,15 @@ public class CrawlDataReference implements AutoCloseable { this(SerializableCrawlDataStream.empty()); } + /** Delete the associated data from disk, if it exists */ + public void delete() throws IOException { + Path filePath = data.path(); + + if (filePath != null) { + Files.deleteIfExists(filePath); + } + } + @Nullable public CrawledDocument nextDocument() { try { @@ -32,17 +46,16 @@ public class CrawlDataReference implements AutoCloseable { } } catch (IOException ex) { - ex.printStackTrace(); + logger.error("Failed to read next document", ex); } + return null; } - public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) { - assert one.documentBody != null; - assert other.documentBody != null; + public boolean isContentBodySame(String one, String other) { - final long contentHashOne = contentHash(one.documentBody); - final long contentHashOther = contentHash(other.documentBody); + final long contentHashOne = contentHash(one); + final long contentHashOther = contentHash(other); return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java index ca2494dc..e52b73b6 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -20,8 +20,18 @@ public class CrawlDelayTimer { this.delayTime = delayTime; } + /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then + * set a flag that slows down the main crawl delay as well. 
*/ + public void waitRetryDelay(RateLimitException ex) throws InterruptedException { + slowDown = true; + + int delay = ex.retryAfter(); + + Thread.sleep(Math.clamp(delay, 100, 5000)); + } + @SneakyThrows - public void delay(long spentTime) { + public void waitFetchDelay(long spentTime) { long sleepTime = delayTime; if (sleepTime >= 1) { @@ -30,10 +40,6 @@ public class CrawlDelayTimer { Thread.sleep(min(sleepTime - spentTime, 5000)); } - else if (slowDown) { - // Additional delay when the server is signalling it wants slower requests - Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); - } else { // When no crawl delay is specified, lean toward twice the fetch+process time, // within sane limits. This means slower servers get slower crawling, and faster @@ -48,10 +54,10 @@ public class CrawlDelayTimer { Thread.sleep(sleepTime - spentTime); } - } - /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */ - public void slowDown() { - slowDown = true; + if (slowDown) { + // Additional delay when the server is signalling it wants slower requests + Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); + } } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java new file mode 100644 index 00000000..37f84d58 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java @@ -0,0 +1,91 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.EdgeUrl; + +import java.time.LocalDateTime; +import java.util.Objects; + +public class CrawledDocumentFactory { + + public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createUnknownHostError(EdgeUrl url) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc("Unknown Host") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) { + return CrawledDocument.builder() + .crawlerStatus("Timeout") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) { + return CrawledDocument.builder() + .crawlerStatus(status.toString()) + .crawlerStatusDesc(why) + .headers(rsp.headers().toString()) + .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(url.toString()) + .build(); + } + public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) { + return CrawledDocument.builder() + .crawlerStatus(status.toString()) + .crawlerStatusDesc(why) + .headers("") + .contentType(contentType) + 
.timestamp(LocalDateTime.now().toString()) + .httpStatus(statusCode) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) { + + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) + .redirectUrl(responseUrl.toString()) + .headers(rsp.headers().toString()) + .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createRobotsError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(-1) + .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) + .build(); + } + public static CrawledDocument createRetryError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(429) + .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) + .build(); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index b32e0b6c..18035d52 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -3,11 +3,15 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; -import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; +import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; +import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.UrlBlocklist; @@ -19,54 +23,49 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.Nullable; +import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; -import java.time.LocalDateTime; +import java.nio.file.Path; import java.util.*; -import java.util.function.Consumer; -public class CrawlerRetreiver { +public class CrawlerRetreiver implements AutoCloseable { private static final int MAX_ERRORS = 20; + private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once private final HttpFetcher fetcher; private final String domain; - private final Consumer crawledDomainWriter; private static final LinkParser linkParser = new LinkParser(); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); - private static final HashFunction hashMethod = Hashing.murmur3_128(0); private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); 
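// Annotation, not part of the patch: a sketch of how a caller might drive the reworked
// CrawlerRetreiver, which now records all HTTP traffic through a WarcRecorder and is
// AutoCloseable. The fetcher, domainProber, specs, warcFile, domainLinks and oldCrawlData
// values are assumed to be supplied by the surrounding CrawlerMain code.
//
//     WarcRecorder warcRecorder = new WarcRecorder(warcFile);
//     try (var retriever = new CrawlerRetreiver(fetcher, domainProber, specs, warcRecorder)) {
//         retriever.syncAbortedRun(warcFile);   // optional: replay a partially written WARC file
//         int fetchedCount = retriever.fetch(domainLinks, oldCrawlData);
//     }
//     // closing the retriever also closes the underlying WarcRecorder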
private final DomainProber domainProber; - private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; + private final WarcRecorder warcRecorder; + private final CrawlerRevisitor crawlerRevisitor; + private final SitemapFetcher sitemapFetcher; int errorCount = 0; - /** recrawlState tag for documents that had a HTTP status 304 */ - private static final String documentWasRetainedTag = "RETAINED/304"; - - /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ - private static final String documentWasSameTag = "SAME-BY-COMPARISON"; - public CrawlerRetreiver(HttpFetcher fetcher, DomainProber domainProber, CrawlSpecRecord specs, - Consumer writer) { + WarcRecorder warcRecorder) + { + this.warcRecorder = warcRecorder; this.fetcher = fetcher; this.domainProber = domainProber; domain = specs.domain; - crawledDomainWriter = writer; - - this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); - sitemapRetriever = fetcher.createSitemapRetriever(); + crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); + crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder); + sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever()); // We must always crawl the index page first, this is assumed when fingerprinting the server var fst = crawlFrontier.peek(); @@ -90,43 +89,42 @@ public class CrawlerRetreiver { public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); - return switch (probeResult) { - case DomainProber.ProbeResultOk(EdgeUrl probedUrl) -> crawlDomain(oldCrawlData, probedUrl, domainLinks); - case DomainProber.ProbeResultError(CrawlerDomainStatus status, String desc) -> { - crawledDomainWriter.accept( - CrawledDomain.builder() - .crawlerStatus(status.name()) - .crawlerStatusDesc(desc) - .domain(domain) - .ip(findIp(domain)) - .build() - ); - yield 1; - } - case DomainProber.ProbeResultRedirect(EdgeDomain redirectDomain) -> { - crawledDomainWriter.accept( - CrawledDomain.builder() - .crawlerStatus(CrawlerDomainStatus.REDIRECT.name()) - .crawlerStatusDesc("Redirected to different domain") - .redirectDomain(redirectDomain.toString()) - .domain(domain) - .ip(findIp(domain)) - .build() - ); - yield 1; - } - }; + try { + return crawlDomain(oldCrawlData, probeResult, domainLinks); + } + catch (Exception ex) { + logger.error("Error crawling domain {}", domain, ex); + return 0; + } } - private int crawlDomain(CrawlDataReference oldCrawlData, EdgeUrl rootUrl, DomainLinks domainLinks) { + public void syncAbortedRun(Path warcFile) { + var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder); + + resync.run(warcFile); + } + + private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException { String ip = findIp(domain); + EdgeUrl rootUrl; + + warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); + + if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) { + return 1; + } + else { + rootUrl = ok.probedUrl(); + } + + assert !crawlFrontier.isEmpty(); - final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + final SimpleRobotRules 
robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); - sniffRootDocument(delayTimer, rootUrl); + sniffRootDocument(rootUrl); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer); @@ -140,9 +138,15 @@ public class CrawlerRetreiver { crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto)); // Add links from the sitemap to the crawl frontier - downloadSitemaps(robotsRules, rootUrl); + sitemapFetcher.downloadSitemaps(robotsRules, rootUrl); - CrawledDomain ret = new CrawledDomain(domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); + CrawledDomain ret = new CrawledDomain(domain, + null, + CrawlerDomainStatus.OK.name(), + null, + ip, + new ArrayList<>(), + null); int fetchedCount = recrawled; @@ -154,7 +158,7 @@ public class CrawlerRetreiver { var top = crawlFrontier.takeNextUrl(); if (!robotsRules.isAllowed(top.toString())) { - crawledDomainWriter.accept(createRobotsError(top)); + warcRecorder.flagAsRobotsTxtError(top); continue; } @@ -177,149 +181,43 @@ public class CrawlerRetreiver { continue; - if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) { - fetchedCount++; + try { + if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) { + fetchedCount++; + } + } + catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + break; } } ret.cookies = fetcher.getCookies(); - crawledDomainWriter.accept(ret); - return fetchedCount; } - /** Performs a re-crawl of old documents, comparing etags and last-modified */ - private int recrawl(CrawlDataReference oldCrawlData, - SimpleRobotRules robotsRules, - CrawlDelayTimer delayTimer) { - int recrawled = 0; - int retained = 0; - - for (;;) { - CrawledDocument doc = oldCrawlData.nextDocument(); - - if (doc == null) { - break; - } - - // This Shouldn't Happen (TM) - var urlMaybe = EdgeUrl.parse(doc.url); - if (urlMaybe.isEmpty()) continue; - var url = urlMaybe.get(); - - // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again - if (doc.httpStatus == 404) { - crawlFrontier.addVisited(url); - continue; - } - - if (doc.httpStatus != 200) continue; - - if (!robotsRules.isAllowed(url.toString())) { - crawledDomainWriter.accept(createRobotsError(url)); - continue; - } - if (!crawlFrontier.filterLink(url)) - continue; - if (!crawlFrontier.addVisited(url)) - continue; - - - if (recrawled > 5 - && retained > 0.9 * recrawled - && Math.random() < 0.9) - { - // Since it looks like most of these documents haven't changed, - // we'll load the documents directly; but we do this in a random - // fashion to make sure we eventually catch changes over time - - crawledDomainWriter.accept(doc); - crawlFrontier.addVisited(url); - continue; - } - - - // GET the document with the stored document as a reference - // providing etag and last-modified headers, so we can recycle the - // document if it hasn't changed without actually downloading it - - var fetchedDocOpt = fetchWriteAndSleep(url, - delayTimer, - new DocumentWithReference(doc, oldCrawlData)); - if (fetchedDocOpt.isEmpty()) continue; - - if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - - recrawled ++; - } - - return recrawled; + /** Using the old crawl 
data, fetch the documents comparing etags and last-modified */ + private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException { + return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); } - private void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) { - List sitemaps = robotsRules.getSitemaps(); - - List urls = new ArrayList<>(sitemaps.size()); - if (!sitemaps.isEmpty()) { - for (var url : sitemaps) { - EdgeUrl.parse(url).ifPresent(urls::add); - } - } - else { - urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); - } - - downloadSitemaps(urls); - } - - private void downloadSitemaps(List urls) { - - Set checkedSitemaps = new HashSet<>(); - - for (var url : urls) { - // Let's not download sitemaps from other domains for now - if (!crawlFrontier.isSameDomain(url)) { - continue; - } - - if (checkedSitemaps.contains(url.path)) - continue; - - var sitemap = sitemapRetriever.fetchSitemap(url); - if (sitemap.isEmpty()) { - continue; - } - - // ensure we don't try to download this sitemap again - // (don't move this up, as we may want to check the same - // path with different protocols until we find one that works) - - checkedSitemaps.add(url.path); - - crawlFrontier.addAllToQueue(sitemap); - } - - logger.debug("Queue is now {}", crawlFrontier.queueSize()); - } - - private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) { + private void sniffRootDocument(EdgeUrl rootUrl) { try { logger.debug("Configuring link filter"); var url = rootUrl.withPathAndParam("/", null); - var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200); - if (maybeSample.isEmpty()) + var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty()); + if (!(result instanceof HttpFetchResult.ResultOk ok)) return; - var sample = maybeSample.get(); - if (sample.documentBody == null) + var optDoc = ok.parseDocument(); + if (optDoc.isEmpty()) return; // Sniff the software based on the sample document - var doc = Jsoup.parse(sample.documentBody); + var doc = optDoc.get(); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); for (var link : doc.getElementsByTag("link")) { @@ -338,7 +236,7 @@ public class CrawlerRetreiver { linkParser.parseLink(url, href) .filter(crawlFrontier::isSameDomain) .map(List::of) - .ifPresent(this::downloadSitemaps); + .ifPresent(sitemapFetcher::downloadSitemaps); } } catch (Exception ex) { @@ -346,41 +244,67 @@ public class CrawlerRetreiver { } } - private Optional fetchWriteAndSleep(EdgeUrl top, - CrawlDelayTimer timer, - DocumentWithReference reference) { + public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, + CrawlDelayTimer timer, + DocumentWithReference reference) throws InterruptedException + { logger.debug("Fetching {}", top); + HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone(); + long startTime = System.currentTimeMillis(); + var contentTags = reference.getContentTags(); - var docOpt = fetchUrl(top, timer, reference); - - if (docOpt.isPresent()) { - var doc = docOpt.get(); - - if (!Objects.equals(doc.recrawlState, documentWasRetainedTag) - && reference.isContentBodySame(doc)) - { - // The document didn't change since the last time - doc.recrawlState = documentWasSameTag; + // Fetch the document, retrying if we get a rate limit exception + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { + try { + fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags); + 
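// no RateLimitException was thrown for this attempt, so accept the result and stop retrying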
break; } - - crawledDomainWriter.accept(doc); - - if (doc.url != null) { - // We may have redirected to a different path - EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited); + catch (RateLimitException ex) { + timer.waitRetryDelay(ex); } - - if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) { - errorCount++; + catch (Exception ex) { + logger.warn("Failed to fetch {}", top, ex); + fetchedDoc = new HttpFetchResult.ResultException(ex); } - } - timer.delay(System.currentTimeMillis() - startTime); + try { + if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) { + var docOpt = ok.parseDocument(); + if (docOpt.isPresent()) { + var doc = docOpt.get(); - return docOpt; + crawlFrontier.enqueueLinksFromDocument(top, doc); + crawlFrontier.addVisited(new EdgeUrl(ok.uri())); + } + } + else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { + var doc = reference.doc(); + + warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); + + fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, + new ContentType(doc.contentType, "UTF-8"), + doc.documentBody); + + var parsed = Jsoup.parse(doc.documentBody); + + crawlFrontier.enqueueLinksFromDocument(top, parsed); + crawlFrontier.addVisited(top); + } + else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) { + errorCount ++; + } + } + catch (Exception ex) { + logger.error("Error parsing document {}", top, ex); + } + + timer.waitFetchDelay(System.currentTimeMillis() - startTime); + + return fetchedDoc; } private boolean isAllowedProtocol(String proto) { @@ -388,91 +312,6 @@ public class CrawlerRetreiver { || proto.equalsIgnoreCase("https"); } - private Optional fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { - try { - var contentTags = reference.getContentTags(); - var fetchedDoc = tryDownload(top, timer, contentTags); - - CrawledDocument doc = reference.replaceOn304(fetchedDoc); - - if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody); - - var parsedDoc = Jsoup.parse(doc.documentBody); - EdgeUrl url = new EdgeUrl(doc.url); - - findLinks(url, parsedDoc); - findCanonicalUrl(url, parsedDoc) - .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString()); - } - - return Optional.of(doc); - } - catch (Exception ex) { - logger.warn("Failed to process document {}", top); - } - - return Optional.empty(); - - } - - - @SneakyThrows - private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { - for (int i = 0; i < 2; i++) { - try { - var doc = fetcher.fetchContent(top, tags); - doc.recrawlState = "NEW"; - return doc; - } - catch (RateLimitException ex) { - timer.slowDown(); - - int delay = ex.retryAfter(); - if (delay > 0 && delay < 5000) { - Thread.sleep(delay); - } - } - } - - return createRetryError(top); - } - - private String createHash(String documentBodyHash) { - return hashMethod.hashUnencodedChars(documentBodyHash).toString(); - } - - private void findLinks(EdgeUrl baseUrl, Document parsed) { - baseUrl = linkParser.getBaseLink(parsed, baseUrl); - - for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } 
- for (var link : parsed.getElementsByTag("link")) { - String rel = link.attr("rel"); - - if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) { - linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - } - } - - private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { - baseUrl = baseUrl.domain.toRootUrl(); - - for (var link : parsed.select("link[rel=canonical]")) { - return linkParser.parseLink(baseUrl, link); - } - - return Optional.empty(); - } - private String findIp(String domain) { try { return InetAddress.getByName(domain).getHostAddress(); @@ -481,92 +320,9 @@ public class CrawlerRetreiver { } } - private CrawledDocument createRobotsError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(-1) - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .build(); - } - private CrawledDocument createRetryError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(429) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); - } - - private record DocumentWithReference( - @Nullable CrawledDocument doc, - @Nullable CrawlDataReference reference) { - - private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null); - public static DocumentWithReference empty() { - return emptyInstance; - } - - public boolean isContentBodySame(CrawledDocument newDoc) { - if (reference == null) - return false; - if (doc == null) - return false; - if (doc.documentBody == null) - return false; - if (newDoc.documentBody == null) - return false; - - return reference.isContentBodySame(doc, newDoc); - } - - private ContentTags getContentTags() { - if (null == doc) - return ContentTags.empty(); - - String headers = doc.headers; - if (headers == null) - return ContentTags.empty(); - - String[] headersLines = headers.split("\n"); - - String lastmod = null; - String etag = null; - - for (String line : headersLines) { - if (line.toLowerCase().startsWith("etag:")) { - etag = line.substring(5).trim(); - } - if (line.toLowerCase().startsWith("last-modified:")) { - lastmod = line.substring(14).trim(); - } - } - - return new ContentTags(etag, lastmod); - } - - public boolean isEmpty() { - return doc == null || reference == null; - } - - /** If the provided document has HTTP status 304, and the reference document is provided, - * return the reference document; otherwise return the provided document. - */ - public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { - - if (doc == null) - return fetchedDoc; - - // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when - // we fetched it last time. We can recycle the reference document. 
- if (fetchedDoc.httpStatus != 304) - return fetchedDoc; - - var ret = doc; - ret.recrawlState = documentWasRetainedTag; - ret.timestamp = LocalDateTime.now().toString(); - return ret; - } + @Override + public void close() throws Exception { + warcRecorder.close(); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java new file mode 100644 index 00000000..52ebe2f3 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -0,0 +1,107 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; + +/** + * This class is responsible for resynchronizing the crawl frontier with a partially written + * warc file. This may happen if the crawl is interrupted or crashes. + *

+ * This is best-effort and not guaranteed to recover all data, but it should limit + * the amount of data that is lost and needs to be re-crawled in the event of an unexpected + * shutdown. + */ +public class CrawlerWarcResynchronizer { + private final DomainCrawlFrontier crawlFrontier; + private final WarcRecorder recorder; + private static final Logger logger = LoggerFactory.getLogger(CrawlerWarcResynchronizer.class); + public CrawlerWarcResynchronizer(DomainCrawlFrontier crawlFrontier, WarcRecorder recorder) { + this.crawlFrontier = crawlFrontier; + this.recorder = recorder; + } + + public void run(Path tempFile) { + // First pass, enqueue links + try (var reader = new WarcReader(tempFile)) { + WarcXResponseReference.register(reader); + WarcXEntityRefused.register(reader); + + for (var item : reader) { + accept(item); + } + } catch (IOException e) { + logger.info(STR."Failed read full warc file \{tempFile}", e); + } + + // Second pass, copy records to the new warc file + try (var reader = new WarcReader(tempFile)) { + for (var item : reader) { + recorder.resync(item); + } + } catch (IOException e) { + logger.info(STR."Failed read full warc file \{tempFile}", e); + } + } + + public void accept(WarcRecord item) { + try { + if (item instanceof WarcResponse rsp) { + response(rsp); + } else if (item instanceof WarcRequest req) { + request(req); + } else if (item instanceof WarcXEntityRefused refused) { + refused(refused); + } + + } + catch (Exception ex) { + logger.info(STR."Failed to process warc record \{item}", ex); + } + } + + private void refused(WarcXEntityRefused refused) { + // In general, we don't want to re-crawl urls that were refused, + // but to permit circumstances to change over time, we'll + // allow for a small chance of re-probing these entries + + if (Math.random() > 0.1) { + crawlFrontier.addVisited(new EdgeUrl(refused.targetURI())); + } + } + + private void request(WarcRequest request) { + EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited); + } + + private void response(WarcResponse rsp) { + var url = new EdgeUrl(rsp.targetURI()); + + crawlFrontier.addVisited(url); + + try { + var response = HttpFetchResult.importWarc(rsp); + DocumentBodyExtractor + .asString(response) + .ifPresent((ct, body) -> + { + var doc = Jsoup.parse(body); + crawlFrontier.enqueueLinksFromDocument(url, doc); + }); + } + catch (Exception e) { + logger.info(STR."Failed to parse response body for \{url}", e); + } + } + + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 30902a8e..46446fee 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -3,14 +3,19 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.ip_blocklist.UrlBlocklist; +import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import org.jsoup.nodes.Document; import java.net.URISyntaxException; import java.util.*; import java.util.function.Predicate; public class DomainCrawlFrontier { + + private static final LinkParser linkParser = new LinkParser(); + private final ArrayDeque queue; // To save the number of 
strings kept in memory, @@ -45,9 +50,14 @@ public class DomainCrawlFrontier { } } + /** Increase the depth of the crawl by a factor. If the current depth is smaller + * than the number of already visited documents, the base depth will be adjusted + * to the visited count first. + */ public void increaseDepth(double depthIncreaseFactor) { - depth = (int)(depth * depthIncreaseFactor); + depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor); } + public void setLinkFilter(Predicate linkFilter) { this.linkFilter = linkFilter; } @@ -141,4 +151,27 @@ public class DomainCrawlFrontier { public int queueSize() { return queue.size(); } + + + public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) { + baseUrl = linkParser.getBaseLink(parsed, baseUrl); + + for (var link : parsed.getElementsByTag("a")) { + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("frame")) { + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("iframe")) { + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("link")) { + String rel = link.attr("rel"); + + if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) { + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); + } + } + } + } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java new file mode 100644 index 00000000..df070cc5 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java @@ -0,0 +1,86 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawling.body.ContentTypeLogic; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.SocketTimeoutException; +import java.util.Objects; + +public class ContentTypeProber { + + private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class); + private final String userAgent; + private final OkHttpClient client; + private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + + public ContentTypeProber(String userAgent, OkHttpClient httpClient) { + this.userAgent = userAgent; + this.client = httpClient; + } + + /** Probe the content type of the given URL with a HEAD request. + * This is used to detect binary files, which we don't want to crawl. + *

+ * If the URL redirects, the final URL is returned, to avoid redundant + * requests. + * + * @param url The URL to probe + * @return A ContentTypeProbeResult + */ + public ContentTypeProbeResult probeContentType(EdgeUrl url) { + logger.debug("Probing suspected binary {}", url); + + var headBuilder = new Request.Builder().head() + .addHeader("User-agent", userAgent) + .addHeader("Accept-Encoding", "gzip") + .url(url.toString()); + + var head = headBuilder.build(); + var call = client.newCall(head); + + try (var rsp = call.execute()) { + var contentTypeHeader = rsp.header("Content-type"); + + if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { + return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code()); + } + + // Update the URL to the final URL of the HEAD request, otherwise we might end up doing + + // HEAD 301 url1 -> url2 + // HEAD 200 url2 + // GET 301 url1 -> url2 + // GET 200 url2 + + // which is not what we want. Overall we want to do as few requests as possible to not raise + // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable + // that it looks like the traffic makes sense, as opposed to looking like a broken bot. + + var redirectUrl = new EdgeUrl(rsp.request().url().toString()); + EdgeUrl ret; + + if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl; + else ret = url; + + return new ContentTypeProbeResult.Ok(ret); + + } catch (SocketTimeoutException ex) { + return new ContentTypeProbeResult.Timeout(); + } catch (Exception ex) { + logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + + return new ContentTypeProbeResult.Exception(ex); + } + } + + public sealed interface ContentTypeProbeResult { + record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { } + record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { } + record Timeout() implements ContentTypeProbeResult { } + record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { } + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 11ad272e..70576510 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -3,7 +3,8 @@ package nu.marginalia.crawl.retreival.fetcher; import com.google.inject.ImplementedBy; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.retreival.RateLimitException; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -18,9 +19,9 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException; + HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException; - SimpleRobotRules fetchRobotRules(EdgeDomain domain); + SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); SitemapRetriever createSitemapRetriever(); } diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 5720ef34..ef6b48cb 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -7,43 +7,41 @@ import crawlercommons.robots.SimpleRobotRulesParser; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.retreival.RateLimitException; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.crawling.model.ContentType; +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; +import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; -import nu.marginalia.crawl.retreival.logic.ContentTypeParser; -import okhttp3.*; -import org.apache.commons.io.input.BOMInputStream; +import okhttp3.ConnectionPool; +import okhttp3.Dispatcher; +import okhttp3.OkHttpClient; +import okhttp3.Request; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.net.ssl.SSLException; import javax.net.ssl.X509TrustManager; -import java.io.EOFException; -import java.io.IOException; -import java.net.*; -import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; -import java.nio.charset.StandardCharsets; -import java.nio.charset.UnsupportedCharsetException; -import java.time.LocalDateTime; -import java.util.*; +import java.util.List; +import java.util.Objects; +import java.util.Optional; import java.util.concurrent.TimeUnit; -import java.util.zip.GZIPInputStream; + public class HttpFetcherImpl implements HttpFetcher { private final Logger logger = LoggerFactory.getLogger(getClass()); private final String userAgent; - private final int maxFetchSize = 1024*512; private final Cookies cookies = new Cookies(); private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser(); - private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private final ContentTypeProber contentTypeProber; @Override public void setAllowAllContentTypes(boolean allowAllContentTypes) { @@ -64,6 +62,7 @@ public class HttpFetcherImpl implements HttpFetcher { return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0]) .socketFactory(ftSocketFactory) .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer()) + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) .connectionPool(pool) .cookieJar(cookies.getJar()) .followRedirects(true) @@ -92,13 +91,22 @@ public class HttpFetcherImpl implements HttpFetcher { { 
this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; + this.contentTypeProber = new ContentTypeProber(userAgent, client); } public HttpFetcherImpl(@Named("user-agent") String userAgent) { this.client = createClient(null, new ConnectionPool()); this.userAgent = userAgent; + this.contentTypeProber = new ContentTypeProber(userAgent, client); } + /** + * Probe the domain to see if it is reachable, attempting to identify which schema to use, + * and if there are any redirects. This is done by one or more HEAD requests. + * + * @param url The URL to probe. + * @return The result of the probe, indicating the state and the URL. + */ @Override @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { @@ -130,8 +138,9 @@ public class HttpFetcherImpl implements HttpFetcher { @Override @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url, - ContentTags contentTags) + public HttpFetchResult fetchContent(EdgeUrl url, + WarcRecorder warcRecorder, + ContentTags contentTags) throws RateLimitException { @@ -139,268 +148,54 @@ public class HttpFetcherImpl implements HttpFetcher { // looks like it might be something else, we perform a HEAD first to check the content type if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { - logger.debug("Probing suspected binary {}", url); - - var headBuilder = new Request.Builder().head() - .addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip"); - - var head = headBuilder.build(); - var call = client.newCall(head); - - try (var rsp = call.execute()) { - var contentTypeHeader = rsp.header("Content-type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed"); - } - - // Update the URL to the final URL of the HEAD request, otherwise we might end up doing - - // HEAD 301 url1 -> url2 - // HEAD 200 url2 - // GET 301 url1 -> url2 - // GET 200 url2 - - // which is not what we want. Overall we want to do as few requests as possible to not raise - // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable - // that it looks like the traffic makes sense, as opposed to looking like a broken bot. 
- - var redirectUrl = new EdgeUrl(rsp.request().url().toString()); - if (Objects.equals(redirectUrl.domain, url.domain)) - url = redirectUrl; + ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url); + if (probeResult instanceof ContentTypeProbeResult.Ok ok) { + url = ok.resolvedUrl(); } - catch (SocketTimeoutException ex) { - return createTimeoutErrorRsp(url, ex); + else if (probeResult instanceof ContentTypeProbeResult.BadContentType badContentType) { + warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode()); + return new HttpFetchResult.ResultNone(); } - catch (Exception ex) { - logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); - return createHardErrorRsp(url, ex); + else if (probeResult instanceof ContentTypeProbeResult.Timeout timeout) { + warcRecorder.flagAsTimeout(url); + return new HttpFetchResult.ResultNone(); + } + else if (probeResult instanceof ContentTypeProbeResult.Exception exception) { + warcRecorder.flagAsError(url, exception.ex()); + return new HttpFetchResult.ResultNone(); + } } var getBuilder = new Request.Builder().get(); - getBuilder.addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip"); + getBuilder.url(url.toString()) + .addHeader("Accept-Encoding", "gzip") + .addHeader("User-agent", userAgent); contentTags.paint(getBuilder); - var get = getBuilder.build(); - var call = client.newCall(get); + HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build()); - try (var rsp = call.execute()) { - return extractBody(url, rsp); - } - catch (RateLimitException rle) { - throw rle; - } - catch (SocketTimeoutException ex) { - return createTimeoutErrorRsp(url, ex); - } - catch (UnknownHostException ex) { - return createUnknownHostError(url, ex); - } - catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) { - // This is a bit of a grab-bag of errors that crop up - // IllegalCharsetName is egg on our face, - // but SSLException and EOFException are probably the server's fault - - return createHardErrorRsp(url, ex); - } - catch (Exception ex) { - logger.error("Error during fetching", ex); - return createHardErrorRsp(url, ex); - } - } - - private CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc("Unknown Host") - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - - private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus("Timeout") - .crawlerStatusDesc(why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) { - return CrawledDocument.builder() - .crawlerStatus(status.toString()) - .crawlerStatusDesc(why) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - 
.httpStatus(rsp.code()) - .url(url.toString()) - .build(); - } - - private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException { - - var responseUrl = new EdgeUrl(rsp.request().url().toString()); - if (!Objects.equals(responseUrl.domain, url.domain)) { - return createRedirectResponse(url, rsp, responseUrl); - } - - if (rsp.code() == 429) { - throw new RateLimitException(rsp.header("Retry-After", "1000")); - } - - var body = rsp.body(); - if (null == body) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body"); - } - - var byteStream = body.byteStream(); - - if ("gzip".equals(rsp.header("Content-encoding"))) { - byteStream = new GZIPInputStream(byteStream); - } - byteStream = new BOMInputStream(byteStream); - - var contentTypeHeader = rsp.header("Content-type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - byte[] data = byteStream.readNBytes(maxFetchSize); - - var contentType = ContentTypeParser.parse(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); - } - - if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .crawlerStatusDesc("X-Robots-Tag") - .url(responseUrl.toString()) - .httpStatus(-1) - .timestamp(LocalDateTime.now().toString()) - .headers(rsp.headers().toString()) - .build(); - } - - var strData = getStringData(data, contentType); - var canonical = rsp.header("rel=canonical", ""); - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - .canonicalUrl(canonical) - .httpStatus(rsp.code()) - .url(responseUrl.toString()) - .documentBody(strData) - .build(); - } - - /** Check X-Robots-Tag header tag to see if we are allowed to index this page. - *

- * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag - * - * @param xRobotsHeaderTags List of X-Robots-Tag values - * @param userAgent User agent string - * @return true if we are allowed to index this page - */ - // Visible for tests - public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { - boolean isPermittedGeneral = true; - boolean isPermittedMarginalia = false; - boolean isForbiddenMarginalia = false; - - for (String header : xRobotsHeaderTags) { - if (header.indexOf(':') >= 0) { - String[] parts = StringUtils.split(header, ":", 2); - - if (parts.length < 2) - continue; - - // Is this relevant to us? - if (!Objects.equals(parts[0].trim(), userAgent)) - continue; - - if (parts[1].contains("noindex")) - isForbiddenMarginalia = true; - else if (parts[1].contains("none")) - isForbiddenMarginalia = true; - else if (parts[1].contains("all")) - isPermittedMarginalia = true; + if (result instanceof HttpFetchResult.ResultOk ok) { + if (ok.statusCode() == 429) { + String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000"); + throw new RateLimitException(retryAfter); } - else { - if (header.contains("noindex")) - isPermittedGeneral = false; - if (header.contains("none")) - isPermittedGeneral = false; + if (ok.statusCode() == 304) { + return new HttpFetchResult.Result304Raw(); + } + if (ok.statusCode() == 200) { + return ok; } } - if (isPermittedMarginalia) - return true; - if (isForbiddenMarginalia) - return false; - return isPermittedGeneral; - } - - private String getStringData(byte[] data, ContentType contentType) { - Charset charset; - try { - charset = Charset.forName(contentType.charset()); - } - catch (IllegalCharsetNameException ex) { - charset = StandardCharsets.UTF_8; - } - catch (UnsupportedCharsetException ex) { - // This is usually like Macintosh Latin - // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) - // - // It's close enough to 8859-1 to serve - charset = StandardCharsets.ISO_8859_1; - } - return new String(data, charset); - } - - private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) { - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) - .redirectUrl(responseUrl.toString()) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.code()) - .url(url.toString()) - .build(); - + return new HttpFetchResult.ResultNone(); } @Override - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { - return fetchRobotsForProto("https", domain) - .or(() -> fetchRobotsForProto("http", domain)) + public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { + return fetchRobotsForProto("https", recorder, domain) + .or(() -> fetchRobotsForProto("http", recorder, domain)) .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL)); } @@ -409,21 +204,31 @@ public class HttpFetcherImpl implements HttpFetcher { return new SitemapRetriever(); } - private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { + private Optional fetchRobotsForProto(String proto, WarcRecorder recorder, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty()))); + + var getBuilder = new Request.Builder().get(); + + getBuilder.url(url.toString()) + .addHeader("Accept-Encoding", "gzip") + 
.addHeader("User-agent", userAgent); + + HttpFetchResult result = recorder.fetch(client, getBuilder.build()); + + return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) -> + robotsParser.parseContent(url.toString(), + body, + contentType.toString(), + userAgent) + ); + } catch (Exception ex) { return Optional.empty(); } } - private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { - return robotsParser.parseContent(doc.url, - doc.documentBody.getBytes(), - doc.contentType, - userAgent); - } } + diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java similarity index 96% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java index add64e29..ffb29b33 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival.fetcher.socket; import javax.net.SocketFactory; import java.io.IOException; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java new file mode 100644 index 00000000..90f43e5c --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java @@ -0,0 +1,31 @@ +package nu.marginalia.crawl.retreival.fetcher.socket; + +import okhttp3.Interceptor; +import okhttp3.Response; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; + + +/** An interceptor that intercepts network requests and adds the remote IP address as + * a header in the response. This is used to pass the remote IP address to the Warc + * writer, as this information is not available in the response. 
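+ * (Annotation, not part of the patch: WarcProtocolReconstructor later filters out any header
+ * starting with X-Marginalia when the response headers are rewritten, so this pseudo-header is
+ * not copied into the archived WARC records.)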
+ */ +public class IpInterceptingNetworkInterceptor implements Interceptor { + private static final String pseudoHeaderName = "X-Marginalia-Remote-IP"; + + @NotNull + @Override + public Response intercept(@NotNull Interceptor.Chain chain) throws IOException { + String IP = chain.connection().socket().getInetAddress().getHostAddress(); + + return chain.proceed(chain.request()) + .newBuilder() + .addHeader(pseudoHeaderName, IP) + .build(); + } + + public static String getIpFromResponse(Response response) { + return response.header(pseudoHeaderName); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java similarity index 89% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java index f86d2c48..b6b8a589 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival.fetcher.socket; import lombok.SneakyThrows; @@ -8,6 +8,8 @@ import java.security.cert.X509Certificate; public class NoSecuritySSL { // Create a trust manager that does not validate certificate chains + // We want to accept e.g. self-signed certificates and certificates + // that are not signed by a CA is generally trusted by the system. public static final TrustManager[] trustAllCerts = new TrustManager[]{ new X509TrustManager() { @Override @@ -27,7 +29,6 @@ public class NoSecuritySSL { } }; - @SneakyThrows public static SSLSocketFactory buildSocketFactory() { // Install the all-trusting trust manager diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java new file mode 100644 index 00000000..6fd020b4 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -0,0 +1,33 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import org.netpreserve.jwarc.WarcDigest; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +class WarcDigestBuilder { + private final MessageDigest digest; + + private static final String digestAlgorithm = "SHA-1"; + + public WarcDigestBuilder() throws NoSuchAlgorithmException { + this.digest = MessageDigest.getInstance(digestAlgorithm); + } + + public void update(String s) { + byte[] bytes = s.getBytes(); + update(bytes, bytes.length); + } + + public void update(byte[] buffer, int n) { + update(buffer, 0, n); + } + + public void update(byte[] buffer, int s, int n) { + digest.update(buffer, s, n); + } + + public WarcDigest build() { + return new WarcDigest(digest); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java new file mode 100644 index 00000000..ad29056f --- /dev/null +++ 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -0,0 +1,170 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import okhttp3.Protocol; +import okhttp3.Request; +import okhttp3.Response; +import org.apache.commons.lang3.StringUtils; + +import java.net.URI; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Map; +import java.util.StringJoiner; +import java.util.stream.Collectors; + +/** We don't have access to the raw HTTP request and response, so we need to reconstruct them + * as best is possible from the data we have available. + */ +public class WarcProtocolReconstructor { + + static String getHttpRequestString(Request request, URI uri) { + StringBuilder requestStringBuilder = new StringBuilder(); + + final String encodedURL = encodeURLKeepSlashes(uri.getPath()); + + requestStringBuilder.append(request.method()).append(" ").append(encodedURL); + + if (uri.getQuery() != null) { + requestStringBuilder.append("?").append(uri.getQuery()); + } + requestStringBuilder.append(" HTTP/1.1\r\n"); + requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n"); + + request.headers().toMultimap().forEach((k, values) -> { + for (var value : values) { + requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n"); + } + }); + + return requestStringBuilder.toString(); + } + + /** Java's URLEncoder will URLEncode slashes, which is not desirable + * when sanitizing a URL for HTTP protocol purposes + */ + + private static String encodeURLKeepSlashes(String URL) { + String[] parts = StringUtils.split(URL,"/"); + StringJoiner joiner = new StringJoiner("/"); + for (String part : parts) { + joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8)); + } + return joiner.toString(); + } + + static String getResponseHeader(String headersAsString, int code) { + String version = "1.1"; + + String statusCode = String.valueOf(code); + String statusMessage = STATUS_CODE_MAP.getOrDefault(code, "Unknown"); + + String headerString = getHeadersAsString(headersAsString); + + return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + } + + static String getResponseHeader(Response response) { + String version = response.protocol() == Protocol.HTTP_1_1 ? 
"1.1" : "2.0"; + + String statusCode = String.valueOf(response.code()); + String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown"); + + String headerString = getHeadersAsString(response); + + return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + } + + private static final Map STATUS_CODE_MAP = Map.ofEntries( + Map.entry(200, "OK"), + Map.entry(201, "Created"), + Map.entry(202, "Accepted"), + Map.entry(203, "Non-Authoritative Information"), + Map.entry(204, "No Content"), + Map.entry(205, "Reset Content"), + Map.entry(206, "Partial Content"), + Map.entry(207, "Multi-Status"), + Map.entry(208, "Already Reported"), + Map.entry(226, "IM Used"), + Map.entry(300, "Multiple Choices"), + Map.entry(301, "Moved Permanently"), + Map.entry(302, "Found"), + Map.entry(303, "See Other"), + Map.entry(304, "Not Modified"), + Map.entry(307, "Temporary Redirect"), + Map.entry(308, "Permanent Redirect"), + Map.entry(400, "Bad Request"), + Map.entry(401, "Unauthorized"), + Map.entry(403, "Forbidden"), + Map.entry(404, "Not Found"), + Map.entry(405, "Method Not Allowed"), + Map.entry(406, "Not Acceptable"), + Map.entry(408, "Request Timeout"), + Map.entry(409, "Conflict"), + Map.entry(410, "Gone"), + Map.entry(411, "Length Required"), + Map.entry(412, "Precondition Failed"), + Map.entry(413, "Payload Too Large"), + Map.entry(414, "URI Too Long"), + Map.entry(415, "Unsupported Media Type"), + Map.entry(416, "Range Not Satisfiable"), + Map.entry(417, "Expectation Failed"), + Map.entry(418, "I'm a teapot"), + Map.entry(421, "Misdirected Request"), + Map.entry(426, "Upgrade Required"), + Map.entry(428, "Precondition Required"), + Map.entry(429, "Too Many Requests"), + Map.entry(431, "Request Header Fields Too Large"), + Map.entry(451, "Unavailable For Legal Reasons"), + Map.entry(500, "Internal Server Error"), + Map.entry(501, "Not Implemented"), + Map.entry(502, "Bad Gateway"), + Map.entry(503, "Service Unavailable"), + Map.entry(504, "Gateway Timeout"), + Map.entry(505, "HTTP Version Not Supported"), + Map.entry(506, "Variant Also Negotiates"), + Map.entry(507, "Insufficient Storage"), + Map.entry(508, "Loop Detected"), + Map.entry(510, "Not Extended"), + Map.entry(511, "Network Authentication Required") + ); + + static private String getHeadersAsString(String headersBlob) { + StringJoiner joiner = new StringJoiner("\r\n"); + + Arrays.stream(headersBlob.split("\n")).forEach(joiner::add); + + return joiner.toString(); + } + + static private String getHeadersAsString(Response response) { + StringJoiner joiner = new StringJoiner("\r\n"); + + response.headers().toMultimap().forEach((k, values) -> { + String headerCapitalized = capitalizeHeader(k); + + // Omit pseudoheaders injected by the crawler itself + if (headerCapitalized.startsWith("X-Marginalia")) + return; + + // Omit Transfer-Encoding header, as we'll be using Content-Length + // instead in the warc file, despite what the server says + if (headerCapitalized.startsWith("Transfer-Encoding")) + return; + + for (var value : values) { + joiner.add(headerCapitalized + ": " + value); + } + }); + return joiner.toString(); + } + + // okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style + // for the WARC parser's sake... 
+ static private String capitalizeHeader(String k) { + return Arrays.stream(StringUtils.split(k, '-')) + .map(StringUtils::capitalize) + .collect(Collectors.joining("-")); + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java new file mode 100644 index 00000000..e31585ef --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -0,0 +1,402 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import nu.marginalia.crawl.retreival.DomainProber; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.time.Instant; +import java.util.*; + +/** Based on JWarc's fetch method, APL 2.0 license + *

+ * This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file, + * as best is possible given not all the data is available at the same time and needs to + * be reconstructed. + */ +public class WarcRecorder implements AutoCloseable { + private static final int MAX_TIME = 30_000; + private static final int MAX_SIZE = 1024 * 1024 * 10; + private final WarcWriter writer; + private final Path warcFile; + private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class); + + private final ThreadLocal bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]); + + private boolean temporaryFile = false; + + // Affix a version string in case we need to change the format in the future + // in some way + private final String warcRecorderVersion = "1.0"; + + // We need to know if the site uses cookies so this can be reported among the search results + // -- flip this to true if we see any cookies. This information will also be painted on any + // revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough. + private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader(); + + /** + * Create a new WarcRecorder that will write to the given file + * + * @param warcFile The file to write to + */ + public WarcRecorder(Path warcFile) throws IOException { + this.warcFile = warcFile; + this.writer = new WarcWriter(warcFile); + } + + /** + * Create a new WarcRecorder that will write to a temporary file + * and delete it when close() is called. + */ + public WarcRecorder() throws IOException { + this.warcFile = Files.createTempFile("warc", ".warc.gz"); + this.writer = new WarcWriter(this.warcFile); + + temporaryFile = true; + } + + public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, + IOException, + URISyntaxException, + InterruptedException + { + URI requestUri = request.url().uri(); + + WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); + WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); + + String ip; + Instant date = Instant.now(); + long startMillis = date.toEpochMilli(); + + var call = client.newCall(request); + + int totalLength = 0; + + WarcTruncationReason truncationReason = null; + + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); + + cookieInformation.update(client, request.url()); + + try (var response = call.execute()) { + var body = response.body(); + InputStream inputStream; + + if (body == null) { + inputStream = null; + truncationReason = WarcTruncationReason.DISCONNECT; + } + else { + inputStream = body.byteStream(); + } + + ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); + + String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response); + + responseDataBuffer.put(responseHeaders); + responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length()); + + int dataStart = responseDataBuffer.pos(); + + while (inputStream != null) { + int remainingLength = responseDataBuffer.remaining(); + if (remainingLength == 0) + break; + + int startPos = responseDataBuffer.pos(); + + int n = responseDataBuffer.readFrom(inputStream, remainingLength); + if (n < 0) + break; + + responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n); + responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n); + totalLength += n; + + if (MAX_TIME > 0 && System.currentTimeMillis() - startMillis > MAX_TIME) { + truncationReason = 
WarcTruncationReason.TIME; + break; + } + if (MAX_SIZE > 0 && totalLength >= MAX_SIZE) { + truncationReason = WarcTruncationReason.LENGTH; + break; + } + } + + // It looks like this might be the same as requestUri, but it's not; + // it's the URI after resolving redirects. + final URI responseUri = response.request().url().uri(); + + WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) + .blockDigest(responseDigestBuilder.build()) + .date(date) + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); + + cookieInformation.paint(responseBuilder); + + if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip)); + + responseBuilder.payloadDigest(payloadDigestBuilder.build()); + + if (truncationReason != null) + responseBuilder.truncated(truncationReason); + + // Build and write the response + + var warcResponse = responseBuilder.build(); + warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + writer.write(warcResponse); + + // Build and write the request + + WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder(); + + String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), requestUri); + + requestDigestBuilder.update(httpRequestString); + + WarcRequest warcRequest = new WarcRequest.Builder(requestUri) + .blockDigest(requestDigestBuilder.build()) + .date(date) + .body(MediaType.HTTP_REQUEST, httpRequestString.getBytes()) + .concurrentTo(warcResponse.id()) + .build(); + warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + writer.write(warcRequest); + + return new HttpFetchResult.ResultOk(responseUri, + response.code(), + response.headers(), + ip, + responseDataBuffer.data, + dataStart, + responseDataBuffer.length() - dataStart); + } + catch (Exception ex) { + logger.warn("Failed to fetch URL {}", requestUri, ex); + return new HttpFetchResult.ResultException(ex); + } + } + + public void resync(WarcRecord item) throws IOException { + writer.write(item); + } + + private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody) { + try { + WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); + WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); + + byte[] bytes = documentBody.getBytes(); + + String fakeHeaders = STR.""" + Content-Type: \{contentType} + Content-Length: \{bytes.length} + Content-Encoding: UTF-8 + """; + + String header = WarcProtocolReconstructor.getResponseHeader(fakeHeaders, statusCode); + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); + responseDataBuffer.put(header); + + responseDigestBuilder.update(header); + + responseDigestBuilder.update(bytes, bytes.length); + payloadDigestBuilder.update(bytes, bytes.length); + responseDataBuffer.put(bytes, 0, bytes.length); + + WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI()) + .blockDigest(responseDigestBuilder.build()) + .payloadDigest(payloadDigestBuilder.build()) + .date(Instant.now()) + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); + + cookieInformation.paint(builder); + + var reference = builder.build(); + + reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + + writer.write(reference); + + } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + + /** + * Flag the given URL as skipped by the 
crawler, so that it will not be retried. + * Which URLs were skipped is still important when resynchronizing on the WARC file, + * so that the crawler can avoid re-fetching them. + */ + public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) { + saveOldResponse(url, contentType, statusCode, documentBody); + } + + /** + * Write a reference copy of the given document data. This is used when the crawler provides + * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this + * scenario we want to record the data as it was in the previous crawl, but not re-fetch it. + */ + public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody) { + saveOldResponse(url, contentType, statusCode, documentBody); + } + + public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException { + + Map> fields = new HashMap<>(); + fields.put("ip", List.of(ip)); + fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}")); + fields.put("domain", List.of(domain.toString())); + + switch (result) { + case DomainProber.ProbeResultRedirect redirectDomain: + fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}")); + break; + case DomainProber.ProbeResultError error: + fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}")); + break; + case DomainProber.ProbeResultOk ok: + fields.put("X-WARC-Probe-Status", List.of("OK")); + break; + } + + var warcinfo = new Warcinfo.Builder() + .date(Instant.now()) + .fields(fields) + .recordId(UUID.randomUUID()) + .build(); + + writer.write(warcinfo); + } + + public void flagAsRobotsTxtError(EdgeUrl top) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN) + .date(Instant.now()) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN) + .date(Instant.now()) + .addHeader("Rejected-Content-Type", contentType) + .addHeader("Http-Status", Integer.toString(status)) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsError(EdgeUrl url, Exception ex) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError) + .date(Instant.now()) + .addHeader("Exception", ex.getClass().getSimpleName()) + .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), "")) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsTimeout(EdgeUrl url) { + try { + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout) + .date(Instant.now()) + .build(); + + writer.write(refusal); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + private class ResponseDataBuffer { + private final byte[] data; + private int length = 0; + private int pos = 0; + + public ResponseDataBuffer() { + data = bufferThreadLocal.get(); + } + + public int pos() { + 
return pos; + } + public int length() { + return length; + } + + public void put(String s) { + byte[] bytes = s.getBytes(); + put(bytes, 0, bytes.length); + } + + private void put(byte[] bytes, int i, int n) { + System.arraycopy(bytes, i, data, pos, n); + pos += n; + length += n; + } + + public int readFrom(InputStream inputStream, int remainingLength) throws IOException { + int n = inputStream.read(data, pos, remainingLength); + if (n > 0) { + pos += n; + length += n; + } + return n; + } + + public int remaining() { + return MAX_SIZE - pos; + } + + public void updateDigest(WarcDigestBuilder digestBuilder, int startPos, int n) { + digestBuilder.update(data, startPos, n); + } + + public byte[] copyBytes() { + byte[] copy = new byte[length]; + System.arraycopy(data, 0, copy, 0, length); + return copy; + } + + } + + public void close() { + try { + writer.close(); + if (temporaryFile) + Files.deleteIfExists(warcFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java new file mode 100644 index 00000000..91c21d65 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -0,0 +1,108 @@ +package nu.marginalia.crawl.retreival.revisit; + +import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.CrawlDelayTimer; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; + +/** This class encapsulates the logic for re-visiting a domain that has already been crawled. + * We may use information from the previous crawl to inform the next crawl, specifically the + * E-Tag and Last-Modified headers. 
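+ * If the reference document appears unchanged (per a 304 response or an identical body),
+ * the stored copy is written back to the WARC file instead of being re-downloaded.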
+ */ +public class CrawlerRevisitor { + private final DomainCrawlFrontier crawlFrontier; + private final CrawlerRetreiver crawlerRetreiver; + private final WarcRecorder warcRecorder; + + public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier, + CrawlerRetreiver crawlerRetreiver, + WarcRecorder warcRecorder) { + this.crawlFrontier = crawlFrontier; + this.crawlerRetreiver = crawlerRetreiver; + this.warcRecorder = warcRecorder; + } + + /** Performs a re-crawl of old documents, comparing etags and last-modified */ + public int recrawl(CrawlDataReference oldCrawlData, + SimpleRobotRules robotsRules, + CrawlDelayTimer delayTimer) + throws InterruptedException { + int recrawled = 0; + int retained = 0; + + for (;;) { + CrawledDocument doc = oldCrawlData.nextDocument(); + + if (doc == null) { + break; + } + + // This Shouldn't Happen (TM) + var urlMaybe = EdgeUrl.parse(doc.url); + if (urlMaybe.isEmpty()) continue; + var url = urlMaybe.get(); + + // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again + if (doc.httpStatus == 404) { + crawlFrontier.addVisited(url); + continue; + } + + if (doc.httpStatus != 200) continue; + + if (!robotsRules.isAllowed(url.toString())) { + warcRecorder.flagAsRobotsTxtError(url); + continue; + } + if (!crawlFrontier.filterLink(url)) + continue; + if (!crawlFrontier.addVisited(url)) + continue; + + + if (recrawled > 5 + && retained > 0.9 * recrawled + && Math.random() < 0.9) + { + // Since it looks like most of these documents haven't changed, + // we'll load the documents directly; but we do this in a random + // fashion to make sure we eventually catch changes over time + // and ensure we discover new links + + crawlFrontier.addVisited(url); + + // Hoover up any links from the document + if (doc.httpStatus == 200 && doc.documentBody != null) { + var parsedDoc = Jsoup.parse(doc.documentBody); + crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); + } + + // Add a WARC record so we don't repeat this + warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody); + + continue; + } + + + // GET the document with the stored document as a reference + // providing etag and last-modified headers, so we can recycle the + // document if it hasn't changed without actually downloading it + + var reference = new DocumentWithReference(doc, oldCrawlData); + var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference); + + if (reference.isSame(result)) { + retained++; + } + + recrawled++; + } + + return recrawled; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java new file mode 100644 index 00000000..a0559aec --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -0,0 +1,77 @@ +package nu.marginalia.crawl.retreival.revisit; + +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawling.model.CrawledDocument; + +import javax.annotation.Nullable; + +public record DocumentWithReference( + @Nullable CrawledDocument doc, + @Nullable CrawlDataReference reference) { + + private static final 
DocumentWithReference emptyInstance = new DocumentWithReference(null, null); + + public static DocumentWithReference empty() { + return emptyInstance; + } + + /** Returns true if the provided document is the same as the reference document, + * or if the result was retained via HTTP 304. + */ + public boolean isSame(HttpFetchResult result) { + if (result instanceof HttpFetchResult.Result304Raw) + return true; + if (result instanceof HttpFetchResult.Result304ReplacedWithReference) + return true; + + if (!(result instanceof HttpFetchResult.ResultOk resultOk)) + return false; + + if (reference == null) + return false; + if (doc == null) + return false; + if (doc.documentBody == null) + return false; + + if (!(DocumentBodyExtractor.asString(resultOk) instanceof DocumentBodyResult.Ok bodyOk)) { + return false; + } + + return reference.isContentBodySame(doc.documentBody, bodyOk.body()); + } + + public ContentTags getContentTags() { + if (null == doc) + return ContentTags.empty(); + + String headers = doc.headers; + if (headers == null) + return ContentTags.empty(); + + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if (line.toLowerCase().startsWith("etag:")) { + etag = line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + return new ContentTags(etag, lastmod); + } + + public boolean isEmpty() { + return doc == null || reference == null; + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java new file mode 100644 index 00000000..3ce33d64 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java @@ -0,0 +1,71 @@ +package nu.marginalia.crawl.retreival.sitemap; + +import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class SitemapFetcher { + + private final DomainCrawlFrontier crawlFrontier; + private final SitemapRetriever sitemapRetriever; + private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class); + + public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) { + this.crawlFrontier = crawlFrontier; + this.sitemapRetriever = sitemapRetriever; + } + + public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) { + List sitemaps = robotsRules.getSitemaps(); + + List urls = new ArrayList<>(sitemaps.size()); + if (!sitemaps.isEmpty()) { + for (var url : sitemaps) { + EdgeUrl.parse(url).ifPresent(urls::add); + } + } + else { + urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); + } + + downloadSitemaps(urls); + } + + public void downloadSitemaps(List urls) { + + Set checkedSitemaps = new HashSet<>(); + + for (var url : urls) { + // Let's not download sitemaps from other domains for now + if (!crawlFrontier.isSameDomain(url)) { + continue; + } + + if (checkedSitemaps.contains(url.path)) + continue; + + var sitemap = sitemapRetriever.fetchSitemap(url); + if (sitemap.isEmpty()) { + continue; + } + + 
// ensure we don't try to download this sitemap again + // (don't move this up, as we may want to check the same + // path with different protocols until we find one that works) + + checkedSitemaps.add(url.path); + + crawlFrontier.addAllToQueue(sitemap); + } + + logger.debug("Queue is now {}", crawlFrontier.queueSize()); + } +} diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java new file mode 100644 index 00000000..ae3d9be4 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java @@ -0,0 +1,88 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRequest; +import org.netpreserve.jwarc.WarcResponse; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.util.List; +import java.util.zip.GZIPInputStream; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawlerWarcResynchronizerTest { + Path fileName; + Path outputFile; + OkHttpClient httpClient; + @BeforeEach + public void setUp() throws Exception { + httpClient = new OkHttpClient.Builder() + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) + .build(); + + fileName = Files.createTempFile("test", ".warc.gz"); + outputFile = Files.createTempFile("test", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws Exception { + Files.deleteIfExists(fileName); + Files.deleteIfExists(outputFile); + } + + @Test + void run() throws IOException, URISyntaxException { + try (var oldRecorder = new WarcRecorder(fileName)) { + fetchUrl(oldRecorder, "https://www.marginalia.nu/"); + fetchUrl(oldRecorder, "https://www.marginalia.nu/log/"); + fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/"); + } catch (Exception e) { + fail(e); + } + + var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100); + + try (var newRecorder = new WarcRecorder(outputFile)) { + new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName); + } + + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/"))); + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/"))); + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/feed/"))); + + try (var warcReader = new WarcReader(outputFile)) { + for (var item : warcReader) { + if (item instanceof WarcRequest req) { + System.out.println("req:" + req.target()); + } + if (item instanceof WarcResponse rsp) { + System.out.println("req:" + rsp.target()); + } + } + } + + new GZIPInputStream(Files.newInputStream(outputFile)).transferTo(System.out); + } + + void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + var req = new 
Request.Builder().url(url) + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build(); + recorder.fetch(httpClient, req); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java new file mode 100644 index 00000000..4a015fb9 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java @@ -0,0 +1,59 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType; +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok; +import nu.marginalia.model.EdgeUrl; +import okhttp3.ConnectionPool; +import okhttp3.Dispatcher; +import okhttp3.OkHttpClient; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.*; + +class ContentTypeProberTest { + + ContentTypeProber prober; + + @BeforeEach + void setUp() { + OkHttpClient client = new OkHttpClient.Builder() + .dispatcher(new Dispatcher(Executors.newVirtualThreadPerTaskExecutor())) + .connectionPool(new ConnectionPool(0, 1, TimeUnit.NANOSECONDS)) + .build(); + + prober = new ContentTypeProber("test.marginalia.nu", client); + } + + @Test + void probeContentType() throws URISyntaxException { + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/robots.txt")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/robots.txt")), + "robots.txt is expected to pass the probing test since it's text/plain" + ); + + assertEquals( + new BadContentType("image/png", 200), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/sanic.png")), + "sanic.png is expected to pass the probing test since it's image/png" + ); + + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/dev/null")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/dev/null")), + "Despite being a 404, we expect this to be passed as OK as it's NotMyJob(TM) to verify response codes" + ); + + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi/")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi")), + "about.gmi is expected to give a redirect to about.gmi/ which is served as text/html" + ); + + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java index 27b55760..e5673a6a 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import org.junit.jupiter.api.Test; import java.util.List; @@ -7,30 +8,30 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.assertFalse; import static 
org.junit.jupiter.api.Assertions.assertTrue; -class HttpFetcherImplTest { +class CrawledDocumentParquetRecordFileWriterTest { @Test public void testXRobotsTag() { - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), 
"search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java new file mode 100644 index 00000000..cdc10bd2 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -0,0 +1,147 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.UserAgent; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.*; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class WarcRecorderTest { + Path fileNameWarc; + Path fileNameParquet; + WarcRecorder client; + OkHttpClient httpClient; + @BeforeEach + public void setUp() throws Exception { + httpClient = new OkHttpClient.Builder() + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) + .build(); + + fileNameWarc = Files.createTempFile("test", ".warc"); + fileNameParquet = Files.createTempFile("test", ".parquet"); + + client = new WarcRecorder(fileNameWarc); + } + + @AfterEach + public void tearDown() throws Exception { + client.close(); + Files.delete(fileNameWarc); + } + + @Test + void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + + Map sampleData = new HashMap<>(); + try (var warcReader = new WarcReader(fileNameWarc)) { + warcReader.forEach(record -> { + if (record instanceof WarcRequest req) { + sampleData.put(record.type(), req.target()); + } + if (record 
instanceof WarcResponse rsp) { + sampleData.put(record.type(), rsp.target()); + } + }); + } + + assertEquals("https://www.marginalia.nu/", sampleData.get("request")); + assertEquals("https://www.marginalia.nu/", sampleData.get("response")); + } + + @Test + public void flagAsSkipped() throws IOException, URISyntaxException { + + try (var recorder = new WarcRecorder(fileNameWarc)) { + recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + "text/html", + 200, + "test"); + } + + try (var reader = new WarcReader(fileNameWarc)) { + for (var record : reader) { + if (record instanceof WarcResponse rsp) { + assertEquals("https://www.marginalia.nu/", rsp.target()); + assertEquals("text/html", rsp.contentType().type()); + assertEquals(200, rsp.http().status()); + assertEquals("1", rsp.http().headers().first("X-Cookies").orElse(null)); + } + } + } + } + + @Test + public void testSaveImport() throws URISyntaxException, IOException { + try (var recorder = new WarcRecorder(fileNameWarc)) { + recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + "text/html", + 200, + "test"); + } + + try (var reader = new WarcReader(fileNameWarc)) { + WarcXResponseReference.register(reader); + + for (var record : reader) { + System.out.println(record.type()); + System.out.println(record.getClass().getSimpleName()); + if (record instanceof WarcXResponseReference rsp) { + assertEquals("https://www.marginalia.nu/", rsp.target()); + } + } + } + + } + + @Test + public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.close(); + + CrawledDocumentParquetRecordFileWriter.convertWarc( + "www.marginalia.nu", + new UserAgent("test"), + fileNameWarc, + fileNameParquet); + + var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); + assertEquals(3, urls.size()); + assertEquals("https://www.marginalia.nu/", urls.get(0)); + assertEquals("https://www.marginalia.nu/log/", urls.get(1)); + assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2)); + + } + +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 5893910f..0873924f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -4,11 +4,15 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import 
nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.net.URISyntaxException; class HttpFetcherTest { @@ -28,16 +32,25 @@ class HttpFetcherTest { } @Test - void fetchUTF8() throws URISyntaxException, RateLimitException { + void fetchUTF8() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty()); - System.out.println(str.contentType); + try (var recorder = new WarcRecorder()) { + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); + if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { + System.out.println(bodyOk.contentType()); + } + } } @Test - void fetchText() throws URISyntaxException, RateLimitException { + void fetchText() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty()); - System.out.println(str); + + try (var recorder = new WarcRecorder()) { + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); + if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { + System.out.println(bodyOk.contentType()); + } + } } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index b65e5ae6..749b821c 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -5,6 +5,8 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; +import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -12,17 +14,16 @@ import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.test.CommonTestData; +import okhttp3.Headers; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; public class CrawlerMockFetcherTest { @@ -61,44 +62,42 @@ public class CrawlerMockFetcherTest { } + void crawl(CrawlSpecRecord spec) throws IOException { + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder) + .fetch(); + } + } + @Test - public 
void testLemmy() throws URISyntaxException { + public void testLemmy() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) - .fetch(); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>())); } @Test - public void testMediawiki() throws URISyntaxException { + public void testMediawiki() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) - .fetch(); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>())); } @Test - public void testDiscourse() throws URISyntaxException { + public void testDiscourse() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) - .fetch(); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>())); } class MockFetcher implements HttpFetcher { @@ -118,25 +117,28 @@ public class CrawlerMockFetcherTest { return new FetchResult(FetchResultState.OK, url); } + @SneakyThrows @Override - public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) { + public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { - return mockData.get(url); - } - else { - return CrawledDocument.builder() - .crawlId("1") - .url(url.toString()) - .contentType("text/html") - .httpStatus(404) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); + byte[] bodyBytes = mockData.get(url).documentBody.getBytes(); + return new HttpFetchResult.ResultOk( + url.asURI(), + 200, + new Headers.Builder().build(), + "127.0.0.1", + bodyBytes, + 0, + bodyBytes.length + ); } + + return new HttpFetchResult.ResultNone(); } @Override - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { + public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { return new SimpleRobotRules(); } @@ -144,5 +146,6 @@ public class CrawlerMockFetcherTest { public SitemapRetriever createSitemapRetriever() { return Mockito.mock(SitemapRetriever.class); } + } } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java 
b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index e7742445..286f15f5 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -8,6 +8,7 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.model.CrawledDocument; @@ -15,22 +16,24 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; +import org.netpreserve.jwarc.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @Tag("slow") class CrawlerRetreiverTest { private HttpFetcher httpFetcher; + Path tempFile; + Path tempFile2; @BeforeEach public void setUp() { httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D"); @@ -43,8 +46,62 @@ class CrawlerRetreiverTest { System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); } + @AfterEach + public void tearDown() throws IOException { + if (tempFile != null) { + Files.deleteIfExists(tempFile); + } + if (tempFile2 != null) { + Files.deleteIfExists(tempFile2); + } + } @Test - public void testWithKnownDomains() { + public void testWarcOutput() throws IOException { + var specs = CrawlSpecRecord + .builder() + .crawlDepth(5) + .domain("www.marginalia.nu") + .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/")) + .build(); + Path tempFile = null; + try { + tempFile = Files.createTempFile("crawling-process", "warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } catch (IOException ex) { + Assertions.fail(ex); + } + + Set requests = new HashSet<>(); + Set responses = new HashSet<>(); + + try (var reader = new WarcReader(tempFile)) { + reader.forEach(record -> { + if (record instanceof WarcRequest req) { + requests.add(req.target()); + System.out.println(req.type() + ":" + req.target()); + } + else if (record instanceof WarcResponse rsp) { + responses.add(rsp.target()); + System.out.println(rsp.type() + ":" + rsp.target()); + } + else { + System.out.println(record.type()); + } + }); + } + + assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/")); + assertEquals(requests, responses); + } + finally { + if (tempFile != null) + Files.deleteIfExists(tempFile); + } + } + @Test + public void testWithKnownDomains() throws IOException { var specs = CrawlSpecRecord .builder() .crawlDepth(5) @@ -54,10 +111,30 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), 
specs, data::add).fetch(); + tempFile = Files.createTempFile("crawling-process", ".warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + if (stream.next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } var fetchedUrls = - data.stream().filter(CrawledDocument.class::isInstance) + data.stream() + .peek(System.out::println) + .filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) .map(doc -> doc.url) .collect(Collectors.toSet()); @@ -72,7 +149,7 @@ class CrawlerRetreiverTest { } @Test - public void testEmptySet() { + public void testEmptySet() throws IOException { var specs = CrawlSpecRecord .builder() @@ -81,9 +158,29 @@ class CrawlerRetreiverTest { .urls(List.of()) .build(); + List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, data::add).fetch(); + tempFile = Files.createTempFile("crawling-process", ".warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + if (stream.next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } + data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) @@ -115,33 +212,70 @@ class CrawlerRetreiverTest { .build(); - Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, specs.domain, "idid"); + tempFile = Files.createTempFile("crawling-process", ".warc.gz"); + tempFile2 = Files.createTempFile("crawling-process", ".warc.gz"); + Map, List> data = new HashMap<>(); - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { - data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); - if (Math.random() > 0.5) { - doc.headers = ""; - } - } - writer.accept(d); - }).fetch(); - writer.close(); + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } - var reader = new CrawledDomainReader(); - var stream = reader.createDataStream(out, specs.domain, "idid"); + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + var doc = stream.next(); + data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + var stream = CrawledDomainReader.createDataStream(tempFile); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); + try (var recorder = new WarcRecorder(tempFile2)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(), + new CrawlDataReference(stream)); + } + catch 
(IOException ex) { + Assertions.fail(ex); + } - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, d -> { - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out); + + try (var reader = new WarcReader(tempFile2)) { + WarcXResponseReference.register(reader); + + reader.forEach(record -> { + if (record instanceof WarcResponse rsp) { + try { + System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + if (record instanceof WarcMetadata rsp) { + System.out.println("meta:" + rsp.target()); + } + }); + } + + try (var ds = CrawledDomainReader.createDataStream(tempFile2)) { + while (ds.hasNext()) { + var doc = ds.next(); + if (doc instanceof CrawledDomain dr) { + System.out.println(dr.domain + "/" + dr.crawlerStatus); + } + else if (doc instanceof CrawledDocument dc) { + System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus); + } } - }).fetch(new DomainLinks(), new CrawlDataReference(stream)); + } catch (Exception e) { + throw new RuntimeException(e); + } } } \ No newline at end of file diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 275f4092..4af4852e 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -32,6 +32,7 @@ public class ConvertActor extends RecordActorPrototype { public record Convert(FileStorageId fid) implements ActorStep {}; public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {}; public record ConvertDirtree(String source) implements ActorStep {}; + public record ConvertWarc(String source) implements ActorStep {}; public record ConvertStackexchange(String source) implements ActorStep {}; @Resume(behavior = ActorResumeBehavior.RETRY) public record ConvertWait(FileStorageId destFid, @@ -74,6 +75,25 @@ public class ConvertActor extends RecordActorPrototype { mqConverterOutbox.sendAsync(ConvertRequest.forDirtree(sourcePath, processedArea.id())) ); } + case ConvertWarc(String source) -> { + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + yield new Error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Warc Data; " + fileName); + + storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); + + yield new ConvertWait( + processedArea.id(), + mqConverterOutbox.sendAsync(ConvertRequest.forWarc(sourcePath, processedArea.id())) + ); + } case ConvertEncyclopedia(String source, String baseUrl) -> { Path sourcePath = Path.of(source); diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 0af77acb..353ef965 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ 
b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -63,8 +63,6 @@ public class ExportAtagsActor extends RecordActorPrototype { Path inputDir = storageService.getStorage(crawlId).asPath(); - var reader = new CrawledDomainReader(); - try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); ) { @@ -78,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = reader.createDataStream(crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { exportLinks(tagWriter, stream); } catch (Exception ex) { diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index b8bf0a5a..f00bace2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -170,6 +170,7 @@ public class IndexQueryService extends IndexApiImplBase { } } + // GRPC endpoint @SneakyThrows public void query(nu.marginalia.index.api.RpcIndexQuery request, diff --git a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java index 1a73a952..4322d3fc 100644 --- a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java +++ b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java @@ -29,13 +29,11 @@ public class CrawlDataUnfcker { return; } - var reader = new CrawledDomainReader(); - try (var wl = new WorkLog(output.resolve("crawler.log"))) { for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) { Path inputPath = input.resolve(inputItem.relPath()); - var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain); + var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain); if (domainMaybe.isEmpty()) continue; var domain = domainMaybe.get(); @@ -43,7 +41,7 @@ public class CrawlDataUnfcker { // Generate conformant ID String newId = Integer.toHexString(domain.hashCode()); - var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain); + var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain); var outputFileName = outputPath.toFile().getName(); System.out.println(inputPath + " -> " + outputPath); @@ -56,13 +54,13 @@ public class CrawlDataUnfcker { } } - static Optional readDomain(CrawledDomainReader reader, Path file) { + static Optional readDomain(Path file) { if (!Files.exists(file)) { System.out.println("Missing file " + file); return Optional.empty(); } - try (var stream = reader.createDataStream(file)) { + try (var stream = CrawledDomainReader.createDataStream(file)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDomain domain) { return Optional.of(domain); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 97df4a39..c5751a7a 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ 
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java
index 97df4a39..c5751a7a 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java
@@ -50,10 +50,9 @@ public class ExperimentRunnerMain {
         experiment.args(Arrays.copyOfRange(args, 2, args.length));
         Path basePath = Path.of(args[0]);
-        var reader = new CrawledDomainReader();
         for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) {
             Path crawlDataPath = basePath.resolve(item.relPath());
-            try (var stream = reader.createDataStream(crawlDataPath)) {
+            try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) {
                 experiment.process(stream);
             }
             catch (Exception ex) {
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java
index 4e61ffc4..5d7d8d11 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java
@@ -5,12 +5,12 @@ import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import java.io.IOException;
-import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;
 public abstract class LegacyExperiment extends Experiment {
     public abstract boolean process(CrawledDomain domain);
+    @Override
     public boolean process(SerializableCrawlDataStream dataStream) throws IOException {
         List<CrawledDocument> documentList = new ArrayList<>();
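
For orientation (not part of the patch): under this contract a LegacyExperiment subclass only handles fully assembled CrawledDomain objects, while the adaptation from SerializableCrawlDataStream lives in the base class above. A hypothetical minimal subclass follows; it assumes Experiment's remaining hooks have default implementations, and the class name is made up.

    import nu.marginalia.crawling.model.CrawledDomain;
    import nu.marginalia.tools.LegacyExperiment;

    /** Prints each domain as it is processed; returning true keeps the run going. */
    public class PrintDomainsExperiment extends LegacyExperiment {
        @Override
        public boolean process(CrawledDomain domain) {
            System.out.println("Processing " + domain.getDomain());
            return true;
        }
    }
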
diff --git a/settings.gradle b/settings.gradle
index 342107de..42ae0f47 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -41,6 +41,7 @@ include 'code:features-convert:topic-detection'
 include 'code:features-crawl:crawl-blocklist'
 include 'code:features-crawl:link-parser'
+include 'code:features-crawl:content-type'
 include 'code:features-index:index-journal'
 include 'code:features-index:index-query'
@@ -154,6 +155,8 @@ dependencyResolutionManagement {
         library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1')
         library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0')
+        library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5')
+
         library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15')
         library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13')
         library('commons.net', 'commons-net','commons-net').version('3.9.0')
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
index 1ec3e7fb..45718fe8 100644
--- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java
@@ -13,6 +13,7 @@ import org.apache.parquet.io.DelegatingSeekableInputStream;
 import org.apache.parquet.io.InputFile;
 import org.apache.parquet.io.SeekableInputStream;
 import org.apache.parquet.io.api.GroupConverter;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
 import org.apache.parquet.schema.MessageType;
 import org.apache.parquet.schema.PrimitiveType;
@@ -144,7 +145,11 @@ public final class ParquetReader implements Spliterator, Closeable {
             case BINARY:
             case FIXED_LEN_BYTE_ARRAY:
             case INT96:
-                return primitiveType.stringifier().stringify(columnReader.getBinary());
+                if (primitiveType.getLogicalTypeAnnotation() == null) {
+                    return columnReader.getBinary().getBytes();
+                } else {
+                    return primitiveType.stringifier().stringify(columnReader.getBinary());
+                }
             case BOOLEAN:
                 return columnReader.getBoolean();
             case DOUBLE:
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
index 6e53c189..6d9b5734 100644
--- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
@@ -242,7 +242,7 @@ public final class ParquetWriter implements Closeable {
                 if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
                     recordConsumer.addBinary(Binary.fromString((String)value));
                 } else {
-                    throw new UnsupportedOperationException("We don't support writing logical annotation type " + type.getLogicalTypeAnnotation());
+                    recordConsumer.addBinary(Binary.fromConstantByteArray((byte[])value));
                 }
                 break;
             default:
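
Taken together, the two parquet-floor patches make BINARY handling symmetric: columns carrying a string logical type annotation keep round-tripping as String, while un-annotated BINARY columns are now read back as raw byte[] and can be written from byte[] instead of throwing. The sketch below illustrates that dispatch rule with the standard Parquet schema API; it is not code from the patch, and the column names are made up.

    import org.apache.parquet.schema.LogicalTypeAnnotation;
    import org.apache.parquet.schema.PrimitiveType;
    import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
    import org.apache.parquet.schema.Types;

    public class BinaryColumnRule {
        /** Mirrors the writer's branch; the reader likewise falls back to raw bytes
         *  only when no logical type annotation is present on the column. */
        static String describe(PrimitiveType type) {
            if (type.getPrimitiveTypeName() != PrimitiveTypeName.BINARY)
                return type.getName() + ": not a BINARY column";

            if (LogicalTypeAnnotation.stringType().equals(type.getLogicalTypeAnnotation()))
                return type.getName() + ": read and written as String";

            return type.getName() + ": read and written as raw byte[]";
        }

        public static void main(String[] args) {
            var urlColumn  = Types.required(PrimitiveTypeName.BINARY)
                                  .as(LogicalTypeAnnotation.stringType())
                                  .named("url");
            var blobColumn = Types.required(PrimitiveTypeName.BINARY)
                                  .named("headerBytes");

            System.out.println(describe(urlColumn));   // url: read and written as String
            System.out.println(describe(blobColumn));  // headerBytes: read and written as raw byte[]
        }
    }
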