From cc813a5624de29259429dce3f2ecd5b5340000fd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 6 Dec 2023 18:43:55 +0100 Subject: [PATCH 01/23] (convert) Add basic support for Warc file sideloading This update includes the integration of the jwarc library and implements support for Warc file sideloading, as a first trial integration with this library. --- .../mqapi/converting/ConvertAction.java | 1 + .../mqapi/converting/ConvertRequest.java | 7 + .../processes/converting-process/build.gradle | 1 + .../marginalia/converting/ConverterMain.java | 8 + .../sideload/SideloadSourceFactory.java | 10 +- .../sideload/warc/WarcSideloadFactory.java | 32 ++++ .../sideload/warc/WarcSideloader.java | 141 ++++++++++++++++++ .../sideload/warc/WarcSideloaderTest.java | 37 +++++ .../marginalia/actor/task/ConvertActor.java | 20 +++ .../index/svc/IndexQueryService.java | 1 + settings.gradle | 2 + 11 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java create mode 100644 code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java create mode 100644 code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java index 833ad3f0..17102c06 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertAction.java @@ -4,5 +4,6 @@ public enum ConvertAction { ConvertCrawlData, SideloadEncyclopedia, SideloadDirtree, + SideloadWarc, SideloadStackexchange } diff --git a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java index fffed79b..cf445e5a 100644 --- a/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java +++ b/code/api/process-mqapi/src/main/java/nu/marginalia/mqapi/converting/ConvertRequest.java @@ -38,6 +38,13 @@ public class ConvertRequest { destId, null); } + public static ConvertRequest forWarc(Path sourcePath, FileStorageId destId) { + return new ConvertRequest(ConvertAction.SideloadWarc, + sourcePath.toString(), + null, + destId, + null); + } public static ConvertRequest forStackexchange(Path sourcePath, FileStorageId destId) { return new ConvertRequest(ConvertAction.SideloadStackexchange, diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index faa952fb..58b0ecdd 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -65,6 +65,7 @@ dependencies { implementation libs.bundles.slf4j implementation libs.notnull + implementation libs.jwarc implementation libs.jsoup diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java index fb919018..50f29fb1 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java @@ -262,6 +262,14 @@ public 
class ConverterMain { processData.asPath(), msg, inbox); } + case SideloadWarc -> { + var processData = fileStorageService.getStorage(request.processedDataStorage); + + yield new SideloadAction( + sideloadSourceFactory.sideloadWarc(Path.of(request.inputSource)), + processData.asPath(), + msg, inbox); + } case SideloadStackexchange -> { var processData = fileStorageService.getStorage(request.processedDataStorage); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index debc460f..48ab45c9 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -6,6 +6,7 @@ import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; import nu.marginalia.converting.sideload.stackexchange.StackexchangeSideloader; +import nu.marginalia.converting.sideload.warc.WarcSideloadFactory; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; @@ -22,6 +23,7 @@ public class SideloadSourceFactory { private final DocumentKeywordExtractor documentKeywordExtractor; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; + private final WarcSideloadFactory warcSideloadFactory; @Inject public SideloadSourceFactory(Gson gson, @@ -29,13 +31,15 @@ public class SideloadSourceFactory { ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, DocumentKeywordExtractor documentKeywordExtractor, AnchorTagsSourceFactory anchorTagsSourceFactory, - DirtreeSideloaderFactory dirtreeSideloaderFactory) { + DirtreeSideloaderFactory dirtreeSideloaderFactory, + WarcSideloadFactory warcSideloadFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; this.documentKeywordExtractor = documentKeywordExtractor; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; + this.warcSideloadFactory = warcSideloadFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { @@ -46,6 +50,10 @@ public class SideloadSourceFactory { return dirtreeSideloaderFactory.createSideloaders(pathToYamlFile); } + public Collection sideloadWarc(Path pathToWarcFiles) throws IOException { + return warcSideloadFactory.createSideloaders(pathToWarcFiles); + } + /** Do not use, this code isn't finished */ public Collection sideloadStackexchange(Path pathToDbFileRoot) throws IOException { try (var dirs = Files.walk(pathToDbFileRoot)) { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java new file mode 100644 index 00000000..35fb6d3a --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloadFactory.java @@ -0,0 +1,32 @@ +package nu.marginalia.converting.sideload.warc; 
+ +import nu.marginalia.converting.sideload.SideloadSource; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class WarcSideloadFactory { + + public Collection createSideloaders(Path pathToWarcFiles) throws IOException { + final List files = new ArrayList<>(); + + try (var stream = Files.list(pathToWarcFiles)) { + stream + .filter(Files::isRegularFile) + .filter(this::isWarcFile) + .forEach(files::add); + + } + // stub + return null; + } + + private boolean isWarcFile(Path path) { + return path.toString().endsWith(".warc") + || path.toString().endsWith(".warc.gz"); + } +} \ No newline at end of file diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java new file mode 100644 index 00000000..73d29a30 --- /dev/null +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -0,0 +1,141 @@ +package nu.marginalia.converting.sideload.warc; + +import lombok.SneakyThrows; +import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.sideload.SideloadSource; +import nu.marginalia.converting.sideload.SideloaderProcessing; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import org.netpreserve.jwarc.*; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.StreamSupport; + +public class WarcSideloader implements SideloadSource, AutoCloseable { + + private final Path warcFile; + private final SideloaderProcessing sideloaderProcessing; + + private final WarcReader reader; + + private final EdgeDomain domain; + + public WarcSideloader(Path warcFile, + SideloaderProcessing sideloaderProcessing) + throws IOException + { + this.warcFile = warcFile; + this.sideloaderProcessing = sideloaderProcessing; + this.reader = new WarcReader(warcFile); + this.domain = sniffDomainFromWarc() + .orElseThrow(() -> new IOException("Could not identify domain from warc file")); + } + + @SneakyThrows + @Override + public ProcessedDomain getDomain() { + var ret = new ProcessedDomain(); + + ret.domain = domain; + ret.ip = "0.0.0.0"; + ret.state = DomainIndexingState.ACTIVE; + + return ret; + } + + private Optional sniffDomainFromWarc() throws IOException { + try { + for (var record : reader) { + if (!(record instanceof WarcRequest request)) { + continue; + } + + String target = request.target(); + if (target.startsWith("http://") || target.startsWith("https://")) { + return Optional.of(new EdgeUrl(target).getDomain()); + } + } + } catch (URISyntaxException e) { + return Optional.empty(); + } finally { + reader.position(0); + } + return Optional.empty(); + } + + @SneakyThrows + @Override + public Iterator getDocumentsStream() { + return reader.records() + .filter(record -> record instanceof WarcResponse) + .map(WarcResponse.class::cast) + .filter(this::isRelevantResponse) + .map(this::process) + 
.iterator(); + } + + private boolean isRelevantResponse(WarcResponse warcResponse) { + try { + HttpResponse httpResponse = warcResponse.http(); + if (httpResponse == null) + return false; + if (httpResponse.status() != 200) + return false; + if (!Objects.equals(httpResponse.contentType(), MediaType.HTML)) + return false; + + var url = new EdgeUrl(warcResponse.target()); + if (!Objects.equals(url.getDomain(), domain)) { + return false; + } + + return true; + } catch (Exception e) { + e.printStackTrace(); + } + + return false; + } + + @SneakyThrows + private ProcessedDocument process(WarcResponse response) { + String body = getBody(response); + String url = response.target(); + + // We trim "/index.html"-suffixes from the index if they are present, + // since this is typically an artifact from document retrieval + if (url.endsWith("/index.html")) { + url = url.substring(0, url.length() - "index.html".length()); + } + + return sideloaderProcessing + .processDocument(url, body, List.of(), new DomainLinks(), + GeneratorType.DOCS, + 10_000); + } + + @SneakyThrows + private String getBody(WarcResponse response) { + var http = response.http(); + + // TODO: We should support additional encodings here + return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8); + } + + @Override + public void close() throws Exception { + reader.close(); + } + +} diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java new file mode 100644 index 00000000..dfa3c972 --- /dev/null +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java @@ -0,0 +1,37 @@ +package nu.marginalia.converting.sideload.warc; + +import com.google.inject.AbstractModule; +import com.google.inject.Guice; +import nu.marginalia.converting.ConverterModule; +import nu.marginalia.converting.processor.ConverterDomainTypes; +import nu.marginalia.converting.sideload.SideloaderProcessing; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.io.IOException; +import java.nio.file.Path; + +import static org.mockito.Mockito.when; + +class WarcSideloaderTest { + @Test + public void test() throws IOException { + var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); + when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); + + var processing = Guice.createInjector(new ConverterModule(), + new AbstractModule() { + public void configure() { + bind(ConverterDomainTypes.class).toInstance(domainTypesMock); + } + } + ) + .getInstance(SideloaderProcessing.class); + + var sideloader = new WarcSideloader(Path.of("/home/vlofgren/marginalia.warc.gz"), processing); + + var domain = sideloader.getDomain(); + System.out.println(domain); + sideloader.getDocumentsStream().forEachRemaining(System.out::println); + } +} \ No newline at end of file diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java index 275f4092..4af4852e 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ConvertActor.java @@ -32,6 +32,7 @@ public class ConvertActor extends RecordActorPrototype { public record Convert(FileStorageId 
fid) implements ActorStep {}; public record ConvertEncyclopedia(String source, String baseUrl) implements ActorStep {}; public record ConvertDirtree(String source) implements ActorStep {}; + public record ConvertWarc(String source) implements ActorStep {}; public record ConvertStackexchange(String source) implements ActorStep {}; @Resume(behavior = ActorResumeBehavior.RETRY) public record ConvertWait(FileStorageId destFid, @@ -74,6 +75,25 @@ public class ConvertActor extends RecordActorPrototype { mqConverterOutbox.sendAsync(ConvertRequest.forDirtree(sourcePath, processedArea.id())) ); } + case ConvertWarc(String source) -> { + Path sourcePath = Path.of(source); + if (!Files.exists(sourcePath)) + yield new Error("Source path does not exist: " + sourcePath); + + String fileName = sourcePath.toFile().getName(); + + var base = storageService.getStorageBase(FileStorageBaseType.STORAGE); + var processedArea = storageService.allocateTemporaryStorage(base, + FileStorageType.PROCESSED_DATA, "processed-data", + "Processed Warc Data; " + fileName); + + storageService.setFileStorageState(processedArea.id(), FileStorageState.NEW); + + yield new ConvertWait( + processedArea.id(), + mqConverterOutbox.sendAsync(ConvertRequest.forWarc(sourcePath, processedArea.id())) + ); + } case ConvertEncyclopedia(String source, String baseUrl) -> { Path sourcePath = Path.of(source); diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java index b8bf0a5a..f00bace2 100644 --- a/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java +++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/svc/IndexQueryService.java @@ -170,6 +170,7 @@ public class IndexQueryService extends IndexApiImplBase { } } + // GRPC endpoint @SneakyThrows public void query(nu.marginalia.index.api.RpcIndexQuery request, diff --git a/settings.gradle b/settings.gradle index 952acd9c..4814f7e7 100644 --- a/settings.gradle +++ b/settings.gradle @@ -153,6 +153,8 @@ dependencyResolutionManagement { library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1') library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0') + library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.4') + library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15') library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13') library('commons.net', 'commons-net','commons-net').version('3.9.0') From 2d5d11645df95ad01b4b15bf851e04bd18f5115a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 6 Dec 2023 19:00:29 +0100 Subject: [PATCH 02/23] (warc) Refactor WarcSideloaderTest to not rely on specific test files on the computer --- .../sideload/warc/WarcSideloaderTest.java | 58 ++++++++++++++----- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java index dfa3c972..4e9fb406 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java @@ -5,33 +5,61 @@ import com.google.inject.Guice; import 
nu.marginalia.converting.ConverterModule; import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.sideload.SideloaderProcessing; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; +import org.netpreserve.jwarc.WarcWriter; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.file.Files; import java.nio.file.Path; import static org.mockito.Mockito.when; -class WarcSideloaderTest { - @Test - public void test() throws IOException { +class WarcSideloaderTest extends AbstractModule { + SideloaderProcessing processing; + + Path warcFile; + @BeforeEach + public void setUp() throws IOException { + processing = Guice.createInjector(new ConverterModule(), this) + .getInstance(SideloaderProcessing.class); + warcFile = Files.createTempFile(getClass().getSimpleName(), ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(warcFile); + } + + public void configure() { var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); - var processing = Guice.createInjector(new ConverterModule(), - new AbstractModule() { - public void configure() { - bind(ConverterDomainTypes.class).toInstance(domainTypesMock); - } - } - ) - .getInstance(SideloaderProcessing.class); + bind(ConverterDomainTypes.class).toInstance(domainTypesMock); + } - var sideloader = new WarcSideloader(Path.of("/home/vlofgren/marginalia.warc.gz"), processing); - var domain = sideloader.getDomain(); - System.out.println(domain); - sideloader.getDocumentsStream().forEachRemaining(System.out::println); + @Test + public void test() throws IOException { + try (var writer = new WarcWriter(Files.newOutputStream(warcFile))) { + writer.fetch(new URI("https://www.marginalia.nu/")); + writer.fetch(new URI("https://www.marginalia.nu/log/93_atags/")); + writer.fetch(new URI("https://www.marginalia.nu/links/")); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + + try (var sideloader = new WarcSideloader(warcFile, processing)) { + + var domain = sideloader.getDomain(); + System.out.println(domain); + sideloader.getDocumentsStream().forEachRemaining(System.out::println); + } catch (Exception e) { + throw new RuntimeException(e); + } } } \ No newline at end of file From 064265b0b94dc1dd9282f149fd83338d26741408 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 7 Dec 2023 15:16:37 +0100 Subject: [PATCH 03/23] (crawler) Move content type/charset sniffing to a separate microlibrary This functionality needs to be accessed by the WarcSideloader, which is in the converter. The resultant microlibrary is tiny, but I think in this case it's justifiable. 
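
For reference, a minimal sketch of how the extracted content-type microlibrary is meant to be consumed. The class names and the parseContentType/getStringData signatures match the API introduced in this patch; the HTML snippet and the ContentTypeSketch class are purely illustrative:

    import nu.marginalia.contenttype.ContentType;
    import nu.marginalia.contenttype.ContentTypeParser;
    import nu.marginalia.contenttype.DocumentBodyToString;

    import java.nio.charset.StandardCharsets;

    class ContentTypeSketch {
        public static void main(String[] args) {
            // Illustrative document body; in practice this is the raw byte payload fetched by the crawler
            byte[] body = "<html><head><meta charset=\"UTF-8\"></head><body>Hello</body></html>"
                    .getBytes(StandardCharsets.UTF_8);

            // The Content-Type header may be null or lack a charset; the parser then
            // falls back to sniffing the body, defaulting to ISO_8859_1
            ContentType ct = ContentTypeParser.parseContentType("text/html; charset=UTF-8", body);

            // Decode the bytes using the detected charset, with fallbacks for
            // illegal or unsupported charset names
            String text = DocumentBodyToString.getStringData(ct, body);

            System.out.println(ct.contentType() + "; " + ct.charset());
            System.out.println(text);
        }
    }

The same pair of calls is what the crawler's HttpFetcherImpl and, later, the WarcSideloader use to turn raw response bytes into a decoded document string.
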
--- code/features-crawl/content-type/build.gradle | 28 ++++++++++ .../marginalia/contenttype/ContentType.java | 9 ++++ .../contenttype}/ContentTypeParser.java | 52 ++++++++++++------- .../contenttype/DocumentBodyToString.java | 27 ++++++++++ .../contenttype/ContentTypeParserTest.java | 50 ++++++++++++++++++ .../contenttype/DocumentBodyToStringTest.java | 48 +++++++++++++++++ .../crawling/model/ContentType.java | 5 -- code/processes/crawling-process/build.gradle | 1 + .../retreival/fetcher/HttpFetcherImpl.java | 29 +++-------- settings.gradle | 1 + 10 files changed, 203 insertions(+), 47 deletions(-) create mode 100644 code/features-crawl/content-type/build.gradle create mode 100644 code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java rename code/{processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic => features-crawl/content-type/src/main/java/nu/marginalia/contenttype}/ContentTypeParser.java (60%) create mode 100644 code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/DocumentBodyToString.java create mode 100644 code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/ContentTypeParserTest.java create mode 100644 code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java delete mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle new file mode 100644 index 00000000..17eaea3f --- /dev/null +++ b/code/features-crawl/content-type/build.gradle @@ -0,0 +1,28 @@ +plugins { + id 'java' + + + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(21)) + } +} + +dependencies { + implementation project(':code:common:model') + implementation libs.crawlercommons + implementation libs.notnull + + implementation libs.bundles.gson + implementation libs.bundles.slf4j + testImplementation libs.bundles.slf4j.test + + implementation libs.jsoup + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito +} diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java new file mode 100644 index 00000000..374788b4 --- /dev/null +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java @@ -0,0 +1,9 @@ +package nu.marginalia.contenttype; + +/** Content type and charset of a document + * @param contentType The content type, e.g. "text/html" + * @param charset The charset, e.g. 
"UTF-8" + */ +public record ContentType(String contentType, String charset) { + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java similarity index 60% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java rename to code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java index 604264e3..5b794246 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeParser.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentTypeParser.java @@ -1,7 +1,8 @@ -package nu.marginalia.crawl.retreival.logic; +package nu.marginalia.contenttype; import crawlercommons.mimetypes.MimeTypeDetector; -import nu.marginalia.crawling.model.ContentType; +import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.jsoup.Jsoup; import java.util.Arrays; @@ -11,28 +12,40 @@ public class ContentTypeParser { static final MimeTypeDetector mimeTypeDetector = new MimeTypeDetector(); - public static ContentType parse(String contentType, byte[] data) { - return getContentTypeFromContentTypeString(contentType) - .or(() -> getContentTypeStringFromTag(data)) + /** Parse the content type and charset from a content type header and/or the body of a document, + * best effort + */ + public static ContentType parseContentType( + @Nullable String contentTypeHeader, + @NotNull byte[] body) + { + return getContentTypeFromContentTypeString(contentTypeHeader) + .or(() -> getContentTypeStringFromTag(body)) .orElseGet(() -> { - Optional charset = getCharsetFromTag(data); + Optional charset = getCharsetFromTag(body); return new ContentType( - Optional.ofNullable(contentType) - .or(() -> Optional.ofNullable(mimeTypeDetector.detect(data))) - .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(data)), charset.orElse("ISO_8859_1")); + Optional.ofNullable(contentTypeHeader) + .or(() -> Optional.ofNullable(mimeTypeDetector.detect(body))) + .orElseGet(() -> ContentTypeParser.shittyMimeSniffer(body)), charset.orElse("ISO_8859_1")); }); } - private static Optional getContentTypeFromContentTypeString(String contentType) { - if (contentType != null && contentType.contains(";")) { - var parts = contentType.split(";"); - var content = parts[0].trim(); - var extra = parts[1].trim(); - if (extra.startsWith("charset=")) { - return Optional.of(new ContentType(content, extra.substring("charset=".length()))); - } - } - return Optional.empty(); + /** Parse the charset from a content type string. 
*/ + private static Optional getContentTypeFromContentTypeString(@Nullable String contentType) { + if (contentType == null) + return Optional.empty(); + + if (!contentType.contains(";")) + return Optional.empty(); + + var parts = contentType.split(";"); + var content = parts[0].trim(); + var extra = parts[1].trim(); + + if (!extra.startsWith("charset=")) + return Optional.empty(); + + return Optional.of(new ContentType(content, extra.substring("charset=".length()))); } private static String shittyMimeSniffer(byte[] data) { @@ -45,6 +58,7 @@ public class ContentTypeParser { String startStr = new String(Arrays.copyOf(data, Math.min(128, data.length))).trim().toLowerCase(); if (startStr.contains("Title".getBytes(StandardCharsets.UTF_8); + String contentTypeHeader = "text/html; charset=UTF-8"; + ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body); + assertNotNull(result); + assertEquals("text/html", result.contentType()); + assertEquals("UTF-8", result.charset()); + } + + @Test + public void testParseContentTypeWithMetaCharset() { + byte[] body = "Title".getBytes(StandardCharsets.UTF_8); + ContentType result = ContentTypeParser.parseContentType(null, body); + assertNotNull(result); + assertEquals("text/html", result.contentType()); + assertEquals("UTF-8", result.charset()); + } + + @Test + public void testParseContentTypeWithHeaderValueAbsent() { + byte[] body = "Some random text.".getBytes(StandardCharsets.UTF_8); + String contentTypeHeader = "text/plain"; + ContentType result = ContentTypeParser.parseContentType(contentTypeHeader, body); + assertNotNull(result); + assertEquals("text/plain", result.contentType()); + assertEquals("ISO_8859_1", result.charset()); + } + + @Test + public void testParseContentTypeWithBinaryData() { + byte[] body = new byte[128]; + body[0] = 31; // ascii value less than 32 + ContentType result = ContentTypeParser.parseContentType(null, body); + assertNotNull(result); + assertEquals("application/binary", result.contentType()); + assertEquals("ISO_8859_1", result.charset()); + } +} \ No newline at end of file diff --git a/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java new file mode 100644 index 00000000..f7cf120d --- /dev/null +++ b/code/features-crawl/content-type/src/test/java/nu/marginalia/contenttype/DocumentBodyToStringTest.java @@ -0,0 +1,48 @@ +package nu.marginalia.contenttype; + +import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.*; + +import java.nio.charset.StandardCharsets; + +public class DocumentBodyToStringTest { + @Test + public void testGetStringData_onUTF8(){ + + ContentType type = new ContentType("text/html", "UTF-8"); + + String expected = "Hello, World!"; + byte[] data = expected.getBytes(StandardCharsets.UTF_8); + + String result = DocumentBodyToString.getStringData(type, data); + + assertEquals(expected, result, "Result should match the expected string"); + } + + @Test + public void testGetStringData_onIllegalCharsetName(){ + + ContentType type = new ContentType("text/html", "unsupportedname"); + + String expected = "Hello, World!"; + byte[] data = expected.getBytes(StandardCharsets.UTF_8); + + String result = DocumentBodyToString.getStringData(type, data); + + assertEquals(expected, result, "Result should match the expected string if charset is illegal name"); + } + + @Test + public void 
testGetStringData_onUnsupportedCharset(){ + + ContentType type = new ContentType("text/html", "Macintosh"); + + String expected = "Hello, World!"; + byte[] data = expected.getBytes(StandardCharsets.UTF_8); + + String result = DocumentBodyToString.getStringData(type, data); + + assertEquals(expected, result, "Result should fall back to UTF-8 parsing if charset is unsupported"); + } + +} \ No newline at end of file diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java deleted file mode 100644 index e8a9fca1..00000000 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/ContentType.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.crawling.model; - - -public record ContentType(String contentType, String charset) { -} diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 00f0f01b..dbac9b66 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -41,6 +41,7 @@ dependencies { implementation project(':code:features-convert:anchor-keywords') implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') implementation libs.bundles.slf4j diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 041ae08d..872e00f3 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -5,17 +5,17 @@ import com.google.inject.name.Named; import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRulesParser; import lombok.SneakyThrows; +import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.crawling.model.ContentType; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; -import nu.marginalia.crawl.retreival.logic.ContentTypeParser; +import nu.marginalia.contenttype.ContentTypeParser; import okhttp3.*; -import org.apache.commons.collections4.queue.PredicatedQueue; import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; @@ -281,7 +281,7 @@ public class HttpFetcherImpl implements HttpFetcher { byte[] data = byteStream.readNBytes(maxFetchSize); - var contentType = ContentTypeParser.parse(contentTypeHeader, data); + var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); } @@ -301,7 +301,8 @@ public class HttpFetcherImpl implements HttpFetcher { .build(); } - var strData = getStringData(data, contentType); + var strData = 
DocumentBodyToString.getStringData(contentType, data); + var canonical = rsp.header("rel=canonical", ""); return CrawledDocument.builder() @@ -363,24 +364,6 @@ public class HttpFetcherImpl implements HttpFetcher { return isPermittedGeneral; } - private String getStringData(byte[] data, ContentType contentType) { - Charset charset; - try { - charset = Charset.forName(contentType.charset()); - } - catch (IllegalCharsetNameException ex) { - charset = StandardCharsets.UTF_8; - } - catch (UnsupportedCharsetException ex) { - // This is usually like Macintosh Latin - // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) - // - // It's close enough to 8859-1 to serve - charset = StandardCharsets.ISO_8859_1; - } - return new String(data, charset); - } - private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) { return CrawledDocument.builder() diff --git a/settings.gradle b/settings.gradle index 4814f7e7..59f42bec 100644 --- a/settings.gradle +++ b/settings.gradle @@ -40,6 +40,7 @@ include 'code:features-convert:topic-detection' include 'code:features-crawl:crawl-blocklist' include 'code:features-crawl:link-parser' +include 'code:features-crawl:content-type' include 'code:features-index:index-journal' include 'code:features-index:index-query' From fabffa80f0522c9c4fee7111c63ebdba8723fcc6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 7 Dec 2023 15:26:01 +0100 Subject: [PATCH 04/23] (warc) Integrate the crawler's content type parsing and charset logic into the WarcSideloader --- .../processes/converting-process/build.gradle | 1 + .../sideload/warc/WarcSideloader.java | 41 ++++++++++++++----- .../sideload/warc/WarcSideloaderTest.java | 24 +++++++++-- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 58b0ecdd..4a3f2290 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -58,6 +58,7 @@ dependencies { implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:link-parser') + implementation project(':code:features-crawl:content-type') testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:process-models:crawl-spec') diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java index 73d29a30..2d8c1bda 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -2,6 +2,8 @@ package nu.marginalia.converting.sideload.warc; import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentTypeParser; +import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; @@ -11,31 +13,32 @@ import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import 
java.net.URISyntaxException; -import java.nio.charset.StandardCharsets; import java.nio.file.Path; import java.util.Iterator; import java.util.List; import java.util.Objects; import java.util.Optional; -import java.util.stream.StreamSupport; public class WarcSideloader implements SideloadSource, AutoCloseable { - private final Path warcFile; + private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class); + private final SideloaderProcessing sideloaderProcessing; private final WarcReader reader; private final EdgeDomain domain; + public WarcSideloader(Path warcFile, SideloaderProcessing sideloaderProcessing) throws IOException { - this.warcFile = warcFile; this.sideloaderProcessing = sideloaderProcessing; this.reader = new WarcReader(warcFile); this.domain = sniffDomainFromWarc() @@ -82,6 +85,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { .map(WarcResponse.class::cast) .filter(this::isRelevantResponse) .map(this::process) + .filter(Optional::isPresent) + .map(Optional::get) .iterator(); } @@ -109,8 +114,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { } @SneakyThrows - private ProcessedDocument process(WarcResponse response) { - String body = getBody(response); + private Optional process(WarcResponse response) { + Optional body = getBody(response); String url = response.target(); // We trim "/index.html"-suffixes from the index if they are present, @@ -119,18 +124,32 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { url = url.substring(0, url.length() - "index.html".length()); } - return sideloaderProcessing - .processDocument(url, body, List.of(), new DomainLinks(), + if (body.isEmpty()) { + return Optional.empty(); + } + + return Optional.of(sideloaderProcessing + .processDocument(url, body.get(), List.of(), new DomainLinks(), GeneratorType.DOCS, - 10_000); + 10_000)); } @SneakyThrows - private String getBody(WarcResponse response) { + private Optional getBody(WarcResponse response) { var http = response.http(); // TODO: We should support additional encodings here - return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8); + try (var body = http.body()) { + String contentType = http.headers().first("Content-Type").orElse(null); + byte[] bytes = body.stream().readAllBytes(); + + var ct = ContentTypeParser.parseContentType(contentType, bytes); + return Optional.of(DocumentBodyToString.getStringData(ct, bytes)); + } + catch (Exception ex) { + logger.info("Failed to parse body", ex); + } + return Optional.empty(); } @Override diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java index 4e9fb406..da94e3a8 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/warc/WarcSideloaderTest.java @@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.warc; import com.google.inject.AbstractModule; import com.google.inject.Guice; import nu.marginalia.converting.ConverterModule; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.sideload.SideloaderProcessing; import 
org.junit.jupiter.api.AfterEach; @@ -16,7 +18,11 @@ import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.when; class WarcSideloaderTest extends AbstractModule { @@ -53,13 +59,23 @@ class WarcSideloaderTest extends AbstractModule { throw new RuntimeException(e); } - try (var sideloader = new WarcSideloader(warcFile, processing)) { + ProcessedDomain domain; + List docs = new ArrayList<>(); - var domain = sideloader.getDomain(); - System.out.println(domain); - sideloader.getDocumentsStream().forEachRemaining(System.out::println); + try (var sideloader = new WarcSideloader(warcFile, processing)) { + domain = sideloader.getDomain(); + sideloader.getDocumentsStream().forEachRemaining(docs::add); } catch (Exception e) { throw new RuntimeException(e); } + + assertNotNull(domain); + assertEquals(3, docs.size()); + List fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList(); + assertEquals(List.of( + "https://www.marginalia.nu/", + "https://www.marginalia.nu/log/93_atags/", + "https://www.marginalia.nu/links/"), + fetchedUrls); } } \ No newline at end of file From 072b5fcd12ad3575eba65bfe7e0b28f2f58a1e8e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 8 Dec 2023 13:49:16 +0100 Subject: [PATCH 05/23] Implement Warc-recording wrapper for OkHttp3 client This is a first step of using WARC as an intermediate flight recorder style step in the crawler, ultimately aimed at being able to resume crawls if the crawler is restarted. This component is currently not hooked into anything. The OkHttp3 client wrapper class 'WarcRecordingFetcherClient' was implemented for web archiving. This allows for the recording of HTTP requests and responses. New classes were introduced, 'WarcDigestBuilder', 'IpInterceptingNetworkInterceptor', and 'WarcProtocolReconstructor'. The JWarc dependency was added to the build.gradle file, and relevant unit tests were also introduced. Some HttpFetcher-adjacent structural changes were also done for better organization. 
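
A minimal usage sketch, mirroring the unit test added in this patch: an OkHttp client is built with the IP-capturing network interceptor, and fetches made through the WarcRecordingFetcherClient are written to a WARC file as request/response record pairs. The WarcRecordingSketch class and the temp-file name are illustrative; the constructor and fetch() call match the code below:

    import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
    import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecordingFetcherClient;
    import nu.marginalia.model.EdgeDomain;
    import okhttp3.OkHttpClient;
    import okhttp3.Request;

    import java.nio.file.Files;
    import java.nio.file.Path;

    class WarcRecordingSketch {
        public static void main(String[] args) throws Exception {
            // The interceptor tags each response with the remote IP so it can be
            // recorded in the WARC response record
            OkHttpClient httpClient = new OkHttpClient.Builder()
                    .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
                    .build();

            Path warcFile = Files.createTempFile("recording", ".warc.gz");

            try (var recorder = new WarcRecordingFetcherClient(warcFile, new EdgeDomain("www.marginalia.nu"))) {
                recorder.fetch(httpClient, new Request.Builder()
                        .url("https://www.marginalia.nu/")
                        .addHeader("User-agent", "test.marginalia.nu")
                        .addHeader("Accept-Encoding", "gzip")
                        .get().build());
            }

            // warcFile now holds a WARC request/response record pair for the fetch
        }
    }
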
--- .../java/nu/marginalia/model/EdgeUrl.java | 7 + code/processes/crawling-process/build.gradle | 1 + .../retreival/fetcher/HttpFetcherImpl.java | 10 +- .../FastTerminatingSocketFactory.java | 2 +- .../IpInterceptingNetworkInterceptor.java | 24 +++ .../fetcher/{ => socket}/NoSecuritySSL.java | 4 +- .../fetcher/warc/WarcDigestBuilder.java | 29 +++ .../warc/WarcProtocolReconstructor.java | 127 +++++++++++++ .../warc/WarcRecordingFetcherClient.java | 175 ++++++++++++++++++ .../WarcRecordingFetcherClientTest.java | 70 +++++++ 10 files changed, 441 insertions(+), 8 deletions(-) rename code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/{ => socket}/FastTerminatingSocketFactory.java (96%) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java rename code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/{ => socket}/NoSecuritySSL.java (89%) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index 9def0480..f0f23956 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -232,4 +232,11 @@ public class EdgeUrl implements Serializable { return new URL(this.proto, this.domain.toString(), port, this.path); } + + public URI asURI() throws URISyntaxException { + if (port == null) + return new URI(this.proto, null, this.domain.toString(), this.path, this.param); + else + return new URI(this.proto, null, this.domain.toString(), this.port, this.path, this.param, null); + } } diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index dbac9b66..baa02906 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -49,6 +49,7 @@ dependencies { implementation libs.guice implementation libs.gson implementation libs.zstd + implementation libs.jwarc implementation libs.crawlercommons implementation libs.okhttp3 implementation libs.jsoup diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 872e00f3..57e73c44 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -8,9 +8,9 @@ import lombok.SneakyThrows; import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawl.retreival.fetcher.socket.*; import nu.marginalia.crawling.model.CrawledDocument; import 
nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.contenttype.ContentType; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; @@ -26,10 +26,7 @@ import javax.net.ssl.X509TrustManager; import java.io.EOFException; import java.io.IOException; import java.net.*; -import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; -import java.nio.charset.StandardCharsets; -import java.nio.charset.UnsupportedCharsetException; import java.time.LocalDateTime; import java.util.*; import java.util.concurrent.TimeUnit; @@ -65,6 +62,7 @@ public class HttpFetcherImpl implements HttpFetcher { return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0]) .socketFactory(ftSocketFactory) .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer()) + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) .connectionPool(pool) .cookieJar(cookies.getJar()) .followRedirects(true) @@ -141,8 +139,8 @@ public class HttpFetcherImpl implements HttpFetcher { var headBuilder = new Request.Builder().head() .addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip"); + .addHeader("Accept-Encoding", "gzip") + .url(url.toString()); var head = headBuilder.build(); var call = client.newCall(head); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java similarity index 96% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java index add64e29..ffb29b33 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/FastTerminatingSocketFactory.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/FastTerminatingSocketFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival.fetcher.socket; import javax.net.SocketFactory; import java.io.IOException; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java new file mode 100644 index 00000000..c5eb76ac --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java @@ -0,0 +1,24 @@ +package nu.marginalia.crawl.retreival.fetcher.socket; + +import okhttp3.Interceptor; +import okhttp3.Response; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; + +public class IpInterceptingNetworkInterceptor implements Interceptor { + @NotNull + @Override + public Response intercept(@NotNull Interceptor.Chain chain) throws IOException { + String IP = chain.connection().socket().getInetAddress().getHostAddress(); + + return chain.proceed(chain.request()) + .newBuilder() + .addHeader("X-Remote-IP", IP) + .build(); + } + + public static String getIpFromResponse(Response response) { + return 
response.header("X-Remote-IP"); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java similarity index 89% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java index f86d2c48..45dc431c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/NoSecuritySSL.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival.fetcher.socket; import lombok.SneakyThrows; @@ -8,6 +8,8 @@ import java.security.cert.X509Certificate; public class NoSecuritySSL { // Create a trust manager that does not validate certificate chains + // We want to accept e.g. self-signed certificates and certificates + // that are not signed by a CA is generally trusted by the system. public static final TrustManager[] trustAllCerts = new TrustManager[]{ new X509TrustManager() { @Override diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java new file mode 100644 index 00000000..88052a7e --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -0,0 +1,29 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import org.netpreserve.jwarc.WarcDigest; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +class WarcDigestBuilder { + private final MessageDigest digest; + + private static final String digestAlgorithm = "SHA-1"; + + public WarcDigestBuilder() throws NoSuchAlgorithmException { + this.digest = MessageDigest.getInstance(digestAlgorithm); + } + + public void update(String s) { + byte[] bytes = s.getBytes(); + update(bytes, bytes.length); + } + + public void update(byte[] buffer, int n) { + digest.update(buffer, 0, n); + } + + public WarcDigest build() { + return new WarcDigest(digest); + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java new file mode 100644 index 00000000..a583bcc9 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -0,0 +1,127 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import okhttp3.Protocol; +import okhttp3.Request; +import okhttp3.Response; +import org.apache.commons.lang3.StringUtils; + +import java.net.URI; +import java.util.Arrays; +import java.util.Map; +import java.util.StringJoiner; +import java.util.stream.Collectors; + +/** We don't have access to the raw HTTP request and response, so we need to reconstruct them + * as best is possible from the data we have available. 
+ */ +public class WarcProtocolReconstructor { + + static String getHttpRequestString(Request request, URI uri) { + StringBuilder requestStringBuilder = new StringBuilder(); + requestStringBuilder.append(request.method()).append(" ").append(uri.getPath()); + if (uri.getQuery() != null) { + requestStringBuilder.append("?").append(uri.getQuery()); + } + requestStringBuilder.append(" HTTP/1.1\r\n"); + requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n"); + + request.headers().toMultimap().forEach((k, values) -> { + for (var value : values) { + requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n"); + } + }); + + return requestStringBuilder.toString(); + } + + static String getResponseHeader(Response response) { + String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0"; + + String statusCode = String.valueOf(response.code()); + String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown"); + + String headerString = getHeadersAsString(response); + + return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + } + + private static final Map STATUS_CODE_MAP = Map.ofEntries( + Map.entry(200, "OK"), + Map.entry(201, "Created"), + Map.entry(202, "Accepted"), + Map.entry(203, "Non-Authoritative Information"), + Map.entry(204, "No Content"), + Map.entry(205, "Reset Content"), + Map.entry(206, "Partial Content"), + Map.entry(207, "Multi-Status"), + Map.entry(208, "Already Reported"), + Map.entry(226, "IM Used"), + Map.entry(300, "Multiple Choices"), + Map.entry(301, "Moved Permanently"), + Map.entry(302, "Found"), + Map.entry(303, "See Other"), + Map.entry(304, "Not Modified"), + Map.entry(307, "Temporary Redirect"), + Map.entry(308, "Permanent Redirect"), + Map.entry(400, "Bad Request"), + Map.entry(401, "Unauthorized"), + Map.entry(403, "Forbidden"), + Map.entry(404, "Not Found"), + Map.entry(405, "Method Not Allowed"), + Map.entry(406, "Not Acceptable"), + Map.entry(408, "Request Timeout"), + Map.entry(409, "Conflict"), + Map.entry(410, "Gone"), + Map.entry(411, "Length Required"), + Map.entry(412, "Precondition Failed"), + Map.entry(413, "Payload Too Large"), + Map.entry(414, "URI Too Long"), + Map.entry(415, "Unsupported Media Type"), + Map.entry(416, "Range Not Satisfiable"), + Map.entry(417, "Expectation Failed"), + Map.entry(418, "I'm a teapot"), + Map.entry(421, "Misdirected Request"), + Map.entry(426, "Upgrade Required"), + Map.entry(428, "Precondition Required"), + Map.entry(429, "Too Many Requests"), + Map.entry(431, "Request Header Fields Too Large"), + Map.entry(451, "Unavailable For Legal Reasons"), + Map.entry(500, "Internal Server Error"), + Map.entry(501, "Not Implemented"), + Map.entry(502, "Bad Gateway"), + Map.entry(503, "Service Unavailable"), + Map.entry(504, "Gateway Timeout"), + Map.entry(505, "HTTP Version Not Supported"), + Map.entry(506, "Variant Also Negotiates"), + Map.entry(507, "Insufficient Storage"), + Map.entry(508, "Loop Detected"), + Map.entry(510, "Not Extended"), + Map.entry(511, "Network Authentication Required") + ); + + + static private String getHeadersAsString(Response response) { + StringJoiner joiner = new StringJoiner("\r\n"); + + response.headers().toMultimap().forEach((k, values) -> { + String headerCapitalized = capitalizeHeader(k); + + if (headerCapitalized.startsWith("X")) + return; + + for (var value : values) { + joiner.add(headerCapitalized + ": " + value); + } + }); + return joiner.toString(); + } + + // okhttp gives us 
flattened headers, so we need to reconstruct Camel-Kebab-Case style + // for the WARC parser's sake... + static private String capitalizeHeader(String k) { + return Arrays.stream(StringUtils.split(k, '-')) + .map(StringUtils::capitalize) + .collect(Collectors.joining("-")); + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java new file mode 100644 index 00000000..a3440c5a --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java @@ -0,0 +1,175 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.model.EdgeDomain; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.InputStream; +import java.net.InetAddress; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.security.NoSuchAlgorithmException; +import java.time.Instant; +import java.util.Optional; + +/** Based on JWarc's fetch method, APL 2.0 license + *

+ * This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file, + * as best is possible given not all the data is available at the same time and needs to + * be reconstructed. + */ +public class WarcRecordingFetcherClient implements AutoCloseable { + private static final int MAX_TIME = 30_000; + private static final int MAX_SIZE = 1024 * 1024 * 10; + private final WarcWriter writer; + + private final EdgeDomain domain; + private static final Logger logger = LoggerFactory.getLogger(WarcRecordingFetcherClient.class); + + + public WarcRecordingFetcherClient(Path warcFile, EdgeDomain domain) throws IOException { + this.writer = new WarcWriter(warcFile); + this.domain = domain; + } + + public Optional fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + URI uri = request.url().uri(); + + WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); + WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); + + String ip; + Instant date = Instant.now(); + long startMillis = date.toEpochMilli(); + + Path tempFileName = Files.createTempFile(domain.toString(), ".data"); + + var call = client.newCall(request); + + int totalLength = 0; + + WarcTruncationReason truncationReason = null; + + + + try (FileChannel tempFile = + (FileChannel) Files.newByteChannel(tempFileName, StandardOpenOption.READ, StandardOpenOption.WRITE); + var response = call.execute() + ) { + var body = response.body(); + InputStream inputStream; + + if (body == null) { + inputStream = null; + truncationReason = WarcTruncationReason.DISCONNECT; + } + else { + inputStream = body.byteStream(); + } + + byte[] buf = new byte[8192]; + + ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); + + String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response); + tempFile.write(ByteBuffer.wrap(responseHeaders.getBytes())); + responseDigestBuilder.update(responseHeaders); + + while (inputStream != null) { + int remainingLength; + + if (MAX_SIZE > 0 && MAX_SIZE - totalLength < buf.length) { + remainingLength = (MAX_SIZE - totalLength); + } else { + remainingLength = buf.length; + } + + int n = inputStream.read(buf, 0, remainingLength); + if (n < 0) + break; + + totalLength += n; + + for (int i = 0; i < n; ) { + int written = tempFile.write(ByteBuffer.wrap(buf, i, n - i)); + i += written; + } + + responseDigestBuilder.update(buf, n); + payloadDigestBuilder.update(buf, n); + + if (MAX_TIME > 0 && System.currentTimeMillis() - startMillis > MAX_TIME) { + truncationReason = WarcTruncationReason.TIME; + break; + } + if (MAX_SIZE > 0 && totalLength >= MAX_SIZE) { + truncationReason = WarcTruncationReason.LENGTH; + break; + } + } + + tempFile.position(0); + WarcResponse.Builder responseBuilder = new WarcResponse.Builder(uri) + .blockDigest(responseDigestBuilder.build()) + .date(date) + .body(MediaType.HTTP_RESPONSE, tempFile, tempFile.size()); + + if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip)); + + responseBuilder.payloadDigest(payloadDigestBuilder.build()); + + if (truncationReason != null) + responseBuilder.truncated(truncationReason); + + // Build and write the response + + var warcResponse = responseBuilder.build(); + warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + writer.write(warcResponse); + + // Build and write the request + + WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder(); + + String 
httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), uri); + + requestDigestBuilder.update(httpRequestString); + + WarcRequest warcRequest = new WarcRequest.Builder(uri) + .blockDigest(requestDigestBuilder.build()) + .date(date) + .body(MediaType.HTTP_REQUEST, httpRequestString.getBytes()) + .concurrentTo(warcResponse.id()) + .build(); + warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + writer.write(warcRequest); + + return Optional.of(warcResponse); + } + catch (Exception ex) { + logger.warn("Failed to fetch URL {}", uri, ex); + return Optional.empty(); + } + finally { + Files.deleteIfExists(tempFileName); + } + } + + public void close() { + try { + writer.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } +} diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java new file mode 100644 index 00000000..c1129e86 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java @@ -0,0 +1,70 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecordingFetcherClient; +import nu.marginalia.model.EdgeDomain; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRequest; +import org.netpreserve.jwarc.WarcResponse; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.util.HashMap; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class WarcRecordingFetcherClientTest { + Path fileName; + WarcRecordingFetcherClient client; + OkHttpClient httpClient; + @BeforeEach + public void setUp() throws Exception { + httpClient = new OkHttpClient.Builder() + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) + .build(); + + fileName = Files.createTempFile("test", ".warc.gz"); + client = new WarcRecordingFetcherClient(fileName, new EdgeDomain("www.marginalia.nu")); + } + + @AfterEach + public void tearDown() throws Exception { + client.close(); + Files.delete(fileName); + } + + @Test + void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + + new GZIPInputStream(Files.newInputStream(fileName)).transferTo(System.out); + + Map sampleData = new HashMap<>(); + try (var warcReader = new WarcReader(fileName)) { + warcReader.forEach(record -> { + if (record instanceof WarcRequest req) { + sampleData.put(record.type(), req.target()); + } + if (record instanceof WarcResponse rsp) { + sampleData.put(record.type(), rsp.target()); + } + }); + } + + assertEquals("https://www.marginalia.nu/", sampleData.get("request")); + 
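// Descriptive note (not part of the patch): the recorder writes two records per fetch —
// a response record, then a request record linked to it via concurrentTo(warcResponse.id()) —
// so both the "request" entry checked above and the "response" entry checked below are
// expected to target the same fetched URL.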
assertEquals("https://www.marginalia.nu/", sampleData.get("response")); + } +} \ No newline at end of file From 3bbffd3c221dc30a9170e0b3a317b465dc84eeb2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 8 Dec 2023 17:12:51 +0100 Subject: [PATCH 06/23] (crawler) Refactor HttpFetcher to integrate WarcRecorder Partially hook in the WarcRecorder into the crawler process. So far it's not read, but should record the crawled documents. The WarcRecorder and HttpFetcher classes were also refactored and broken apart to be easier to reason about. --- ...CrawlingThenConvertingIntegrationTest.java | 10 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 7 +- .../crawl/retreival/CrawlerRetreiver.java | 20 +- .../retreival/fetcher/ContentTypeProber.java | 86 +++++++ .../fetcher/CrawledDocumentFactory.java | 75 ++++++ .../crawl/retreival/fetcher/HttpFetcher.java | 5 +- .../retreival/fetcher/HttpFetcherImpl.java | 217 +++++++----------- .../IpInterceptingNetworkInterceptor.java | 4 +- .../fetcher/warc/HttpFetchResult.java | 31 +++ .../fetcher/warc/WarcDigestBuilder.java | 6 +- .../warc/WarcProtocolReconstructor.java | 3 +- ...ngFetcherClient.java => WarcRecorder.java} | 142 ++++++++---- .../fetcher/ContentTypeProberTest.java | 59 +++++ ...rClientTest.java => WarcRecorderTest.java} | 9 +- .../marginalia/crawling/HttpFetcherTest.java | 20 +- .../retreival/CrawlerMockFetcherTest.java | 29 ++- .../retreival/CrawlerRetreiverTest.java | 113 +++++++-- 17 files changed, 603 insertions(+), 233 deletions(-) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java rename code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/{WarcRecordingFetcherClient.java => WarcRecorder.java} (54%) create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java rename code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/{WarcRecordingFetcherClientTest.java => WarcRecorderTest.java} (88%) diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 58d8a486..7150b1e0 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -8,6 +8,7 @@ import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; @@ -18,6 +19,7 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.util.ArrayList; import 
java.util.List; @@ -46,7 +48,7 @@ public class CrawlingThenConvertingIntegrationTest { } @Test - public void crawlThenProcess() { + public void crawlThenProcess() throws IOException { var specs = CrawlSpecRecord.builder() .domain("www.marginalia.nu") .crawlDepth(10) @@ -72,10 +74,12 @@ public class CrawlingThenConvertingIntegrationTest { } - private CrawledDomain crawl(CrawlSpecRecord specs) { + private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + } CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index dc76abde..73126246 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -12,6 +12,7 @@ import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; @@ -212,21 +213,23 @@ public class CrawlerMain { HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); + try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); + var warcRecorder = new WarcRecorder(); // write to a temp file for now + var retreiver = new CrawlerRetreiver(fetcher, specification, warcRecorder, writer::accept); CrawlDataReference reference = getReference()) { Thread.currentThread().setName("crawling:" + domain); var domainLinks = anchorTagsSource.getAnchorTags(domain); - var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept); + int size = retreiver.fetch(domainLinks, reference); workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); - } catch (Exception e) { logger.error("Error fetching domain " + domain, e); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ce5ecb89..22fcaa15 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -8,6 +8,7 @@ import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.link_parser.LinkParser; import 
nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.UrlBlocklist; @@ -20,13 +21,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; +import java.nio.file.Path; import java.time.LocalDateTime; import java.util.*; import java.util.function.Consumer; -public class CrawlerRetreiver { +public class CrawlerRetreiver implements AutoCloseable { private static final int MAX_ERRORS = 20; @@ -45,6 +48,7 @@ public class CrawlerRetreiver { private static final DomainProber domainProber = new DomainProber(); private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; + private final WarcRecorder warcRecorder; int errorCount = 0; @@ -56,7 +60,10 @@ public class CrawlerRetreiver { public CrawlerRetreiver(HttpFetcher fetcher, CrawlSpecRecord specs, - Consumer writer) { + WarcRecorder warcRecorder, + Consumer writer) + { + this.warcRecorder = warcRecorder; this.fetcher = fetcher; domain = specs.domain; @@ -121,7 +128,7 @@ public class CrawlerRetreiver { assert !crawlFrontier.isEmpty(); - final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain); + final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); sniffRootDocument(delayTimer, rootUrl); @@ -419,7 +426,7 @@ public class CrawlerRetreiver { private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { for (int i = 0; i < 2; i++) { try { - var doc = fetcher.fetchContent(top, tags); + var doc = fetcher.fetchContent(top, warcRecorder, tags); doc.recrawlState = "NEW"; return doc; } @@ -496,6 +503,11 @@ public class CrawlerRetreiver { .build(); } + @Override + public void close() throws Exception { + warcRecorder.close(); + } + private record DocumentWithReference( @Nullable CrawledDocument doc, @Nullable CrawlDataReference reference) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java new file mode 100644 index 00000000..55f2e633 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java @@ -0,0 +1,86 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.SocketTimeoutException; +import java.util.Objects; + +public class ContentTypeProber { + + private static final Logger logger = LoggerFactory.getLogger(ContentTypeProber.class); + private final String userAgent; + private final OkHttpClient client; + private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + + public ContentTypeProber(String userAgent, OkHttpClient httpClient) { + this.userAgent = userAgent; + this.client = httpClient; + } + + /** Probe the content type of the given URL with a HEAD request. + * This is used to detect binary files, which we don't want to crawl. + *

+ * If the URL redirects, the final URL is returned, to avoid redundant + * requests. + * + * @param url The URL to probe + * @return A ContentTypeProbeResult + */ + public ContentTypeProbeResult probeContentType(EdgeUrl url) { + logger.debug("Probing suspected binary {}", url); + + var headBuilder = new Request.Builder().head() + .addHeader("User-agent", userAgent) + .addHeader("Accept-Encoding", "gzip") + .url(url.toString()); + + var head = headBuilder.build(); + var call = client.newCall(head); + + try (var rsp = call.execute()) { + var contentTypeHeader = rsp.header("Content-type"); + + if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { + return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code()); + } + + // Update the URL to the final URL of the HEAD request, otherwise we might end up doing + + // HEAD 301 url1 -> url2 + // HEAD 200 url2 + // GET 301 url1 -> url2 + // GET 200 url2 + + // which is not what we want. Overall we want to do as few requests as possible to not raise + // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable + // that it looks like the traffic makes sense, as opposed to looking like a broken bot. + + var redirectUrl = new EdgeUrl(rsp.request().url().toString()); + EdgeUrl ret; + + if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl; + else ret = url; + + return new ContentTypeProbeResult.Ok(ret); + + } catch (SocketTimeoutException ex) { + return new ContentTypeProbeResult.Timeout(); + } catch (Exception ex) { + logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); + + return new ContentTypeProbeResult.Exception(ex); + } + } + + public sealed interface ContentTypeProbeResult { + record Ok(EdgeUrl resolvedUrl) implements ContentTypeProbeResult { } + record BadContentType(String contentType, int statusCode) implements ContentTypeProbeResult { } + record Timeout() implements ContentTypeProbeResult { } + record Exception(java.lang.Exception ex) implements ContentTypeProbeResult { } + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java new file mode 100644 index 00000000..8a654e20 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java @@ -0,0 +1,75 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.EdgeUrl; + +import java.time.LocalDateTime; +import java.util.Objects; + +public class CrawledDocumentFactory { + + public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createUnknownHostError(EdgeUrl url) { + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) + .crawlerStatusDesc("Unknown Host") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + 
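// Descriptive note (not part of the patch): createTimeoutErrorRsp below, unlike the inlined
// method it replaces in HttpFetcherImpl, no longer records the exception message in
// crawlerStatusDesc; only the "Timeout" status, timestamp and URL are kept.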
public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) { + return CrawledDocument.builder() + .crawlerStatus("Timeout") + .timestamp(LocalDateTime.now().toString()) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) { + return CrawledDocument.builder() + .crawlerStatus(status.toString()) + .crawlerStatusDesc(why) + .headers(rsp.headers().toString()) + .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(url.toString()) + .build(); + } + public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) { + return CrawledDocument.builder() + .crawlerStatus(status.toString()) + .crawlerStatusDesc(why) + .headers("") + .contentType(contentType) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(statusCode) + .url(url.toString()) + .build(); + } + + public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) { + + return CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) + .redirectUrl(responseUrl.toString()) + .headers(rsp.headers().toString()) + .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), "")) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(url.toString()) + .build(); + + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 11ad272e..8fc288f9 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -3,6 +3,7 @@ package nu.marginalia.crawl.retreival.fetcher; import com.google.inject.ImplementedBy; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -18,9 +19,9 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) throws RateLimitException; + CrawledDocument fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException; - SimpleRobotRules fetchRobotRules(EdgeDomain domain); + SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); SitemapRetriever createSitemapRetriever(); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 57e73c44..4d985b8b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -8,7 +8,11 @@ import lombok.SneakyThrows; import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawl.retreival.Cookies; import 
nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; import nu.marginalia.crawl.retreival.fetcher.socket.*; +import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; +import static nu.marginalia.crawl.retreival.fetcher.CrawledDocumentFactory.*; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.model.EdgeDomain; @@ -32,16 +36,17 @@ import java.util.*; import java.util.concurrent.TimeUnit; import java.util.zip.GZIPInputStream; + public class HttpFetcherImpl implements HttpFetcher { private final Logger logger = LoggerFactory.getLogger(getClass()); private final String userAgent; - private final int maxFetchSize = 1024*512; private final Cookies cookies = new Cookies(); private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser(); private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private final ContentTypeProber contentTypeProber; @Override public void setAllowAllContentTypes(boolean allowAllContentTypes) { @@ -88,13 +93,22 @@ public class HttpFetcherImpl implements HttpFetcher { public HttpFetcherImpl(@Named("user-agent") String userAgent, Dispatcher dispatcher, ConnectionPool connectionPool) { this.client = createClient(dispatcher, connectionPool); this.userAgent = userAgent; + this.contentTypeProber = new ContentTypeProber(userAgent, client); } public HttpFetcherImpl(@Named("user-agent") String userAgent) { this.client = createClient(null, new ConnectionPool()); this.userAgent = userAgent; + this.contentTypeProber = new ContentTypeProber(userAgent, client); } + /** + * Probe the domain to see if it is reachable, attempting to identify which schema to use, + * and if there are any redirects. This is done by one or more HEAD requests. + * + * @param url The URL to probe. + * @return The result of the probe, indicating the state and the URL. + */ @Override @SneakyThrows public FetchResult probeDomain(EdgeUrl url) { @@ -127,6 +141,7 @@ public class HttpFetcherImpl implements HttpFetcher { @Override @SneakyThrows public CrawledDocument fetchContent(EdgeUrl url, + WarcRecorder warcRecorder, ContentTags contentTags) throws RateLimitException { @@ -135,149 +150,96 @@ public class HttpFetcherImpl implements HttpFetcher { // looks like it might be something else, we perform a HEAD first to check the content type if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { - logger.debug("Probing suspected binary {}", url); - - var headBuilder = new Request.Builder().head() - .addHeader("User-agent", userAgent) - .addHeader("Accept-Encoding", "gzip") - .url(url.toString()); - - var head = headBuilder.build(); - var call = client.newCall(head); - - try (var rsp = call.execute()) { - var contentTypeHeader = rsp.header("Content-type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, "Early probe failed"); - } - - // Update the URL to the final URL of the HEAD request, otherwise we might end up doing - - // HEAD 301 url1 -> url2 - // HEAD 200 url2 - // GET 301 url1 -> url2 - // GET 200 url2 - - // which is not what we want. Overall we want to do as few requests as possible to not raise - // too many eyebrows when looking at the logs on the target server. 
Overall it's probably desirable - // that it looks like the traffic makes sense, as opposed to looking like a broken bot. - - var redirectUrl = new EdgeUrl(rsp.request().url().toString()); - if (Objects.equals(redirectUrl.domain, url.domain)) + ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url); + switch (probeResult) { + case ContentTypeProbeResult.Ok(EdgeUrl redirectUrl) -> { url = redirectUrl; - } - catch (SocketTimeoutException ex) { - return createTimeoutErrorRsp(url, ex); - } - catch (Exception ex) { - logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); - return createHardErrorRsp(url, ex); - } + } + case ContentTypeProbeResult.BadContentType (String contentType, int statusCode) -> { + return createErrorResponse(url, contentType, statusCode, + CrawlerDocumentStatus.BAD_CONTENT_TYPE, + contentType + ); + } + case ContentTypeProbeResult.Timeout timeout -> { + return createTimeoutErrorRsp(url); + } + case ContentTypeProbeResult.Exception ex -> { + return createErrorFromException(url, ex.ex()); + } + }; } var getBuilder = new Request.Builder().get(); - getBuilder.addHeader("User-agent", userAgent) - .url(url.toString()) - .addHeader("Accept-Encoding", "gzip"); + getBuilder.url(url.toString()) + .addHeader("Accept-Encoding", "gzip") + .addHeader("User-agent", userAgent); contentTags.paint(getBuilder); - var get = getBuilder.build(); - var call = client.newCall(get); + HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build()); - try (var rsp = call.execute()) { - return extractBody(url, rsp); + if (result instanceof HttpFetchResult.ResultError err) { + return createErrorFromException(url, err.ex()); } - catch (RateLimitException rle) { - throw rle; + else if (result instanceof HttpFetchResult.ResultOk ok) { + try { + return extractBody(url, ok); + } + catch (Exception ex) { + return createErrorFromException(url, ex); + } } - catch (SocketTimeoutException ex) { - return createTimeoutErrorRsp(url, ex); - } - catch (UnknownHostException ex) { - return createUnknownHostError(url, ex); - } - catch (SocketException | ProtocolException | IllegalCharsetNameException | SSLException | EOFException ex) { - // This is a bit of a grab-bag of errors that crop up - // IllegalCharsetName is egg on our face, - // but SSLException and EOFException are probably the server's fault - - return createHardErrorRsp(url, ex); - } - catch (Exception ex) { - logger.error("Error during fetching", ex); - return createHardErrorRsp(url, ex); + else { + throw new IllegalStateException("Unknown result type " + result.getClass()); } } - private CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); + private CrawledDocument createErrorFromException(EdgeUrl url, Exception exception) throws RateLimitException { + return switch (exception) { + case RateLimitException rle -> throw rle; + case SocketTimeoutException ex -> createTimeoutErrorRsp(url); + case UnknownHostException ex -> createUnknownHostError(url); + case SocketException ex -> createHardErrorRsp(url, ex); + case ProtocolException ex -> createHardErrorRsp(url, ex); + case IllegalCharsetNameException ex -> createHardErrorRsp(url, ex); + case SSLException ex -> createHardErrorRsp(url, ex); + case EOFException ex -> 
createHardErrorRsp(url, ex); + default -> { + logger.error("Error during fetching", exception); + yield createHardErrorRsp(url, exception); + } + }; } - private CrawledDocument createUnknownHostError(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ERROR.toString()) - .crawlerStatusDesc("Unknown Host") - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } + private CrawledDocument extractBody(EdgeUrl url, HttpFetchResult.ResultOk rsp) throws IOException, RateLimitException { - private CrawledDocument createTimeoutErrorRsp(EdgeUrl url, Exception why) { - return CrawledDocument.builder() - .crawlerStatus("Timeout") - .crawlerStatusDesc(why.getMessage()) - .timestamp(LocalDateTime.now().toString()) - .url(url.toString()) - .build(); - } - private CrawledDocument createErrorResponse(EdgeUrl url, Response rsp, CrawlerDocumentStatus status, String why) { - return CrawledDocument.builder() - .crawlerStatus(status.toString()) - .crawlerStatusDesc(why) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.code()) - .url(url.toString()) - .build(); - } + var responseUrl = new EdgeUrl(rsp.uri()); - private CrawledDocument extractBody(EdgeUrl url, Response rsp) throws IOException, URISyntaxException, RateLimitException { - - var responseUrl = new EdgeUrl(rsp.request().url().toString()); if (!Objects.equals(responseUrl.domain, url.domain)) { return createRedirectResponse(url, rsp, responseUrl); } - if (rsp.code() == 429) { - throw new RateLimitException(rsp.header("Retry-After", "1000")); + if (rsp.statusCode() == 429) { + String retryAfter = Objects.requireNonNullElse(rsp.header("Retry-After"), "1000"); + + throw new RateLimitException(retryAfter); } - var body = rsp.body(); - if (null == body) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.ERROR, "No body"); - } + var byteStream = rsp.getInputStream(); - var byteStream = body.byteStream(); - - if ("gzip".equals(rsp.header("Content-encoding"))) { + if ("gzip".equals(rsp.header("Content-Encoding"))) { byteStream = new GZIPInputStream(byteStream); } byteStream = new BOMInputStream(byteStream); - var contentTypeHeader = rsp.header("Content-type"); + var contentTypeHeader = rsp.header("Content-Type"); if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); } - byte[] data = byteStream.readNBytes(maxFetchSize); + byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { @@ -288,7 +250,7 @@ public class HttpFetcherImpl implements HttpFetcher { return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); } - if (!isXRobotsTagsPermitted(rsp.headers("X-Robots-Tag"), userAgent)) { + if (!isXRobotsTagsPermitted(rsp.allHeaders("X-Robots-Tag"), userAgent)) { return CrawledDocument.builder() .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) .crawlerStatusDesc("X-Robots-Tag") @@ -301,15 +263,12 @@ public class HttpFetcherImpl implements HttpFetcher { var strData = DocumentBodyToString.getStringData(contentType, data); - var canonical = rsp.header("rel=canonical", ""); - return CrawledDocument.builder() .crawlerStatus(CrawlerDocumentStatus.OK.name()) 
.headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) + .contentType(contentTypeHeader) .timestamp(LocalDateTime.now().toString()) - .canonicalUrl(canonical) - .httpStatus(rsp.code()) + .httpStatus(rsp.statusCode()) .url(responseUrl.toString()) .documentBody(strData) .build(); @@ -362,24 +321,11 @@ public class HttpFetcherImpl implements HttpFetcher { return isPermittedGeneral; } - private CrawledDocument createRedirectResponse(EdgeUrl url, Response rsp, EdgeUrl responseUrl) { - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name()) - .redirectUrl(responseUrl.toString()) - .headers(rsp.headers().toString()) - .contentType(rsp.header("Content-type")) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.code()) - .url(url.toString()) - .build(); - - } @Override - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { - return fetchRobotsForProto("https", domain) - .or(() -> fetchRobotsForProto("http", domain)) + public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { + return fetchRobotsForProto("https", recorder, domain) + .or(() -> fetchRobotsForProto("http", recorder, domain)) .orElseGet(() -> new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL)); } @@ -388,10 +334,10 @@ public class HttpFetcherImpl implements HttpFetcher { return new SitemapRetriever(); } - private Optional fetchRobotsForProto(String proto, EdgeDomain domain) { + private Optional fetchRobotsForProto(String proto, WarcRecorder recorder, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url, ContentTags.empty()))); + return Optional.of(parseRobotsTxt(fetchContent(url, recorder, ContentTags.empty()))); } catch (Exception ex) { return Optional.empty(); @@ -406,3 +352,4 @@ public class HttpFetcherImpl implements HttpFetcher { } } + diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java index c5eb76ac..d918afd1 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java @@ -14,11 +14,11 @@ public class IpInterceptingNetworkInterceptor implements Interceptor { return chain.proceed(chain.request()) .newBuilder() - .addHeader("X-Remote-IP", IP) + .addHeader("X-Marginalia-Remote-IP", IP) .build(); } public static String getIpFromResponse(Response response) { - return response.header("X-Remote-IP"); + return response.header("X-Marginalia-Remote-IP"); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java new file mode 100644 index 00000000..305c05da --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java @@ -0,0 +1,31 @@ +package nu.marginalia.crawl.retreival.fetcher.warc; + +import okhttp3.Headers; + +import java.io.ByteArrayInputStream; +import java.io.InputStream; +import java.net.URI; +import java.util.List; + +public sealed interface 
HttpFetchResult { + record ResultOk(URI uri, + int statusCode, + Headers headers, + byte[] bytesRaw, + int bytesStart, + int bytesLength + ) implements HttpFetchResult { + public InputStream getInputStream() { + return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength); + } + + public String header(String name) { + return headers.get(name); + } + public List allHeaders(String name) { + return headers.values(name); + } + + }; + record ResultError(Exception ex) implements HttpFetchResult { }; +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java index 88052a7e..6fd020b4 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcDigestBuilder.java @@ -20,7 +20,11 @@ class WarcDigestBuilder { } public void update(byte[] buffer, int n) { - digest.update(buffer, 0, n); + update(buffer, 0, n); + } + + public void update(byte[] buffer, int s, int n) { + digest.update(buffer, s, n); } public WarcDigest build() { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java index a583bcc9..683498a0 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -106,7 +106,8 @@ public class WarcProtocolReconstructor { response.headers().toMultimap().forEach((k, values) -> { String headerCapitalized = capitalizeHeader(k); - if (headerCapitalized.startsWith("X")) + // Omit pseudoheaders injected by the crawler itself + if (headerCapitalized.startsWith("X-Marginalia")) return; for (var value : values) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java similarity index 54% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index a3440c5a..a8ee9cf9 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecordingFetcherClient.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -1,7 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher.warc; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; -import nu.marginalia.model.EdgeDomain; import okhttp3.OkHttpClient; import okhttp3.Request; import org.netpreserve.jwarc.*; @@ -13,14 +12,10 @@ import java.io.InputStream; import java.net.InetAddress; import java.net.URI; import java.net.URISyntaxException; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.StandardOpenOption; import 
java.security.NoSuchAlgorithmException; import java.time.Instant; -import java.util.Optional; /** Based on JWarc's fetch method, APL 2.0 license *

@@ -28,21 +23,39 @@ import java.util.Optional; * as best is possible given not all the data is available at the same time and needs to * be reconstructed. */ -public class WarcRecordingFetcherClient implements AutoCloseable { +public class WarcRecorder implements AutoCloseable { private static final int MAX_TIME = 30_000; private static final int MAX_SIZE = 1024 * 1024 * 10; private final WarcWriter writer; + private final Path warcFile; + private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class); - private final EdgeDomain domain; - private static final Logger logger = LoggerFactory.getLogger(WarcRecordingFetcherClient.class); + private ThreadLocal bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]); + private boolean temporaryFile = false; - public WarcRecordingFetcherClient(Path warcFile, EdgeDomain domain) throws IOException { - this.writer = new WarcWriter(warcFile); - this.domain = domain; + /** + * Create a new WarcRecorder that will write to the given file + * + * @param warcFile The file to write to + */ + public WarcRecorder(Path warcFile) throws IOException { + this.warcFile = warcFile; + this.writer = new WarcWriter(this.warcFile); } - public Optional fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + /** + * Create a new WarcRecorder that will write to a temporary file + * and delete it when close() is called. + */ + public WarcRecorder() throws IOException { + this.warcFile = Files.createTempFile("warc", ".warc.gz"); + this.writer = new WarcWriter(this.warcFile); + + temporaryFile = true; + } + + public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { URI uri = request.url().uri(); WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); @@ -52,20 +65,15 @@ public class WarcRecordingFetcherClient implements AutoCloseable { Instant date = Instant.now(); long startMillis = date.toEpochMilli(); - Path tempFileName = Files.createTempFile(domain.toString(), ".data"); - var call = client.newCall(request); int totalLength = 0; WarcTruncationReason truncationReason = null; + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); - - try (FileChannel tempFile = - (FileChannel) Files.newByteChannel(tempFileName, StandardOpenOption.READ, StandardOpenOption.WRITE); - var response = call.execute() - ) { + try (var response = call.execute()) { var body = response.body(); InputStream inputStream; @@ -82,29 +90,27 @@ public class WarcRecordingFetcherClient implements AutoCloseable { ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response); - tempFile.write(ByteBuffer.wrap(responseHeaders.getBytes())); - responseDigestBuilder.update(responseHeaders); + + responseDataBuffer.put(responseHeaders); + responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseDataBuffer.length()); + + int dataStart = responseDataBuffer.pos(); while (inputStream != null) { - int remainingLength; + int remainingLength = responseDataBuffer.remaining(); + if (remainingLength == 0) + break; - if (MAX_SIZE > 0 && MAX_SIZE - totalLength < buf.length) { - remainingLength = (MAX_SIZE - totalLength); - } else { - remainingLength = buf.length; - } + int startPos = responseDataBuffer.pos(); - int n = inputStream.read(buf, 0, remainingLength); + int n = 
responseDataBuffer.readFrom(inputStream, remainingLength); if (n < 0) break; + responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n); + responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n); totalLength += n; - for (int i = 0; i < n; ) { - int written = tempFile.write(ByteBuffer.wrap(buf, i, n - i)); - i += written; - } - responseDigestBuilder.update(buf, n); payloadDigestBuilder.update(buf, n); @@ -118,11 +124,10 @@ public class WarcRecordingFetcherClient implements AutoCloseable { } } - tempFile.position(0); WarcResponse.Builder responseBuilder = new WarcResponse.Builder(uri) .blockDigest(responseDigestBuilder.build()) .date(date) - .body(MediaType.HTTP_RESPONSE, tempFile, tempFile.size()); + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip)); @@ -133,6 +138,8 @@ public class WarcRecordingFetcherClient implements AutoCloseable { // Build and write the response + long pos = writer.position(); + var warcResponse = responseBuilder.build(); warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(warcResponse); @@ -154,20 +161,77 @@ public class WarcRecordingFetcherClient implements AutoCloseable { warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(warcRequest); - return Optional.of(warcResponse); + return new HttpFetchResult.ResultOk(uri, + response.code(), + response.headers(), + responseDataBuffer.data, + dataStart, + responseDataBuffer.length() - dataStart); } catch (Exception ex) { logger.warn("Failed to fetch URL {}", uri, ex); - return Optional.empty(); + return new HttpFetchResult.ResultError(ex); } - finally { - Files.deleteIfExists(tempFileName); + } + + + private class ResponseDataBuffer { + private final byte[] data; + private int length = 0; + private int pos = 0; + + public ResponseDataBuffer() { + data = bufferThreadLocal.get(); } + + public int pos() { + return pos; + } + public int length() { + return length; + } + + public void put(String s) { + byte[] bytes = s.getBytes(); + put(bytes, 0, bytes.length); + } + + private void put(byte[] bytes, int i, int n) { + System.arraycopy(bytes, i, data, pos, n); + pos += n; + length += n; + } + + public int readFrom(InputStream inputStream, int remainingLength) throws IOException { + int n = inputStream.read(data, pos, remainingLength); + if (n > 0) { + pos += n; + length += n; + } + return n; + } + + public int remaining() { + return MAX_SIZE - pos; + } + + public void updateDigest(WarcDigestBuilder digestBuilder, int startPos, int n) { + digestBuilder.update(data, startPos, n); + } + + public byte[] copyBytes() { + byte[] copy = new byte[length]; + System.arraycopy(data, 0, copy, 0, length); + return copy; + } + } public void close() { try { writer.close(); + if (temporaryFile) + Files.deleteIfExists(warcFile); } catch (IOException e) { throw new RuntimeException(e); } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java new file mode 100644 index 00000000..4a015fb9 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java @@ -0,0 +1,59 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import 
nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType; +import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok; +import nu.marginalia.model.EdgeUrl; +import okhttp3.ConnectionPool; +import okhttp3.Dispatcher; +import okhttp3.OkHttpClient; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.net.URISyntaxException; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; + +import static org.junit.jupiter.api.Assertions.*; + +class ContentTypeProberTest { + + ContentTypeProber prober; + + @BeforeEach + void setUp() { + OkHttpClient client = new OkHttpClient.Builder() + .dispatcher(new Dispatcher(Executors.newVirtualThreadPerTaskExecutor())) + .connectionPool(new ConnectionPool(0, 1, TimeUnit.NANOSECONDS)) + .build(); + + prober = new ContentTypeProber("test.marginalia.nu", client); + } + + @Test + void probeContentType() throws URISyntaxException { + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/robots.txt")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/robots.txt")), + "robots.txt is expected to pass the probing test since it's text/plain" + ); + + assertEquals( + new BadContentType("image/png", 200), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/sanic.png")), + "sanic.png is expected to pass the probing test since it's image/png" + ); + + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/dev/null")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/dev/null")), + "Despite being a 404, we expect this to be passed as OK as it's NotMyJob(TM) to verify response codes" + ); + + assertEquals( + new Ok(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi/")), + prober.probeContentType(new EdgeUrl("https://www.marginalia.nu/projects/edge/about.gmi")), + "about.gmi is expected to give a redirect to about.gmi/ which is served as text/html" + ); + + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java similarity index 88% rename from code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java rename to code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index c1129e86..80c1218d 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecordingFetcherClientTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -1,8 +1,7 @@ package nu.marginalia.crawl.retreival.fetcher; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecordingFetcherClient; -import nu.marginalia.model.EdgeDomain; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import okhttp3.OkHttpClient; import okhttp3.Request; import org.junit.jupiter.api.AfterEach; @@ -23,9 +22,9 @@ import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.assertEquals; -class WarcRecordingFetcherClientTest { +class WarcRecorderTest { Path fileName; - WarcRecordingFetcherClient client; + WarcRecorder client; OkHttpClient httpClient; @BeforeEach public void setUp() 
throws Exception { @@ -34,7 +33,7 @@ class WarcRecordingFetcherClientTest { .build(); fileName = Files.createTempFile("test", ".warc.gz"); - client = new WarcRecordingFetcherClient(fileName, new EdgeDomain("www.marginalia.nu")); + client = new WarcRecorder(fileName); } @AfterEach diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 5893910f..2f3076cd 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -4,11 +4,13 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.net.URISyntaxException; class HttpFetcherTest { @@ -28,16 +30,22 @@ class HttpFetcherTest { } @Test - void fetchUTF8() throws URISyntaxException, RateLimitException { + void fetchUTF8() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), ContentTags.empty()); - System.out.println(str.contentType); + try (var recorder = new WarcRecorder()) { + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); + System.out.println(str.contentType); + } + } @Test - void fetchText() throws URISyntaxException, RateLimitException { + void fetchText() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), ContentTags.empty()); - System.out.println(str); + + try (var recorder = new WarcRecorder()) { + var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); + System.out.println(str.contentType); + } } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index c0df397f..9a974713 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.*; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.crawling.model.SerializableCrawlData; @@ -17,11 +18,13 @@ import org.mockito.Mockito; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; 
import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.function.Consumer; public class CrawlerMockFetcherTest { @@ -60,42 +63,46 @@ public class CrawlerMockFetcherTest { } + void crawl(CrawlSpecRecord spec, Consumer consumer) throws IOException { + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(fetcherMock, spec, recorder, consumer) + .fetch(); + } + } + @Test - public void testLemmy() throws URISyntaxException { + public void testLemmy() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add) - .fetch(); + crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add); out.forEach(System.out::println); } @Test - public void testMediawiki() throws URISyntaxException { + public void testMediawiki() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add) - .fetch(); + crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add); out.forEach(System.out::println); } @Test - public void testDiscourse() throws URISyntaxException { + public void testDiscourse() throws URISyntaxException, IOException { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - new CrawlerRetreiver(fetcherMock, new CrawlSpecRecord("community.tt-rss.org", 100, new ArrayList<>()), out::add) - .fetch(); + crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()), out::add); out.forEach(System.out::println); } @@ -118,7 +125,7 @@ public class CrawlerMockFetcherTest { } @Override - public CrawledDocument fetchContent(EdgeUrl url, ContentTags tags) { + public CrawledDocument fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { return mockData.get(url); @@ -135,7 +142,7 @@ public class CrawlerMockFetcherTest { } @Override - public SimpleRobotRules fetchRobotRules(EdgeDomain domain) { + public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { return new SimpleRobotRules(); } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 147aca68..3b58d50f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -7,6 +7,7 @@ 
import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawling.model.CrawledDocument; @@ -14,16 +15,17 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRequest; +import org.netpreserve.jwarc.WarcResponse; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @Tag("slow") @@ -42,6 +44,53 @@ class CrawlerRetreiverTest { System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); } + @Test + public void testWarcOutput() throws IOException { + var specs = CrawlSpecRecord + .builder() + .crawlDepth(5) + .domain("www.marginalia.nu") + .urls(List.of("https://www.marginalia.nu/misc/debian-laptop-install-log/")) + .build(); + Path tempFile = null; + try { + tempFile = Files.createTempFile("crawling-process", "warc"); + + List data = new ArrayList<>(); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + } catch (IOException ex) { + Assertions.fail(ex); + } + + Set requests = new HashSet<>(); + Set responses = new HashSet<>(); + + try (var reader = new WarcReader(tempFile)) { + reader.forEach(record -> { + if (record instanceof WarcRequest req) { + requests.add(req.target()); + System.out.println(req.type() + ":" + req.target()); + } + else if (record instanceof WarcResponse rsp) { + responses.add(rsp.target()); + System.out.println(rsp.type() + ":" + rsp.target()); + } + else { + System.out.println(record.type()); + } + }); + } + + assertTrue(requests.contains("https://www.marginalia.nu/misc/debian-laptop-install-log/")); + assertEquals(requests, responses); + } + finally { + if (tempFile != null) + Files.deleteIfExists(tempFile); + } + } @Test public void testWithKnownDomains() { var specs = CrawlSpecRecord @@ -53,7 +102,12 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } var fetchedUrls = data.stream().filter(CrawledDocument.class::isInstance) @@ -82,7 +136,12 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - new CrawlerRetreiver(httpFetcher, specs, data::add).fetch(); + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(httpFetcher, specs, recorder, data::add).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) @@ -118,16 +177,23 @@ class CrawlerRetreiverTest { var writer = new CrawledDomainWriter(out, specs.domain, "idid"); Map, List> data = 
new HashMap<>(); - new CrawlerRetreiver(httpFetcher, specs, d -> { - data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); - if (Math.random() > 0.5) { - doc.headers = ""; + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(httpFetcher, specs, recorder, d -> { + data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); + if (d instanceof CrawledDocument doc) { + System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + if (Math.random() > 0.5) { + doc.headers = ""; + } } - } - writer.accept(d); - }).fetch(); + writer.accept(d); + }).fetch(); + } + catch (IOException ex) { + Assertions.fail(ex); + } + + writer.close(); var reader = new CrawledDomainReader(); @@ -135,12 +201,15 @@ class CrawlerRetreiverTest { CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); - - new CrawlerRetreiver(httpFetcher, specs, d -> { - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); - } - }).fetch(new DomainLinks(), new CrawlDataReference(stream)); - + try (var recorder = new WarcRecorder()) { + new CrawlerRetreiver(httpFetcher, specs, recorder, d -> { + if (d instanceof CrawledDocument doc) { + System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); + } + }).fetch(new DomainLinks(), new CrawlDataReference(stream)); + } + catch (IOException ex) { + Assertions.fail(ex); + } } } \ No newline at end of file From 968dce50fc905c9f63cfe140d998b69b9820c76e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 8 Dec 2023 17:45:46 +0100 Subject: [PATCH 07/23] (crawler) Refactored IpInterceptingNetworkInterceptor for clarity. --- .../socket/IpInterceptingNetworkInterceptor.java | 11 +++++++++-- .../crawl/retreival/fetcher/socket/NoSecuritySSL.java | 1 - 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java index d918afd1..90f43e5c 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/IpInterceptingNetworkInterceptor.java @@ -6,7 +6,14 @@ import org.jetbrains.annotations.NotNull; import java.io.IOException; + +/** An interceptor that intercepts network requests and adds the remote IP address as + * a header in the response. This is used to pass the remote IP address to the Warc + * writer, as this information is not available in the response. 
+ */ public class IpInterceptingNetworkInterceptor implements Interceptor { + private static final String pseudoHeaderName = "X-Marginalia-Remote-IP"; + @NotNull @Override public Response intercept(@NotNull Interceptor.Chain chain) throws IOException { @@ -14,11 +21,11 @@ public class IpInterceptingNetworkInterceptor implements Interceptor { return chain.proceed(chain.request()) .newBuilder() - .addHeader("X-Marginalia-Remote-IP", IP) + .addHeader(pseudoHeaderName, IP) .build(); } public static String getIpFromResponse(Response response) { - return response.header("X-Marginalia-Remote-IP"); + return response.header(pseudoHeaderName); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java index 45dc431c..b6b8a589 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/socket/NoSecuritySSL.java @@ -29,7 +29,6 @@ public class NoSecuritySSL { } }; - @SneakyThrows public static SSLSocketFactory buildSocketFactory() { // Install the all-trusting trust manager From e6a1052ba7027e28e1788aff79e8f1cc39b5e50b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 8 Dec 2023 20:24:01 +0100 Subject: [PATCH 08/23] Simplify CrawlerMain, removing the CrawlerLimiter and using a global HttpFetcher with a virtual thread pool dispatcher instead of the default. --- .../nu/marginalia/crawl/CrawlLimiter.java | 83 ------------------- .../java/nu/marginalia/crawl/CrawlerMain.java | 25 +++--- 2 files changed, 10 insertions(+), 98 deletions(-) delete mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java deleted file mode 100644 index 1b61cb0d..00000000 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlLimiter.java +++ /dev/null @@ -1,83 +0,0 @@ -package nu.marginalia.crawl; - -import lombok.SneakyThrows; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.concurrent.Semaphore; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; - -public class CrawlLimiter { - public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 256); - - // Thresholds for throttling task-spawning. 
Note there's a bit of hysteresis to this - private final long THROTTLE_TRIGGER_FREE_RAM = Runtime.getRuntime().maxMemory() / 4; - private final long THROTTLE_RELEASE_FREE_RAM = Runtime.getRuntime().maxMemory() / 2; - - private final Semaphore taskSemCount = new Semaphore(maxPoolSize); - - // When set to true, the crawler will wait before starting additional tasks - private final AtomicBoolean throttle = new AtomicBoolean(false); - private static final Logger logger = LoggerFactory.getLogger(CrawlLimiter.class); - - public CrawlLimiter() { - Thread monitorThread = new Thread(this::monitor, "Memory Monitor"); - monitorThread.setDaemon(true); - monitorThread.start(); - } - - - @SneakyThrows - public void monitor() { - for (;;) { - synchronized (throttle) { - boolean oldThrottle = throttle.get(); - boolean newThrottle = oldThrottle; - - if (Runtime.getRuntime().maxMemory() == Long.MAX_VALUE) { - // According to the spec this may happen, although it seems to rarely - // be the case in practice - logger.warn("Memory based throttling disabled (set Xmx)"); - return; - } - - final long freeMemory = Runtime.getRuntime().maxMemory() - Runtime.getRuntime().totalMemory(); - - if (oldThrottle && freeMemory > THROTTLE_RELEASE_FREE_RAM) { - newThrottle = false; - logger.warn("Memory based throttling released"); - } - else if (!oldThrottle && freeMemory < THROTTLE_TRIGGER_FREE_RAM) { - newThrottle = true; - logger.warn("Memory based throttling triggered"); - - // Try to GC - System.gc(); - } - - - throttle.set(newThrottle); - - if (!newThrottle) { - throttle.notifyAll(); - } - if (newThrottle != oldThrottle) { - logger.warn("Memory based throttling set to {}", newThrottle); - } - } - - TimeUnit.SECONDS.sleep(1); - } - } - - @SneakyThrows - public void waitForEnoughRAM() { - while (throttle.get()) { - synchronized (throttle) { - throttle.wait(30000); - } - } - } - -} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 73126246..0a4fe32e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -50,12 +50,6 @@ public class CrawlerMain { private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class); private final ProcessHeartbeatImpl heartbeat; - private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS); - - private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS, - new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true))); - - private final UserAgent userAgent; private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final DbCrawlSpecProvider dbCrawlSpecProvider; @@ -71,7 +65,7 @@ public class CrawlerMain { volatile int totalTasks; final AtomicInteger tasksDone = new AtomicInteger(0); - private final CrawlLimiter limiter = new CrawlLimiter(); + private HttpFetcherImpl fetcher; @Inject public CrawlerMain(UserAgent userAgent, @@ -83,7 +77,6 @@ public class CrawlerMain { AnchorTagsSourceFactory anchorTagsSourceFactory, Gson gson) { this.heartbeat = heartbeat; - this.userAgent = userAgent; this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; this.dbCrawlSpecProvider = dbCrawlSpecProvider; @@ -91,8 +84,14 @@ public class 
CrawlerMain { this.gson = gson; this.node = processConfiguration.node(); - // maybe need to set -Xss for JVM to deal with this? - pool = new SimpleBlockingThreadPool("CrawlerPool", CrawlLimiter.maxPoolSize, 1); + pool = new SimpleBlockingThreadPool("CrawlerPool", + Integer.getInteger("crawler.pool-size", 256), + 1); + + fetcher = new HttpFetcherImpl(userAgent.uaString(), + new Dispatcher(Executors.newVirtualThreadPerTaskExecutor()), + new ConnectionPool(5, 10, TimeUnit.SECONDS) + ); } public static void main(String... args) throws Exception { @@ -173,6 +172,7 @@ public class CrawlerMain { activePoolCount = newActivePoolCount; } } + } catch (Exception ex) { logger.warn("Exception in crawler", ex); @@ -209,11 +209,6 @@ public class CrawlerMain { @Override public void run() throws Exception { - limiter.waitForEnoughRAM(); - - HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool); - - try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); var warcRecorder = new WarcRecorder(); // write to a temp file for now var retreiver = new CrawlerRetreiver(fetcher, specification, warcRecorder, writer::accept); From b74a3ebd850f7c8b43195f485916a3678406ed02 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 11 Dec 2023 19:32:58 +0100 Subject: [PATCH 09/23] (crawler) WIP integration of WARC files into the crawler process. At this stage, the crawler will use the WARCs to resume a crawl if it terminates abnormally. This is a WIP commit: since the warc files are not yet fully incorporated into the workflow, they are deleted after the domain is crawled. The commit also includes fairly invasive refactoring of the crawler classes, to achieve better separation of concerns. --- .../java/nu/marginalia/model/EdgeUrl.java | 20 +- .../crawling/io/CrawlerOutputFile.java | 25 +- .../crawling/model/CrawledDomain.java | 4 + .../converting/processor/DomainProcessor.java | 2 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 32 +- .../{fetcher => }/CrawledDocumentFactory.java | 18 +- .../crawl/retreival/CrawlerRetreiver.java | 276 +++--------------- .../retreival/CrawlerWarcResynchronizer.java | 110 +++++++ .../crawl/retreival/DomainCrawlFrontier.java | 28 ++ .../crawl/retreival/fetcher/HttpFetcher.java | 1 + .../retreival/fetcher/HttpFetcherImpl.java | 64 ++-- .../fetcher/body/DocumentBodyExtractor.java | 44 +++ .../fetcher/body/DocumentBodyResult.java | 8 + .../fetcher/warc/HttpFetchResult.java | 55 ++++ .../warc/WarcProtocolReconstructor.java | 18 ++ .../retreival/fetcher/warc/WarcRecorder.java | 67 ++++- .../retreival/revisit/CrawlerRevisitor.java | 123 ++++++++ .../revisit/DocumentWithReference.java | 82 ++++++ .../retreival/sitemap/SitemapFetcher.java | 71 +++++ .../CrawlerWarcResynchronizerTest.java | 88 ++++++ .../retreival/fetcher/WarcRecorderTest.java | 30 ++ .../retreival/CrawlerMockFetcherTest.java | 2 + 22 files changed, 858 insertions(+), 310 deletions(-) rename code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/{fetcher => }/CrawledDocumentFactory.java (81%) create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java create mode 100644 
code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java create mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java create mode 100644 code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java diff --git a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java index f0f23956..c09ed550 100644 --- a/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/src/main/java/nu/marginalia/model/EdgeUrl.java @@ -224,19 +224,19 @@ public class EdgeUrl implements Serializable { } public URL asURL() throws MalformedURLException { - int port = this.port != null ? this.port : switch(proto) { - case "http" -> 80; - case "https" -> 443; - default -> 0; - }; - - return new URL(this.proto, this.domain.toString(), port, this.path); + try { + return asURI().toURL(); + } + catch (URISyntaxException e) { + throw new MalformedURLException(e.getMessage()); + } } public URI asURI() throws URISyntaxException { - if (port == null) - return new URI(this.proto, null, this.domain.toString(), this.path, this.param); - else + if (port != null) { return new URI(this.proto, null, this.domain.toString(), this.port, this.path, this.param, null); + } + + return new URI(this.proto, this.domain.toString(), this.path, this.param, null); } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index a7661085..67e8738c 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -14,7 +14,7 @@ public class CrawlerOutputFile { String second = id.substring(2, 4); Path destDir = base.resolve(first).resolve(second); - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); } /** Return the Path to a file for the given id and name, creating the prerequisite @@ -31,7 +31,7 @@ public class CrawlerOutputFile { if (!Files.exists(destDir)) { Files.createDirectories(destDir); } - return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd"); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); } @@ -49,4 +49,25 @@ public class CrawlerOutputFile { } + public static Path createWarcFile(Path baseDir, String id, String name, WarcFileVersion version) { + if (id.length() < 4) { + id = Strings.repeat("0", 4 - id.length()) + id; + } + + String fileName = STR."\{id}-\{filesystemSafeName(name)}.zstd\{version.suffix}"; + + return baseDir.resolve(fileName); + } + + public enum WarcFileVersion { + LIVE(".open"), + TEMP(".tmp"), + FINAL(""); + + public final String suffix; + + WarcFileVersion(String suffix) { + this.suffix = suffix; + } + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java index 482311c1..55ec27a6 100644 --- 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -24,6 +24,10 @@ public class CrawledDomain implements SerializableCrawlData { return doc.size(); } + public boolean hasCookies() { + return cookies != null && !cookies.isEmpty(); + } + public static final String SERIAL_IDENTIFIER = "// DOMAIN"; @Override public String getSerialIdentifier() { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index fc824906..fea8f69a 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -79,7 +79,7 @@ public class DomainProcessor { ret.domain = new EdgeDomain(crawledDomain.domain); ret.ip = crawledDomain.ip; - cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0; + cookies = crawledDomain.hasCookies(); ip = crawledDomain.ip; if (crawledDomain.redirectDomain != null) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index b3a9d26a..a5d78a1f 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -18,6 +18,7 @@ import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.CrawlerOutputFile; import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; @@ -30,16 +31,16 @@ import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawl.retreival.CrawlerRetreiver; -import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.util.SimpleBlockingThreadPool; import okhttp3.ConnectionPool; import okhttp3.Dispatcher; -import okhttp3.internal.Util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import java.sql.SQLException; import java.util.*; import java.util.concurrent.*; @@ -212,8 +213,19 @@ public class CrawlerMain { @Override public void run() throws Exception { + Path newWarcFile = CrawlerOutputFile.createWarcFile(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); + Path tempFile = CrawlerOutputFile.createWarcFile(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); + Path finalWarcFile = CrawlerOutputFile.createWarcFile(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + + if (Files.exists(newWarcFile)) { + Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING); + } + else { + Files.deleteIfExists(tempFile); + } + try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); - var warcRecorder = new 
WarcRecorder(); // write to a temp file for now + var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder, writer::accept); CrawlDataReference reference = getReference()) { @@ -221,19 +233,33 @@ public class CrawlerMain { var domainLinks = anchorTagsSource.getAnchorTags(domain); + if (Files.exists(tempFile)) { + retreiver.syncAbortedRun(tempFile); + Files.delete(tempFile); + } + int size = retreiver.fetch(domainLinks, reference); + Files.move(newWarcFile, finalWarcFile, StandardCopyOption.REPLACE_EXISTING); + workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); } catch (Exception e) { logger.error("Error fetching domain " + domain, e); + Files.deleteIfExists(newWarcFile); + if (tempFile != null) { + Files.deleteIfExists(tempFile); + } } finally { // We don't need to double-count these; it's also kept int he workLog processingIds.remove(domain); Thread.currentThread().setName("[idle]"); + + // FIXME: Remove this when we're done + Files.deleteIfExists(finalWarcFile); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java similarity index 81% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java rename to code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java index 8a654e20..b3ab9ee5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentFactory.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.fetcher; +package nu.marginalia.crawl.retreival; import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; import nu.marginalia.crawling.model.CrawledDocument; @@ -70,6 +70,22 @@ public class CrawledDocumentFactory { .httpStatus(rsp.statusCode()) .url(url.toString()) .build(); + } + public static CrawledDocument createRobotsError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(-1) + .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) + .build(); + } + public static CrawledDocument createRetryError(EdgeUrl url) { + return CrawledDocument.builder() + .url(url.toString()) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(429) + .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) + .build(); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index bb4991b9..30054008 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -9,6 +9,9 @@ import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; 
+import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; +import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; +import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.crawling.model.*; import nu.marginalia.ip_blocklist.UrlBlocklist; @@ -20,12 +23,9 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.annotation.Nullable; -import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.file.Path; -import java.time.LocalDateTime; import java.util.*; import java.util.function.Consumer; @@ -46,18 +46,13 @@ public class CrawlerRetreiver implements AutoCloseable { private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); private final DomainProber domainProber; - private final SitemapRetriever sitemapRetriever; private final DomainCrawlFrontier crawlFrontier; private final WarcRecorder warcRecorder; + private final CrawlerRevisitor crawlerRevisitor; + private final SitemapFetcher sitemapFetcher; int errorCount = 0; - /** recrawlState tag for documents that had a HTTP status 304 */ - private static final String documentWasRetainedTag = "RETAINED/304"; - - /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ - private static final String documentWasSameTag = "SAME-BY-COMPARISON"; - public CrawlerRetreiver(HttpFetcher fetcher, DomainProber domainProber, CrawlSpecRecord specs, @@ -72,8 +67,10 @@ public class CrawlerRetreiver implements AutoCloseable { crawledDomainWriter = writer; - this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); - sitemapRetriever = fetcher.createSitemapRetriever(); + + crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); + crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, crawledDomainWriter, this, warcRecorder); + sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever()); // We must always crawl the index page first, this is assumed when fingerprinting the server var fst = crawlFrontier.peek(); @@ -125,6 +122,12 @@ public class CrawlerRetreiver implements AutoCloseable { }; } + public void syncAbortedRun(Path warcFile) { + var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder); + + resync.run(warcFile); + } + private int crawlDomain(CrawlDataReference oldCrawlData, EdgeUrl rootUrl, DomainLinks domainLinks) { String ip = findIp(domain); @@ -147,9 +150,15 @@ public class CrawlerRetreiver implements AutoCloseable { crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto)); // Add links from the sitemap to the crawl frontier - downloadSitemaps(robotsRules, rootUrl); + sitemapFetcher.downloadSitemaps(robotsRules, rootUrl); - CrawledDomain ret = new CrawledDomain(domain, null, CrawlerDomainStatus.OK.name(), null, ip, new ArrayList<>(), null); + CrawledDomain ret = new CrawledDomain(domain, + null, + CrawlerDomainStatus.OK.name(), + null, + ip, + new ArrayList<>(), + null); int fetchedCount = recrawled; @@ -161,7 +170,7 @@ public class CrawlerRetreiver implements AutoCloseable { var top = crawlFrontier.takeNextUrl(); if (!robotsRules.isAllowed(top.toString())) { - crawledDomainWriter.accept(createRobotsError(top)); + crawledDomainWriter.accept(CrawledDocumentFactory.createRobotsError(top)); 
continue; } @@ -196,119 +205,9 @@ public class CrawlerRetreiver implements AutoCloseable { return fetchedCount; } - /** Performs a re-crawl of old documents, comparing etags and last-modified */ - private int recrawl(CrawlDataReference oldCrawlData, - SimpleRobotRules robotsRules, - CrawlDelayTimer delayTimer) { - int recrawled = 0; - int retained = 0; - - for (;;) { - CrawledDocument doc = oldCrawlData.nextDocument(); - - if (doc == null) { - break; - } - - // This Shouldn't Happen (TM) - var urlMaybe = EdgeUrl.parse(doc.url); - if (urlMaybe.isEmpty()) continue; - var url = urlMaybe.get(); - - // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again - if (doc.httpStatus == 404) { - crawlFrontier.addVisited(url); - continue; - } - - if (doc.httpStatus != 200) continue; - - if (!robotsRules.isAllowed(url.toString())) { - crawledDomainWriter.accept(createRobotsError(url)); - continue; - } - if (!crawlFrontier.filterLink(url)) - continue; - if (!crawlFrontier.addVisited(url)) - continue; - - - if (recrawled > 5 - && retained > 0.9 * recrawled - && Math.random() < 0.9) - { - // Since it looks like most of these documents haven't changed, - // we'll load the documents directly; but we do this in a random - // fashion to make sure we eventually catch changes over time - - crawledDomainWriter.accept(doc); - crawlFrontier.addVisited(url); - continue; - } - - - // GET the document with the stored document as a reference - // providing etag and last-modified headers, so we can recycle the - // document if it hasn't changed without actually downloading it - - var fetchedDocOpt = fetchWriteAndSleep(url, - delayTimer, - new DocumentWithReference(doc, oldCrawlData)); - if (fetchedDocOpt.isEmpty()) continue; - - if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - - recrawled ++; - } - - return recrawled; - } - - private void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) { - List sitemaps = robotsRules.getSitemaps(); - - List urls = new ArrayList<>(sitemaps.size()); - if (!sitemaps.isEmpty()) { - for (var url : sitemaps) { - EdgeUrl.parse(url).ifPresent(urls::add); - } - } - else { - urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); - } - - downloadSitemaps(urls); - } - - private void downloadSitemaps(List urls) { - - Set checkedSitemaps = new HashSet<>(); - - for (var url : urls) { - // Let's not download sitemaps from other domains for now - if (!crawlFrontier.isSameDomain(url)) { - continue; - } - - if (checkedSitemaps.contains(url.path)) - continue; - - var sitemap = sitemapRetriever.fetchSitemap(url); - if (sitemap.isEmpty()) { - continue; - } - - // ensure we don't try to download this sitemap again - // (don't move this up, as we may want to check the same - // path with different protocols until we find one that works) - - checkedSitemaps.add(url.path); - - crawlFrontier.addAllToQueue(sitemap); - } - - logger.debug("Queue is now {}", crawlFrontier.queueSize()); + /** Using the old crawl data, fetch the documents comparing etags and last-modified */ + private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) { + return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); } private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) { @@ -345,7 +244,7 @@ public class CrawlerRetreiver implements AutoCloseable { linkParser.parseLink(url, href) 
.filter(crawlFrontier::isSameDomain) .map(List::of) - .ifPresent(this::downloadSitemaps); + .ifPresent(sitemapFetcher::downloadSitemaps); } } catch (Exception ex) { @@ -353,7 +252,7 @@ public class CrawlerRetreiver implements AutoCloseable { } } - private Optional fetchWriteAndSleep(EdgeUrl top, + public Optional fetchWriteAndSleep(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { logger.debug("Fetching {}", top); @@ -365,11 +264,11 @@ public class CrawlerRetreiver implements AutoCloseable { if (docOpt.isPresent()) { var doc = docOpt.get(); - if (!Objects.equals(doc.recrawlState, documentWasRetainedTag) + if (!Objects.equals(doc.recrawlState, CrawlerRevisitor.documentWasRetainedTag) && reference.isContentBodySame(doc)) { // The document didn't change since the last time - doc.recrawlState = documentWasSameTag; + doc.recrawlState = CrawlerRevisitor.documentWasSameTag; } crawledDomainWriter.accept(doc); @@ -408,7 +307,7 @@ public class CrawlerRetreiver implements AutoCloseable { var parsedDoc = Jsoup.parse(doc.documentBody); EdgeUrl url = new EdgeUrl(doc.url); - findLinks(url, parsedDoc); + crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); findCanonicalUrl(url, parsedDoc) .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString()); } @@ -442,34 +341,13 @@ public class CrawlerRetreiver implements AutoCloseable { } } - return createRetryError(top); + return CrawledDocumentFactory.createRetryError(top); } private String createHash(String documentBodyHash) { return hashMethod.hashUnencodedChars(documentBodyHash).toString(); } - private void findLinks(EdgeUrl baseUrl, Document parsed) { - baseUrl = linkParser.getBaseLink(parsed, baseUrl); - - for (var link : parsed.getElementsByTag("a")) { - linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("frame")) { - linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("iframe")) { - linkParser.parseFrame(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - for (var link : parsed.getElementsByTag("link")) { - String rel = link.attr("rel"); - - if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) { - linkParser.parseLink(baseUrl, link).ifPresent(crawlFrontier::addToQueue); - } - } - } - private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { baseUrl = baseUrl.domain.toRootUrl(); @@ -488,97 +366,9 @@ public class CrawlerRetreiver implements AutoCloseable { } } - private CrawledDocument createRobotsError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(-1) - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .build(); - } - private CrawledDocument createRetryError(EdgeUrl url) { - return CrawledDocument.builder() - .url(url.toString()) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(429) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); - } - @Override public void close() throws Exception { warcRecorder.close(); } - private record DocumentWithReference( - @Nullable CrawledDocument doc, - @Nullable CrawlDataReference reference) { - - private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null); - public static DocumentWithReference empty() { - return emptyInstance; - } - - public boolean isContentBodySame(CrawledDocument newDoc) { - if (reference == null) - return false; - if (doc == null) - return false; - 
if (doc.documentBody == null) - return false; - if (newDoc.documentBody == null) - return false; - - return reference.isContentBodySame(doc, newDoc); - } - - private ContentTags getContentTags() { - if (null == doc) - return ContentTags.empty(); - - String headers = doc.headers; - if (headers == null) - return ContentTags.empty(); - - String[] headersLines = headers.split("\n"); - - String lastmod = null; - String etag = null; - - for (String line : headersLines) { - if (line.toLowerCase().startsWith("etag:")) { - etag = line.substring(5).trim(); - } - if (line.toLowerCase().startsWith("last-modified:")) { - lastmod = line.substring(14).trim(); - } - } - - return new ContentTags(etag, lastmod); - } - - public boolean isEmpty() { - return doc == null || reference == null; - } - - /** If the provided document has HTTP status 304, and the reference document is provided, - * return the reference document; otherwise return the provided document. - */ - public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { - - if (doc == null) - return fetchedDoc; - - // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when - // we fetched it last time. We can recycle the reference document. - if (fetchedDoc.httpStatus != 304) - return fetchedDoc; - - var ret = doc; - ret.recrawlState = documentWasRetainedTag; - ret.timestamp = LocalDateTime.now().toString(); - return ret; - } - } - } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java new file mode 100644 index 00000000..01bafbe1 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -0,0 +1,110 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyExtractor; +import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyResult; +import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; + +/** + * This class is responsible for resynchronizing the crawl frontier with a partially written + * warc file. This may happen if the crawl is interrupted or crashes. + *
+ * This is best-effort and not guaranteed to recover all data, but it should limit + * the amount of data that is lost and needs to be re-crawled in the event of an unexpected + * shutdown. + */ +public class CrawlerWarcResynchronizer { + private final DomainCrawlFrontier crawlFrontier; + private final WarcRecorder recorder; + private static final Logger logger = LoggerFactory.getLogger(CrawlerWarcResynchronizer.class); + public CrawlerWarcResynchronizer(DomainCrawlFrontier crawlFrontier, WarcRecorder recorder) { + this.crawlFrontier = crawlFrontier; + this.recorder = recorder; + } + + public void run(Path tempFile) { + // First pass, enqueue links + try (var reader = new WarcReader(tempFile)) { + for (var item : reader) { + accept(item); + } + } catch (IOException e) { + logger.info(STR."Failed read full warc file \{tempFile}", e); + } + + // Second pass, copy records to the new warc file + try (var reader = new WarcReader(tempFile)) { + for (var item : reader) { + recorder.resync(item); + } + } catch (IOException e) { + logger.info(STR."Failed read full warc file \{tempFile}", e); + } + } + + public void accept(WarcRecord item) { + try { + if (item instanceof WarcResponse rsp) { + response(rsp); + } else if (item instanceof WarcRevisit revisit) { + revisit(revisit); + } else if (item instanceof WarcRequest req) { + request(req); + } + } + catch (Exception ex) { + logger.info(STR."Failed to process warc record \{item}", ex); + } + } + + private void request(WarcRequest request) { + EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited); + } + + private void response(WarcResponse rsp) { + var url = new EdgeUrl(rsp.targetURI()); + + crawlFrontier.addVisited(url); + + try { + var response = HttpFetchResult.importWarc(rsp); + if (DocumentBodyExtractor.extractBody(response) instanceof DocumentBodyResult.Ok ok) { + var doc = Jsoup.parse(ok.body()); + crawlFrontier.enqueueLinksFromDocument(url, doc); + } + } + catch (Exception e) { + logger.info(STR."Failed to parse response body for \{url}", e); + } + } + + private void revisit(WarcRevisit revisit) throws IOException { + if (!WarcRecorder.revisitURI.equals(revisit.profile())) { + return; + } + + var url = new EdgeUrl(revisit.targetURI()); + + crawlFrontier.addVisited(url); + + try { + var response = HttpFetchResult.importWarc(revisit); + if (DocumentBodyExtractor.extractBody(response) instanceof DocumentBodyResult.Ok ok) { + var doc = Jsoup.parse(ok.body()); + crawlFrontier.enqueueLinksFromDocument(url, doc); + } + } + catch (Exception e) { + logger.info(STR."Failed to parse response body for \{url}", e); + } + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 30902a8e..6d868fdf 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -3,14 +3,19 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.ip_blocklist.UrlBlocklist; +import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import org.jsoup.nodes.Document; import java.net.URISyntaxException; import java.util.*; import java.util.function.Predicate; public class 
DomainCrawlFrontier { + + private static final LinkParser linkParser = new LinkParser(); + private final ArrayDeque queue; // To save the number of strings kept in memory, @@ -141,4 +146,27 @@ public class DomainCrawlFrontier { public int queueSize() { return queue.size(); } + + + public void enqueueLinksFromDocument(EdgeUrl baseUrl, Document parsed) { + baseUrl = linkParser.getBaseLink(parsed, baseUrl); + + for (var link : parsed.getElementsByTag("a")) { + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("frame")) { + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("iframe")) { + linkParser.parseFrame(baseUrl, link).ifPresent(this::addToQueue); + } + for (var link : parsed.getElementsByTag("link")) { + String rel = link.attr("rel"); + + if (rel.equalsIgnoreCase("next") || rel.equalsIgnoreCase("prev")) { + linkParser.parseLink(baseUrl, link).ifPresent(this::addToQueue); + } + } + } + } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index 8fc288f9..be815954 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import java.nio.file.Path; import java.util.List; @ImplementedBy(HttpFetcherImpl.class) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 8ff9dd12..3faffe4a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -5,22 +5,21 @@ import com.google.inject.name.Named; import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRulesParser; import lombok.SneakyThrows; -import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; +import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyExtractor; +import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyResult; import nu.marginalia.crawl.retreival.fetcher.socket.*; import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; -import static nu.marginalia.crawl.retreival.fetcher.CrawledDocumentFactory.*; +import static nu.marginalia.crawl.retreival.CrawledDocumentFactory.*; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; -import nu.marginalia.contenttype.ContentTypeParser; import okhttp3.*; -import org.apache.commons.io.input.BOMInputStream; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; 
import org.slf4j.LoggerFactory; @@ -34,7 +33,6 @@ import java.nio.charset.IllegalCharsetNameException; import java.time.LocalDateTime; import java.util.*; import java.util.concurrent.TimeUnit; -import java.util.zip.GZIPInputStream; public class HttpFetcherImpl implements HttpFetcher { @@ -45,7 +43,7 @@ public class HttpFetcherImpl implements HttpFetcher { private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser(); - private final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private final ContentTypeProber contentTypeProber; @Override @@ -188,14 +186,14 @@ public class HttpFetcherImpl implements HttpFetcher { } else if (result instanceof HttpFetchResult.ResultOk ok) { try { - return extractBody(url, ok); + return extractBody(userAgent, url, ok); } catch (Exception ex) { return createErrorFromException(url, ex); } } else { - throw new IllegalStateException("Unknown result type " + result.getClass()); + throw new IllegalStateException(STR."Unknown result type \{result.getClass()}"); } } @@ -216,7 +214,7 @@ public class HttpFetcherImpl implements HttpFetcher { }; } - private CrawledDocument extractBody(EdgeUrl url, HttpFetchResult.ResultOk rsp) throws IOException, RateLimitException { + public static CrawledDocument extractBody(String userAgent, EdgeUrl url, HttpFetchResult.ResultOk rsp) throws IOException, RateLimitException { var responseUrl = new EdgeUrl(rsp.uri()); @@ -230,29 +228,6 @@ public class HttpFetcherImpl implements HttpFetcher { throw new RateLimitException(retryAfter); } - var byteStream = rsp.getInputStream(); - - if ("gzip".equals(rsp.header("Content-Encoding"))) { - byteStream = new GZIPInputStream(byteStream); - } - byteStream = new BOMInputStream(byteStream); - - var contentTypeHeader = rsp.header("Content-Type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder - - var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return createErrorResponse(url, rsp, CrawlerDocumentStatus.BAD_CHARSET, ""); - } - if (!isXRobotsTagsPermitted(rsp.allHeaders("X-Robots-Tag"), userAgent)) { return CrawledDocument.builder() .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) @@ -264,17 +239,20 @@ public class HttpFetcherImpl implements HttpFetcher { .build(); } - var strData = DocumentBodyToString.getStringData(contentType, data); - - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .headers(rsp.headers().toString()) - .contentType(contentTypeHeader) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.statusCode()) - .url(responseUrl.toString()) - .documentBody(strData) - .build(); + return switch(DocumentBodyExtractor.extractBody(rsp)) { + case DocumentBodyResult.Error(CrawlerDocumentStatus status, String why) -> + createErrorResponse(url, rsp, status, why); + case DocumentBodyResult.Ok(String contentType, String body) -> + CrawledDocument.builder() + .crawlerStatus(CrawlerDocumentStatus.OK.name()) + .headers(rsp.headers().toString()) + 
.contentType(contentType) + .timestamp(LocalDateTime.now().toString()) + .httpStatus(rsp.statusCode()) + .url(responseUrl.toString()) + .documentBody(body) + .build(); + }; } /** Check X-Robots-Tag header tag to see if we are allowed to index this page. diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java new file mode 100644 index 00000000..99ae2cae --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java @@ -0,0 +1,44 @@ +package nu.marginalia.crawl.retreival.fetcher.body; + +import nu.marginalia.contenttype.ContentTypeParser; +import nu.marginalia.contenttype.DocumentBodyToString; +import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; +import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import org.apache.commons.io.input.BOMInputStream; + +import java.io.IOException; +import java.util.zip.GZIPInputStream; + +public class DocumentBodyExtractor { + private static ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + + public static DocumentBodyResult extractBody(HttpFetchResult.ResultOk rsp) throws IOException { + var byteStream = rsp.getInputStream(); + + if ("gzip".equals(rsp.header("Content-Encoding"))) { + byteStream = new GZIPInputStream(byteStream); + } + byteStream = new BOMInputStream(byteStream); + + var contentTypeHeader = rsp.header("Content-Type"); + if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } + + byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder + + var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); + if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } + + if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CHARSET, ""); + } + + + return new DocumentBodyResult.Ok(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); + } + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java new file mode 100644 index 00000000..fc5d67ec --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java @@ -0,0 +1,8 @@ +package nu.marginalia.crawl.retreival.fetcher.body; + +import nu.marginalia.crawling.model.CrawlerDocumentStatus; + +public sealed interface DocumentBodyResult { + record Ok(String contentType, String body) implements DocumentBodyResult { } + record Error(CrawlerDocumentStatus status, String why) implements DocumentBodyResult { } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java index 305c05da..ae9673b1 100644 --- 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java @@ -1,13 +1,47 @@ package nu.marginalia.crawl.retreival.fetcher.warc; import okhttp3.Headers; +import org.netpreserve.jwarc.MessageHeaders; +import org.netpreserve.jwarc.WarcResponse; +import org.netpreserve.jwarc.WarcRevisit; import java.io.ByteArrayInputStream; +import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.util.List; public sealed interface HttpFetchResult { + static ResultOk importWarc(WarcResponse response) throws IOException { + var http = response.http(); + try (var body = http.body()) { + byte[] bytes = body.stream().readAllBytes(); + + return new ResultOk( + response.targetURI(), + http.status(), + http.headers(), + bytes, + 0, + bytes.length + ); + } + } + static ResultOk importWarc(WarcRevisit revisit) throws IOException { + var http = revisit.http(); + try (var body = http.body()) { + byte[] bytes = body.stream().readAllBytes(); + + return new ResultOk( + revisit.targetURI(), + http.status(), + http.headers(), + bytes, + 0, + bytes.length + ); + } + } record ResultOk(URI uri, int statusCode, Headers headers, @@ -15,6 +49,26 @@ public sealed interface HttpFetchResult { int bytesStart, int bytesLength ) implements HttpFetchResult { + + public ResultOk(URI uri, + int statusCode, + MessageHeaders headers, + byte[] bytesRaw, + int bytesStart, + int bytesLength) { + this(uri, statusCode, convertHeaders(headers), bytesRaw, bytesStart, bytesLength); + } + + private static Headers convertHeaders(MessageHeaders headers) { + var ret = new Headers.Builder(); + for (var header : headers.map().entrySet()) { + for (var value : header.getValue()) { + ret.add(header.getKey(), value); + } + } + return ret.build(); + } + public InputStream getInputStream() { return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength); } @@ -26,6 +80,7 @@ public sealed interface HttpFetchResult { return headers.values(name); } + }; record ResultError(Exception ex) implements HttpFetchResult { }; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java index 683498a0..368bf3c7 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -34,6 +34,17 @@ public class WarcProtocolReconstructor { return requestStringBuilder.toString(); } + static String getResponseHeader(String headersAsString, int code) { + String version = "1.1"; + + String statusCode = String.valueOf(code); + String statusMessage = STATUS_CODE_MAP.getOrDefault(code, "Unknown"); + + String headerString = getHeadersAsString(headersAsString); + + return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; + } + static String getResponseHeader(Response response) { String version = response.protocol() == Protocol.HTTP_1_1 ? 
"1.1" : "2.0"; @@ -99,6 +110,13 @@ public class WarcProtocolReconstructor { Map.entry(511, "Network Authentication Required") ); + static private String getHeadersAsString(String headersBlob) { + StringJoiner joiner = new StringJoiner("\r\n"); + + Arrays.stream(headersBlob.split("\n")).forEach(joiner::add); + + return joiner.toString(); + } static private String getHeadersAsString(Response response) { StringJoiner joiner = new StringJoiner("\r\n"); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index a8ee9cf9..3d4b5aaa 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -1,12 +1,14 @@ package nu.marginalia.crawl.retreival.fetcher.warc; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.model.EdgeUrl; import okhttp3.OkHttpClient; import okhttp3.Request; import org.netpreserve.jwarc.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; @@ -16,6 +18,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.security.NoSuchAlgorithmException; import java.time.Instant; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** Based on JWarc's fetch method, APL 2.0 license *

@@ -24,6 +29,8 @@ import java.time.Instant; * be reconstructed. */ public class WarcRecorder implements AutoCloseable { + public static final URI revisitURI = URI.create("urn:marginalia:revisit"); + private static final int MAX_TIME = 30_000; private static final int MAX_SIZE = 1024 * 1024 * 10; private final WarcWriter writer; @@ -85,8 +92,6 @@ public class WarcRecorder implements AutoCloseable { inputStream = body.byteStream(); } - byte[] buf = new byte[8192]; - ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response); String responseHeaders = WarcProtocolReconstructor.getResponseHeader(response); @@ -111,9 +116,6 @@ public class WarcRecorder implements AutoCloseable { responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n); totalLength += n; - responseDigestBuilder.update(buf, n); - payloadDigestBuilder.update(buf, n); - if (MAX_TIME > 0 && System.currentTimeMillis() - startMillis > MAX_TIME) { truncationReason = WarcTruncationReason.TIME; break; @@ -138,8 +140,6 @@ public class WarcRecorder implements AutoCloseable { // Build and write the response - long pos = writer.position(); - var warcResponse = responseBuilder.build(); warcResponse.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(warcResponse); @@ -174,6 +174,59 @@ public class WarcRecorder implements AutoCloseable { } } + public void resync(WarcRecord item) throws IOException { + writer.write(item); + } + + /** + * Flag the given URL as skipped by the crawler, so that it will not be retried. + * Which URLs were skipped is still important when resynchronizing on the WARC file, + * so that the crawler can avoid re-fetching them. + * + * @param url The URL to flag + * @param headers + * @param documentBody + */ + public void flagAsSkipped(EdgeUrl url, String headers, int statusCode, String documentBody) { + try { + WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); + WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); + + String header = WarcProtocolReconstructor.getResponseHeader(headers, statusCode); + ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); + responseDataBuffer.put(header); + + responseDigestBuilder.update(header); + + try (var inputStream = new ByteArrayInputStream(documentBody.getBytes())) { + int remainingLength; + while ((remainingLength = responseDataBuffer.remaining()) > 0) { + int startPos = responseDataBuffer.pos(); + + int n = responseDataBuffer.readFrom(inputStream, remainingLength); + if (n < 0) + break; + + responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n); + responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n); + } + } + + WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), revisitURI) + .blockDigest(responseDigestBuilder.build()) + .payloadDigest(payloadDigestBuilder.build()) + .date(Instant.now()) + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()) + .build(); + + revisit.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + + writer.write(revisit); + } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) { + throw new RuntimeException(e); + } + } + private class ResponseDataBuffer { private final byte[] data; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java new file mode 100644 index 00000000..c77af845 
--- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -0,0 +1,123 @@ +package nu.marginalia.crawl.retreival.revisit; + +import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.CrawlDelayTimer; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.CrawledDocumentFactory; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.model.EdgeUrl; +import org.jsoup.Jsoup; + +import java.util.function.Consumer; + +/** This class encapsulates the logic for re-visiting a domain that has already been crawled. + * We may use information from the previous crawl to inform the next crawl, specifically the + * E-Tag and Last-Modified headers. + */ +public class CrawlerRevisitor { + /** recrawlState tag for documents that had a HTTP status 304 */ + public static final String documentWasRetainedTag = "RETAINED/304"; + + /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ + public static final String documentWasSameTag = "SAME-BY-COMPARISON"; + + + private final DomainCrawlFrontier crawlFrontier; + private final Consumer crawledDomainWriter; + private final CrawlerRetreiver crawlerRetreiver; + private final WarcRecorder warcRecorder; + + public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier, + Consumer crawledDomainWriter, + CrawlerRetreiver crawlerRetreiver, + WarcRecorder warcRecorder) { + this.crawlFrontier = crawlFrontier; + this.crawledDomainWriter = crawledDomainWriter; + this.crawlerRetreiver = crawlerRetreiver; + this.warcRecorder = warcRecorder; + } + + /** Performs a re-crawl of old documents, comparing etags and last-modified */ + public int recrawl(CrawlDataReference oldCrawlData, + SimpleRobotRules robotsRules, + CrawlDelayTimer delayTimer) { + int recrawled = 0; + int retained = 0; + + for (;;) { + CrawledDocument doc = oldCrawlData.nextDocument(); + + if (doc == null) { + break; + } + + // This Shouldn't Happen (TM) + var urlMaybe = EdgeUrl.parse(doc.url); + if (urlMaybe.isEmpty()) continue; + var url = urlMaybe.get(); + + // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again + if (doc.httpStatus == 404) { + crawlFrontier.addVisited(url); + continue; + } + + if (doc.httpStatus != 200) continue; + + if (!robotsRules.isAllowed(url.toString())) { + crawledDomainWriter.accept(CrawledDocumentFactory.createRobotsError(url)); + continue; + } + if (!crawlFrontier.filterLink(url)) + continue; + if (!crawlFrontier.addVisited(url)) + continue; + + + if (recrawled > 5 + && retained > 0.9 * recrawled + && Math.random() < 0.9) + { + // Since it looks like most of these documents haven't changed, + // we'll load the documents directly; but we do this in a random + // fashion to make sure we eventually catch changes over time + // and ensure we discover new links + + crawledDomainWriter.accept(doc); + crawlFrontier.addVisited(url); + + // Hoover up any links from the document + if (doc.httpStatus == 200 && doc.documentBody != null) { + var parsedDoc = Jsoup.parse(doc.documentBody); + crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); + } + + // Add a WARC record so we don't repeat this + warcRecorder.flagAsSkipped(url, 
doc.headers, doc.httpStatus, doc.documentBody); + + continue; + } + + + // GET the document with the stored document as a reference + // providing etag and last-modified headers, so we can recycle the + // document if it hasn't changed without actually downloading it + + var fetchedDocOpt = crawlerRetreiver.fetchWriteAndSleep(url, + delayTimer, + new DocumentWithReference(doc, oldCrawlData)); + if (fetchedDocOpt.isEmpty()) continue; + + if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; + else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; + + recrawled ++; + } + + return recrawled; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java new file mode 100644 index 00000000..e832541f --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -0,0 +1,82 @@ +package nu.marginalia.crawl.retreival.revisit; + +import nu.marginalia.crawl.retreival.CrawlDataReference; +import nu.marginalia.crawl.retreival.CrawlerRetreiver; +import nu.marginalia.crawl.retreival.fetcher.ContentTags; +import nu.marginalia.crawling.model.CrawledDocument; + +import javax.annotation.Nullable; +import java.time.LocalDateTime; + +public record DocumentWithReference( + @Nullable CrawledDocument doc, + @Nullable CrawlDataReference reference) { + + private static final DocumentWithReference emptyInstance = new DocumentWithReference(null, null); + + public static DocumentWithReference empty() { + return emptyInstance; + } + + public boolean isContentBodySame(CrawledDocument newDoc) { + if (reference == null) + return false; + if (doc == null) + return false; + if (doc.documentBody == null) + return false; + if (newDoc.documentBody == null) + return false; + + return reference.isContentBodySame(doc, newDoc); + } + + public ContentTags getContentTags() { + if (null == doc) + return ContentTags.empty(); + + String headers = doc.headers; + if (headers == null) + return ContentTags.empty(); + + String[] headersLines = headers.split("\n"); + + String lastmod = null; + String etag = null; + + for (String line : headersLines) { + if (line.toLowerCase().startsWith("etag:")) { + etag = line.substring(5).trim(); + } + if (line.toLowerCase().startsWith("last-modified:")) { + lastmod = line.substring(14).trim(); + } + } + + return new ContentTags(etag, lastmod); + } + + public boolean isEmpty() { + return doc == null || reference == null; + } + + /** + * If the provided document has HTTP status 304, and the reference document is provided, + * return the reference document; otherwise return the provided document. + */ + public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { + + if (doc == null) + return fetchedDoc; + + // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when + // we fetched it last time. We can recycle the reference document. 
+ if (fetchedDoc.httpStatus != 304) + return fetchedDoc; + + var ret = doc; + ret.recrawlState = CrawlerRevisitor.documentWasRetainedTag; + ret.timestamp = LocalDateTime.now().toString(); + return ret; + } +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java new file mode 100644 index 00000000..3ce33d64 --- /dev/null +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/sitemap/SitemapFetcher.java @@ -0,0 +1,71 @@ +package nu.marginalia.crawl.retreival.sitemap; + +import crawlercommons.robots.SimpleRobotRules; +import nu.marginalia.crawl.retreival.DomainCrawlFrontier; +import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class SitemapFetcher { + + private final DomainCrawlFrontier crawlFrontier; + private final SitemapRetriever sitemapRetriever; + private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class); + + public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) { + this.crawlFrontier = crawlFrontier; + this.sitemapRetriever = sitemapRetriever; + } + + public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) { + List sitemaps = robotsRules.getSitemaps(); + + List urls = new ArrayList<>(sitemaps.size()); + if (!sitemaps.isEmpty()) { + for (var url : sitemaps) { + EdgeUrl.parse(url).ifPresent(urls::add); + } + } + else { + urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); + } + + downloadSitemaps(urls); + } + + public void downloadSitemaps(List urls) { + + Set checkedSitemaps = new HashSet<>(); + + for (var url : urls) { + // Let's not download sitemaps from other domains for now + if (!crawlFrontier.isSameDomain(url)) { + continue; + } + + if (checkedSitemaps.contains(url.path)) + continue; + + var sitemap = sitemapRetriever.fetchSitemap(url); + if (sitemap.isEmpty()) { + continue; + } + + // ensure we don't try to download this sitemap again + // (don't move this up, as we may want to check the same + // path with different protocols until we find one that works) + + checkedSitemaps.add(url.path); + + crawlFrontier.addAllToQueue(sitemap); + } + + logger.debug("Queue is now {}", crawlFrontier.queueSize()); + } +} diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java new file mode 100644 index 00000000..ae3d9be4 --- /dev/null +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizerTest.java @@ -0,0 +1,88 @@ +package nu.marginalia.crawl.retreival; + +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRequest; +import 
org.netpreserve.jwarc.WarcResponse; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.security.NoSuchAlgorithmException; +import java.util.List; +import java.util.zip.GZIPInputStream; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawlerWarcResynchronizerTest { + Path fileName; + Path outputFile; + OkHttpClient httpClient; + @BeforeEach + public void setUp() throws Exception { + httpClient = new OkHttpClient.Builder() + .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) + .build(); + + fileName = Files.createTempFile("test", ".warc.gz"); + outputFile = Files.createTempFile("test", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws Exception { + Files.deleteIfExists(fileName); + Files.deleteIfExists(outputFile); + } + + @Test + void run() throws IOException, URISyntaxException { + try (var oldRecorder = new WarcRecorder(fileName)) { + fetchUrl(oldRecorder, "https://www.marginalia.nu/"); + fetchUrl(oldRecorder, "https://www.marginalia.nu/log/"); + fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/"); + } catch (Exception e) { + fail(e); + } + + var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100); + + try (var newRecorder = new WarcRecorder(outputFile)) { + new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName); + } + + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/"))); + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/log/"))); + assertTrue(crawlFrontier.isVisited(new EdgeUrl("https://www.marginalia.nu/feed/"))); + + try (var warcReader = new WarcReader(outputFile)) { + for (var item : warcReader) { + if (item instanceof WarcRequest req) { + System.out.println("req:" + req.target()); + } + if (item instanceof WarcResponse rsp) { + System.out.println("req:" + rsp.target()); + } + } + } + + new GZIPInputStream(Files.newInputStream(outputFile)).transferTo(System.out); + } + + void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + var req = new Request.Builder().url(url) + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build(); + recorder.fetch(httpClient, req); + } +} \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index 80c1218d..55f2eebe 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.model.EdgeUrl; import okhttp3.OkHttpClient; import okhttp3.Request; import org.junit.jupiter.api.AfterEach; @@ -66,4 +67,33 @@ class WarcRecorderTest { assertEquals("https://www.marginalia.nu/", sampleData.get("request")); assertEquals("https://www.marginalia.nu/", sampleData.get("response")); } + + @Test + public void flagAsSkipped() throws IOException, URISyntaxException { + + 
try (var recorder = new WarcRecorder(fileName)) { + recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + """ + Content-type: text/html + X-Cookies: 1 + """, + 200, + "test"); + } + + try (var reader = new WarcReader(fileName)) { + for (var record : reader) { + if (record instanceof WarcResponse rsp) { + assertEquals("https://www.marginalia.nu/", rsp.target()); + assertEquals("text/html", rsp.contentType().type()); + assertEquals(200, rsp.http().status()); + assertEquals("1", rsp.http().headers().first("X-Cookies").orElse(null)); + } + } + } + + new GZIPInputStream(Files.newInputStream(fileName)).transferTo(System.out); + } + + } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index e5264301..2a00e6de 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.file.Path; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -151,5 +152,6 @@ public class CrawlerMockFetcherTest { public SitemapRetriever createSitemapRetriever() { return Mockito.mock(SitemapRetriever.class); } + } } From 440e097d7879f01a6f54b8489e3892129b3f1970 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 13 Dec 2023 15:33:42 +0100 Subject: [PATCH 10/23] (crawler) WIP integration of WARC files into the crawler and converter process. This commit is in a pretty rough state. It refactors the crawler fairly significantly to offer better separation of concerns. It replaces the zstd compressed json files used to store crawl data with WARC files entirely, and the converter is modified to be able to consume this data. This works, -ish. There appears to be some bug relating to reading robots.txt, and the X-Robots-Tag header is no longer processed either. A problem is that the WARC files are a bit too large. It will probably be necessary to introduce a new format to store the crawl data long term, something like parquet, and to use WARCs for intermediate storage to enable the crawler to be restarted without needing a recrawl.
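
Converter-side, consuming the WARC data boils down to walking the archive with jwarc and turning each successful response record back into a document, which is what the new WarcReadingSerializableCrawlDataStream does. A minimal stand-alone sketch of that read path follows; the WarcCrawlDataDump class and its main method are illustrative only and not part of this changeset, and the content-type and charset filtering done by DocumentBodyExtractor is omitted for brevity.

import org.netpreserve.jwarc.WarcReader;
import org.netpreserve.jwarc.WarcResponse;

import java.io.IOException;
import java.nio.file.Path;

class WarcCrawlDataDump {
    public static void main(String[] args) throws IOException {
        // Path to a WARC file produced by the crawler, e.g. a finished <id>-<domain>.warc.gz
        Path warcFile = Path.of(args[0]);

        try (var reader = new WarcReader(warcFile)) {
            for (var record : reader) {
                // Only successful response records are turned into documents;
                // request, warcinfo and metadata records are skipped here.
                if (record instanceof WarcResponse response) {
                    var http = response.http();
                    if (http.status() != 200)
                        continue;

                    byte[] body;
                    try (var bodyStream = http.body()) {
                        body = bodyStream.stream().readAllBytes();
                    }

                    System.out.println(response.targetURI() + " " + http.status() + " " + body.length + " bytes");
                }
            }
        }
    }
}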
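
The restart-without-recrawl case works by replaying a WARC left behind by an aborted run into the new recorder before crawling continues, so the frontier already knows what was fetched. Roughly, following the flow in CrawlerMain and CrawlerWarcResynchronizer (the CrawlResumeSketch class is illustrative, and outputDir, id, domain and crawlDepth are assumed to come from the crawl specification):

import nu.marginalia.crawl.retreival.CrawlerWarcResynchronizer;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.io.CrawlerOutputFile;
import nu.marginalia.model.EdgeDomain;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.List;

class CrawlResumeSketch {
    void resume(Path outputDir, String id, String domain, int crawlDepth) throws Exception {
        Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE);
        Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP);

        // A live WARC left behind by an aborted run is set aside before a new
        // recorder is opened on the same path.
        if (Files.exists(newWarcFile)) {
            Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING);
        }

        var frontier = new DomainCrawlFrontier(new EdgeDomain(domain), List.of(), crawlDepth);

        try (var recorder = new WarcRecorder(newWarcFile)) {
            if (Files.exists(tempFile)) {
                // Replay the aborted run: visited URLs are added to the frontier
                // and the old records are carried over into the new WARC.
                new CrawlerWarcResynchronizer(frontier, recorder).run(tempFile);
                Files.delete(tempFile);
            }

            // ... normal crawling would continue here; frontier.isVisited(url)
            // now reflects everything the aborted run already fetched.
        }
    }
}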
--- .../crawling-model/build.gradle | 5 + .../crawling/body}/ContentTypeLogic.java | 2 +- .../crawling/body/DocumentBodyExtractor.java | 60 +++++++ .../crawling/body/DocumentBodyResult.java | 23 +++ .../crawling/body}/HttpFetchResult.java | 56 +++++- .../crawling/io/CrawledDomainReader.java | 154 +++------------- .../crawling/io/CrawledDomainWriter.java | 2 +- .../crawling/io/CrawlerOutputFile.java | 37 +++- .../io/SerializableCrawlDataStream.java | 6 +- ...ileReadingSerializableCrawlDataStream.java | 70 ++++++++ ...arcReadingSerializableCrawlDataStream.java | 156 ++++++++++++++++ .../jwarc/WarcXResponseReference.java | 44 +++++ .../src/main/java/plan/CrawlPlan.java | 18 +- ...CrawlingThenConvertingIntegrationTest.java | 2 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 33 ++-- .../crawl/retreival/CrawlDataReference.java | 19 +- .../retreival/CrawledDocumentFactory.java | 2 +- .../crawl/retreival/CrawlerRetreiver.java | 168 ++++++++---------- .../retreival/CrawlerWarcResynchronizer.java | 8 +- .../retreival/fetcher/ContentTypeProber.java | 2 +- .../crawl/retreival/fetcher/HttpFetcher.java | 5 +- .../retreival/fetcher/HttpFetcherImpl.java | 168 ++++++------------ .../fetcher/body/DocumentBodyExtractor.java | 44 ----- .../fetcher/body/DocumentBodyResult.java | 8 - .../retreival/fetcher/warc/WarcRecorder.java | 165 +++++++++++++---- .../retreival/revisit/CrawlerRevisitor.java | 25 +-- .../revisit/DocumentWithReference.java | 45 +++-- .../retreival/fetcher/WarcRecorderTest.java | 34 +++- .../marginalia/crawling/HttpFetcherTest.java | 17 +- .../retreival/CrawlerMockFetcherTest.java | 49 +++-- .../retreival/CrawlerRetreiverTest.java | 139 +++++++++++---- .../actor/task/ExportAtagsActor.java | 4 +- .../nu/marginalia/tools/CrawlDataUnfcker.java | 10 +- .../tools/ExperimentRunnerMain.java | 3 +- .../nu/marginalia/tools/LegacyExperiment.java | 2 +- settings.gradle | 2 +- 36 files changed, 966 insertions(+), 621 deletions(-) rename code/{processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic => process-models/crawling-model/src/main/java/nu/marginalia/crawling/body}/ContentTypeLogic.java (97%) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java rename code/{processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc => process-models/crawling-model/src/main/java/nu/marginalia/crawling/body}/HttpFetchResult.java (63%) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java create mode 100644 code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java delete mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java delete mode 100644 code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index ebbea855..f1f77a70 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -20,13 +20,18 @@ dependencies { 
implementation project(':code:api:index-api') implementation project(':code:common:service-discovery') implementation project(':code:common:service-client') + implementation project(':code:features-crawl:content-type') implementation project(':code:libraries:language-processing') implementation libs.bundles.slf4j implementation libs.notnull + implementation libs.jwarc implementation libs.gson + implementation libs.commons.io + implementation libs.okhttp3 + implementation libs.jsoup implementation libs.snakeyaml implementation libs.zstd diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java similarity index 97% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java index c5860913..d7dfa6d1 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/logic/ContentTypeLogic.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawl.retreival.logic; +package nu.marginalia.crawling.body; import nu.marginalia.model.EdgeUrl; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java new file mode 100644 index 00000000..7bb548e5 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java @@ -0,0 +1,60 @@ +package nu.marginalia.crawling.body; + +import nu.marginalia.contenttype.ContentTypeParser; +import nu.marginalia.contenttype.DocumentBodyToString; +import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import org.apache.commons.io.input.BOMInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.zip.GZIPInputStream; + +public class DocumentBodyExtractor { + private static ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + + private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class); + + public static DocumentBodyResult extractBody(HttpFetchResult result) { + if (result instanceof HttpFetchResult.ResultOk fetchOk) { + return extractBody(fetchOk); + } + else { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.ERROR, ""); + } + } + + public static DocumentBodyResult extractBody(HttpFetchResult.ResultOk rsp) { + try { + var byteStream = rsp.getInputStream(); + + if ("gzip".equals(rsp.header("Content-Encoding"))) { + byteStream = new GZIPInputStream(byteStream); + } + byteStream = new BOMInputStream(byteStream); + + var contentTypeHeader = rsp.header("Content-Type"); + if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } + + byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder + + var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); + if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } + + if 
("Shift_JIS".equalsIgnoreCase(contentType.charset())) { + return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CHARSET, ""); + } + + return new DocumentBodyResult.Ok(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); + } + catch (IOException ex) { + logger.error("Failed to extract body", ex); + return new DocumentBodyResult.Error(CrawlerDocumentStatus.ERROR, ""); + } + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java new file mode 100644 index 00000000..1959f844 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java @@ -0,0 +1,23 @@ +package nu.marginalia.crawling.body; + +import nu.marginalia.crawling.model.CrawlerDocumentStatus; + +import java.util.Optional; +import java.util.function.BiFunction; + +public sealed interface DocumentBodyResult { + record Ok(String contentType, String body) implements DocumentBodyResult { + @Override + public Optional map(BiFunction fun) { + return Optional.of(fun.apply(contentType, body)); + } + } + record Error(CrawlerDocumentStatus status, String why) implements DocumentBodyResult { + @Override + public Optional map(BiFunction fun) { + return Optional.empty(); + } + } + + Optional map(BiFunction fun); +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java similarity index 63% rename from code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java index ae9673b1..9790e3da 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/HttpFetchResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java @@ -1,17 +1,23 @@ -package nu.marginalia.crawl.retreival.fetcher.warc; +package nu.marginalia.crawling.body; import okhttp3.Headers; +import org.jsoup.Jsoup; import org.netpreserve.jwarc.MessageHeaders; import org.netpreserve.jwarc.WarcResponse; import org.netpreserve.jwarc.WarcRevisit; +import org.jsoup.nodes.Document; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URI; import java.util.List; +import java.util.Optional; public sealed interface HttpFetchResult { + + boolean isOk(); + static ResultOk importWarc(WarcResponse response) throws IOException { var http = response.http(); try (var body = http.body()) { @@ -27,6 +33,7 @@ public sealed interface HttpFetchResult { ); } } + static ResultOk importWarc(WarcRevisit revisit) throws IOException { var http = revisit.http(); try (var body = http.body()) { @@ -41,7 +48,11 @@ public sealed interface HttpFetchResult { bytes.length ); } + finally { + revisit.body().consume(); + } } + record ResultOk(URI uri, int statusCode, Headers headers, @@ -50,6 +61,10 @@ public sealed interface HttpFetchResult { int bytesLength ) implements HttpFetchResult { + public boolean isOk() { + return statusCode >= 200 && statusCode < 300; + } + public ResultOk(URI uri, int statusCode, MessageHeaders headers, @@ -73,6 +88,14 @@ public sealed interface HttpFetchResult { return new 
ByteArrayInputStream(bytesRaw, bytesStart, bytesLength); } + public Optional parseDocument() throws IOException { + return switch(DocumentBodyExtractor.extractBody(this)) { + case DocumentBodyResult.Ok ok when "text/html".equalsIgnoreCase(ok.contentType()) + -> Optional.of(Jsoup.parse(ok.body())); + default -> Optional.empty(); + }; + } + public String header(String name) { return headers.get(name); } @@ -82,5 +105,34 @@ public sealed interface HttpFetchResult { }; - record ResultError(Exception ex) implements HttpFetchResult { }; + record ResultRetained(String url, String body) implements HttpFetchResult { + + public boolean isOk() { + return true; + } + + public Optional parseDocument() { + try { + return Optional.of(Jsoup.parse(body)); + } + catch (Exception ex) { + return Optional.empty(); + } + } + }; + record ResultException(Exception ex) implements HttpFetchResult { + public boolean isOk() { + return false; + } + }; + record ResultSame() implements HttpFetchResult { + public boolean isOk() { + return false; + } + }; + record ResultNone() implements HttpFetchResult { + public boolean isOk() { + return false; + } + }; } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index b7021ace..0da0b790 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,156 +1,44 @@ package nu.marginalia.crawling.io; -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.io.format.LegacyFileReadingSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcReadingSerializableCrawlDataStream; import nu.marginalia.model.gson.GsonFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.*; +import java.nio.file.Files; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.TimeUnit; public class CrawledDomainReader { - private final Gson gson = GsonFactory.get(); - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final ForkJoinPool pool = new ForkJoinPool(6); + private static final Gson gson = GsonFactory.get(); public CrawledDomainReader() { } /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! 
*/ - public SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { - return new FileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { + String fileName = fullPath.getFileName().toString(); + if (fileName.endsWith(".zstd")) { + return new LegacyFileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + } + else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) { + return new WarcReadingSerializableCrawlDataStream(fullPath); + } + else { + throw new IllegalArgumentException("Unknown file type: " + fullPath); + } } /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */ - public SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { - return createDataStream(CrawlerOutputFile.getOutputFile(basePath, id, domain)); - } + public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { + Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); - /** Read the entirety of the domain data into memory. This uses a lot of RAM */ - public CrawledDomain read(Path path) throws IOException { - DomainDataAssembler domainData = new DomainDataAssembler(); - - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()), RecyclingBufferPool.INSTANCE)))) { - String line; - while ((line = br.readLine()) != null) { - if (line.startsWith("//")) { - String identifier = line; - String data = br.readLine(); - - pool.execute(() -> deserializeLine(identifier, data, domainData)); - } - } + if (Files.exists(warcPath)) { + return createDataStream(warcPath); } - - while (!pool.awaitQuiescence(1, TimeUnit.SECONDS)); - - return domainData.assemble(); - } - - - private void deserializeLine(String identifier, String data, DomainDataAssembler assembler) { - if (null == data) { - return; - } - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - assembler.acceptDomain(gson.fromJson(data, CrawledDomain.class)); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - assembler.acceptDoc(gson.fromJson(data, CrawledDocument.class)); + else { + return createDataStream(CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); } } - public Optional readOptionally(Path path) { - try { - return Optional.of(read(path)); - } - catch (Exception ex) { - return Optional.empty(); - } - } - - private static class DomainDataAssembler { - private CrawledDomain domainPrototype; - private final List docs = new ArrayList<>(); - - public synchronized void acceptDomain(CrawledDomain domain) { - this.domainPrototype = domain; - } - - public synchronized void acceptDoc(CrawledDocument doc) { - docs.add(doc); - } - - public synchronized CrawledDomain assemble() { - if (!docs.isEmpty()) { - if (domainPrototype.doc == null) - domainPrototype.doc = new ArrayList<>(); - - domainPrototype.doc.addAll(docs); - } - return domainPrototype; - } - } - - private static class FileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - private SerializableCrawlData next = null; - - public FileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - bufferedReader = new 
BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - } - - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - var ret = next; - next = null; - return ret; - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (next != null) - return true; - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDomain.class); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } - else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } - } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java index 0e278f09..f21715ee 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainWriter.java @@ -55,7 +55,7 @@ public class CrawledDomainWriter implements AutoCloseable { } private Path getOutputFile(String id, String name) throws IOException { - return CrawlerOutputFile.createOutputPath(outputDir, id, name); + return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name); } @Override diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index 67e8738c..907eb081 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -9,7 +9,11 @@ import java.nio.file.Path; public class CrawlerOutputFile { /** Return the Path to a file for the given id and name */ - public static Path getOutputFile(Path base, String id, String name) { + public static Path getLegacyOutputFile(Path base, String id, String name) { + if (id.length() < 4) { + id = Strings.repeat("0", 4 - id.length()) + id; + } + String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -19,7 +23,7 @@ public class CrawlerOutputFile { /** Return the Path to a file for the given id and name, creating the prerequisite * directory structure as necessary. 
*/ - public static Path createOutputPath(Path base, String id, String name) throws IOException { + public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException { if (id.length() < 4) { id = Strings.repeat("0", 4 - id.length()) + id; } @@ -49,20 +53,37 @@ public class CrawlerOutputFile { } - public static Path createWarcFile(Path baseDir, String id, String name, WarcFileVersion version) { + public static Path createWarcPath(Path basePath, String id, String domain, WarcFileVersion version) throws IOException { if (id.length() < 4) { id = Strings.repeat("0", 4 - id.length()) + id; } - String fileName = STR."\{id}-\{filesystemSafeName(name)}.zstd\{version.suffix}"; + String first = id.substring(0, 2); + String second = id.substring(2, 4); - return baseDir.resolve(fileName); + Path destDir = basePath.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); + } + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}-\{version.suffix}.warc.gz"); + } + + public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { + if (id.length() < 4) { + id = Strings.repeat("0", 4 - id.length()) + id; + } + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.warc\{version.suffix}"); } public enum WarcFileVersion { - LIVE(".open"), - TEMP(".tmp"), - FINAL(""); + LIVE("open"), + TEMP("tmp"), + FINAL("final"); public final String suffix; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java index 3aecc0fc..9598d002 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java @@ -1,11 +1,13 @@ package nu.marginalia.crawling.io; import nu.marginalia.crawling.model.SerializableCrawlData; +import org.jetbrains.annotations.Nullable; import java.io.IOException; +import java.nio.file.Path; import java.util.Iterator; -/** Closable iterator over serialized crawl data +/** Closable iterator exceptional over serialized crawl data * The data may appear in any order, and the iterator must be closed. 
* * @see CrawledDomainReader @@ -17,6 +19,8 @@ public interface SerializableCrawlDataStream extends AutoCloseable { boolean hasNext() throws IOException; + @Nullable + default Path path() { return null; } // Dummy iterator over nothing static SerializableCrawlDataStream empty() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java new file mode 100644 index 00000000..efff17f3 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java @@ -0,0 +1,70 @@ +package nu.marginalia.crawling.io.format; + +import com.github.luben.zstd.RecyclingBufferPool; +import com.github.luben.zstd.ZstdInputStream; +import com.google.gson.Gson; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; + +import java.io.*; +import java.nio.file.Path; + +public class LegacyFileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private final Gson gson; + private final BufferedReader bufferedReader; + private SerializableCrawlData next = null; + + private final Path path; + public LegacyFileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { + this.gson = gson; + bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); + path = file.toPath(); + } + + @Override + public Path path() { + return path; + } + @Override + public SerializableCrawlData next() throws IOException { + if (hasNext()) { + var ret = next; + next = null; + return ret; + } + throw new IllegalStateException("No more data"); + } + + @Override + public boolean hasNext() throws IOException { + if (next != null) + return true; + + String identifier = bufferedReader.readLine(); + if (identifier == null) { + bufferedReader.close(); + return false; + } + String data = bufferedReader.readLine(); + if (data == null) { + bufferedReader.close(); + return false; + } + + if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDomain.class); + } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { + next = gson.fromJson(data, CrawledDocument.class); + } else { + throw new IllegalStateException("Unknown identifier: " + identifier); + } + return true; + } + + @Override + public void close() throws Exception { + bufferedReader.close(); + } +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java new file mode 100644 index 00000000..9d8d1a63 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java @@ -0,0 +1,156 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import 
nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; +import org.netpreserve.jwarc.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Iterator; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.StringJoiner; + +public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(WarcReadingSerializableCrawlDataStream.class); + + private final WarcReader reader; + private final Iterator backingIterator; + private SerializableCrawlData next = null; + private final Path path; + + public WarcReadingSerializableCrawlDataStream(Path file) throws IOException { + path = file; + reader = new WarcReader(file); + WarcXResponseReference.register(reader); + + backingIterator = reader.iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && next == null) { + var nextRecord = backingIterator.next(); + if (nextRecord instanceof WarcResponse response) { // this also includes WarcXResponseReference + convertResponse(response); + } + else if (nextRecord instanceof Warcinfo warcinfo) { + convertWarcinfo(warcinfo); + } + else if (nextRecord instanceof WarcMetadata metadata) { + convertMetadata(metadata); + } + } + return next != null; + } + + private void convertMetadata(WarcMetadata metadata) { + // Nothing to do here for now + } + + private void convertWarcinfo(Warcinfo warcinfo) throws IOException { + var headers = warcinfo.fields(); + String probeStatus = headers.first("X-WARC-Probe-Status").orElse(""); + String[] parts = probeStatus.split(" ", 2); + + + String domain = headers.first("domain").orElseThrow(() -> new IllegalStateException("Missing domain header")); + String status = parts[0]; + String statusReason = parts.length > 1 ? 
parts[1] : ""; + String ip = headers.first("ip").orElse(""); + + String redirectDomain = null; + if ("REDIRECT".equalsIgnoreCase(status)) { + redirectDomain = statusReason; + } + + // TODO: Fix cookies info somehow + next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, List.of(), List.of()); + } + + private void convertResponse(WarcResponse response) throws IOException { + var http = response.http(); + + if (http.status() != 200) { + return; + } + CrawledDocument document; + + var parsedBody = DocumentBodyExtractor.extractBody(HttpFetchResult.importWarc(response)); + if (parsedBody instanceof DocumentBodyResult.Error error) { + next = new CrawledDocument( + "", + response.targetURI().toString(), + http.contentType().raw(), + response.date().toString(), + http.status(), + error.status().toString(), + error.why(), + headers(http.headers()), + null, + response.payloadDigest().map(WarcDigest::base64).orElse(""), + "", + "", + ""); + } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { + next = new CrawledDocument( + "", + response.targetURI().toString(), + ok.contentType(), + response.date().toString(), + http.status(), + "OK", + "", + headers(http.headers()), + ok.body(), + response.payloadDigest().map(WarcDigest::base64).orElse(""), + "", + "", + ""); + } else { + // unreachable + throw new IllegalStateException("Unknown body type: " + parsedBody); + } + } + + public String headers(MessageHeaders headers) { + StringJoiner ret = new StringJoiner("\n"); + for (var header : headers.map().entrySet()) { + for (var value : header.getValue()) { + ret.add(STR."\{header.getKey()}: \{value}"); + } + } + return ret.toString(); + } + + public void close() throws IOException { + reader.close(); + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + try { + return next; + } + finally { + next = null; + } + } + +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java new file mode 100644 index 00000000..7e02d936 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java @@ -0,0 +1,44 @@ +package org.netpreserve.jwarc; + +import java.io.IOException; +import java.net.URI; + +/** This defines a non-standard extension to WARC for storing old HTTP responses, + * essentially a 'revisit' with a full body, which is not something that is + * expected by the jwarc parser, and goes against the semantics of the revisit + * records a fair bit. + *

+ * An x-response-reference record is a response record with a full body, where + * the data is a reconstructed HTTP response from a previous crawl. + */ +public class WarcXResponseReference extends WarcResponse { + private static final String TYPE_NAME = "x-response-reference"; + + WarcXResponseReference(MessageVersion version, MessageHeaders headers, MessageBody body) { + super(version, headers, body); + } + + public static void register(WarcReader reader) { + reader.registerType(TYPE_NAME, WarcXResponseReference::new); + } + + public static class Builder extends AbstractBuilder { + public Builder(URI targetURI) { + this(targetURI.toString()); + } + + public Builder(String targetURI) { + super(TYPE_NAME); + setHeader("WARC-Target-URI", targetURI); + } + + public Builder body(HttpResponse httpResponse) throws IOException { + return body(MediaType.HTTP_RESPONSE, httpResponse); + } + + @Override + public WarcXResponseReference build() { + return build(WarcXResponseReference::new); + } + } +} diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java index 718dea06..cbb88772 100644 --- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java +++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java @@ -74,23 +74,13 @@ public class CrawlPlan { return count; } + @Deprecated public Iterable domainsIterable() { - final CrawledDomainReader reader = new CrawledDomainReader(); - - return WorkLog.iterableMap(crawl.getLogFile(), - entry -> { - var path = getCrawledFilePath(entry.path()); - if (!Files.exists(path)) { - logger.warn("File not found: {}", path); - return Optional.empty(); - } - return reader.readOptionally(path); - }); + // This is no longer supported + throw new UnsupportedOperationException(); } public Iterable crawlDataIterable(Predicate idPredicate) { - final CrawledDomainReader reader = new CrawledDomainReader(); - return WorkLog.iterableMap(crawl.getLogFile(), entry -> { if (!idPredicate.test(entry.id())) { @@ -105,7 +95,7 @@ public class CrawlPlan { } try { - return Optional.of(reader.createDataStream(path)); + return Optional.of(CrawledDomainReader.createDataStream(path)); } catch (IOException ex) { return Optional.empty(); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 5b5deddc..67b4f7b6 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -79,7 +79,7 @@ public class CrawlingThenConvertingIntegrationTest { List data = new ArrayList<>(); try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index a5d78a1f..f5b5a10e 100644 --- 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -62,7 +62,6 @@ public class CrawlerMain { private final SimpleBlockingThreadPool pool; private final Map processingIds = new ConcurrentHashMap<>(); - private final CrawledDomainReader reader = new CrawledDomainReader(); final AbortMonitor abortMonitor = AbortMonitor.getInstance(); @@ -142,6 +141,7 @@ public class CrawlerMain { public void run(CrawlSpecProvider specProvider, Path outputDir) throws InterruptedException, IOException { heartbeat.start(); + try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log")); AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(specProvider.getDomains()) ) { @@ -213,9 +213,9 @@ public class CrawlerMain { @Override public void run() throws Exception { - Path newWarcFile = CrawlerOutputFile.createWarcFile(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); - Path tempFile = CrawlerOutputFile.createWarcFile(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); - Path finalWarcFile = CrawlerOutputFile.createWarcFile(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); + Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); + Path finalWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); if (Files.exists(newWarcFile)) { Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING); @@ -224,9 +224,8 @@ public class CrawlerMain { Files.deleteIfExists(tempFile); } - try (CrawledDomainWriter writer = new CrawledDomainWriter(outputDir, domain, id); - var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now - var retreiver = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder, writer::accept); + try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now + var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder); CrawlDataReference reference = getReference()) { Thread.currentThread().setName("crawling:" + domain); @@ -234,39 +233,37 @@ public class CrawlerMain { var domainLinks = anchorTagsSource.getAnchorTags(domain); if (Files.exists(tempFile)) { - retreiver.syncAbortedRun(tempFile); + retriever.syncAbortedRun(tempFile); Files.delete(tempFile); } - int size = retreiver.fetch(domainLinks, reference); + int size = retriever.fetch(domainLinks, reference); + + // Delete the reference crawl data if it's not the same as the new one + // (mostly a case when migrating from legacy->warc) + reference.delete(); Files.move(newWarcFile, finalWarcFile, StandardCopyOption.REPLACE_EXISTING); - workLog.setJobToFinished(domain, writer.getOutputFile().toString(), size); + workLog.setJobToFinished(domain, finalWarcFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); } catch (Exception e) { logger.error("Error fetching domain " + domain, e); Files.deleteIfExists(newWarcFile); - if (tempFile != null) { - Files.deleteIfExists(tempFile); - } + Files.deleteIfExists(tempFile); } finally { // We don't need to double-count these; it's also kept int he workLog processingIds.remove(domain); Thread.currentThread().setName("[idle]"); - - // 
FIXME: Remove this when we're done - Files.deleteIfExists(finalWarcFile); } } private CrawlDataReference getReference() { try { - var dataStream = reader.createDataStream(outputDir, domain, id); - return new CrawlDataReference(dataStream); + return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id)); } catch (IOException e) { logger.debug("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 985bfc39..9088ebb4 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -8,6 +8,8 @@ import nu.marginalia.lsh.EasyLSH; import javax.annotation.Nullable; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; /** A reference to a domain that has been crawled before. */ public class CrawlDataReference implements AutoCloseable { @@ -22,6 +24,15 @@ public class CrawlDataReference implements AutoCloseable { this(SerializableCrawlDataStream.empty()); } + /** Delete the associated data from disk, if it exists */ + public void delete() throws IOException { + Path filePath = data.path(); + + if (filePath != null) { + Files.deleteIfExists(filePath); + } + } + @Nullable public CrawledDocument nextDocument() { try { @@ -37,12 +48,10 @@ public class CrawlDataReference implements AutoCloseable { return null; } - public boolean isContentBodySame(CrawledDocument one, CrawledDocument other) { - assert one.documentBody != null; - assert other.documentBody != null; + public boolean isContentBodySame(String one, String other) { - final long contentHashOne = contentHash(one.documentBody); - final long contentHashOther = contentHash(other.documentBody); + final long contentHashOne = contentHash(one); + final long contentHashOther = contentHash(other); return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java index b3ab9ee5..37f84d58 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java @@ -1,6 +1,6 @@ package nu.marginalia.crawl.retreival; -import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import nu.marginalia.model.EdgeUrl; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 30054008..514243ee 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -7,7 +7,7 @@ import lombok.SneakyThrows; import 
nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; -import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; @@ -18,16 +18,15 @@ import nu.marginalia.ip_blocklist.UrlBlocklist; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawlspec.CrawlSpecRecord; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.InetAddress; import java.net.UnknownHostException; import java.nio.file.Path; import java.util.*; -import java.util.function.Consumer; public class CrawlerRetreiver implements AutoCloseable { @@ -36,7 +35,6 @@ public class CrawlerRetreiver implements AutoCloseable { private final HttpFetcher fetcher; private final String domain; - private final Consumer crawledDomainWriter; private static final LinkParser linkParser = new LinkParser(); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); @@ -56,8 +54,7 @@ public class CrawlerRetreiver implements AutoCloseable { public CrawlerRetreiver(HttpFetcher fetcher, DomainProber domainProber, CrawlSpecRecord specs, - WarcRecorder warcRecorder, - Consumer writer) + WarcRecorder warcRecorder) { this.warcRecorder = warcRecorder; this.fetcher = fetcher; @@ -65,11 +62,8 @@ public class CrawlerRetreiver implements AutoCloseable { domain = specs.domain; - crawledDomainWriter = writer; - - crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), Objects.requireNonNullElse(specs.urls, List.of()), specs.crawlDepth); - crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, crawledDomainWriter, this, warcRecorder); + crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder); sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever()); // We must always crawl the index page first, this is assumed when fingerprinting the server @@ -94,32 +88,13 @@ public class CrawlerRetreiver implements AutoCloseable { public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); - return switch (probeResult) { - case DomainProber.ProbeResultOk(EdgeUrl probedUrl) -> crawlDomain(oldCrawlData, probedUrl, domainLinks); - case DomainProber.ProbeResultError(CrawlerDomainStatus status, String desc) -> { - crawledDomainWriter.accept( - CrawledDomain.builder() - .crawlerStatus(status.name()) - .crawlerStatusDesc(desc) - .domain(domain) - .ip(findIp(domain)) - .build() - ); - yield 1; - } - case DomainProber.ProbeResultRedirect(EdgeDomain redirectDomain) -> { - crawledDomainWriter.accept( - CrawledDomain.builder() - .crawlerStatus(CrawlerDomainStatus.REDIRECT.name()) - .crawlerStatusDesc("Redirected to different domain") - .redirectDomain(redirectDomain.toString()) - .domain(domain) - .ip(findIp(domain)) - .build() - ); - yield 1; - } - }; + try { + return crawlDomain(oldCrawlData, probeResult, domainLinks); + } + catch (Exception ex) { + logger.error("Error crawling domain {}", domain, ex); + return 0; + } } public void syncAbortedRun(Path warcFile) { @@ -128,9 +103,21 @@ 
public class CrawlerRetreiver implements AutoCloseable { resync.run(warcFile); } - private int crawlDomain(CrawlDataReference oldCrawlData, EdgeUrl rootUrl, DomainLinks domainLinks) { + private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException { String ip = findIp(domain); + EdgeUrl rootUrl; + + warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); + + if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) { + return 1; + } + else { + rootUrl = ok.probedUrl(); + } + + assert !crawlFrontier.isEmpty(); final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder); @@ -170,7 +157,7 @@ public class CrawlerRetreiver implements AutoCloseable { var top = crawlFrontier.takeNextUrl(); if (!robotsRules.isAllowed(top.toString())) { - crawledDomainWriter.accept(CrawledDocumentFactory.createRobotsError(top)); + warcRecorder.flagAsRobotsTxtError(top); continue; } @@ -193,15 +180,13 @@ public class CrawlerRetreiver implements AutoCloseable { continue; - if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isPresent()) { + if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) { fetchedCount++; } } ret.cookies = fetcher.getCookies(); - crawledDomainWriter.accept(ret); - return fetchedCount; } @@ -216,16 +201,16 @@ public class CrawlerRetreiver implements AutoCloseable { var url = rootUrl.withPathAndParam("/", null); - var maybeSample = fetchUrl(url, delayTimer, DocumentWithReference.empty()).filter(sample -> sample.httpStatus == 200); - if (maybeSample.isEmpty()) + var result = tryDownload(url, delayTimer, ContentTags.empty()); + if (!(result instanceof HttpFetchResult.ResultOk ok)) return; - var sample = maybeSample.get(); - if (sample.documentBody == null) + var optDoc = ok.parseDocument(); + if (optDoc.isEmpty()) return; // Sniff the software based on the sample document - var doc = Jsoup.parse(sample.documentBody); + var doc = optDoc.get(); crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc)); for (var link : doc.getElementsByTag("link")) { @@ -252,41 +237,54 @@ public class CrawlerRetreiver implements AutoCloseable { } } - public Optional fetchWriteAndSleep(EdgeUrl top, + public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { logger.debug("Fetching {}", top); long startTime = System.currentTimeMillis(); - var docOpt = fetchUrl(top, timer, reference); + var contentTags = reference.getContentTags(); + var fetchedDoc = tryDownload(top, timer, contentTags); - if (docOpt.isPresent()) { - var doc = docOpt.get(); - - if (!Objects.equals(doc.recrawlState, CrawlerRevisitor.documentWasRetainedTag) - && reference.isContentBodySame(doc)) - { - // The document didn't change since the last time - doc.recrawlState = CrawlerRevisitor.documentWasSameTag; + if (fetchedDoc instanceof HttpFetchResult.ResultSame) { + var doc = reference.doc(); + if (doc != null) { + warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); + fetchedDoc = new HttpFetchResult.ResultRetained(doc.url, doc.documentBody); } + } - crawledDomainWriter.accept(doc); + try { + if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) { + var docOpt = ok.parseDocument(); + if (docOpt.isPresent()) { + var doc = docOpt.get(); - if (doc.url != null) { - // We may have redirected to a different path - EdgeUrl.parse(doc.url).ifPresent(crawlFrontier::addVisited); + 
crawlFrontier.enqueueLinksFromDocument(top, doc); + crawlFrontier.addVisited(new EdgeUrl(ok.uri())); + } } + else if (fetchedDoc instanceof HttpFetchResult.ResultRetained retained) { + var docOpt = retained.parseDocument(); + if (docOpt.isPresent()) { + var doc = docOpt.get(); - if ("ERROR".equals(doc.crawlerStatus) && doc.httpStatus != 404) { - errorCount++; + crawlFrontier.enqueueLinksFromDocument(top, doc); + EdgeUrl.parse(retained.url()).ifPresent(crawlFrontier::addVisited); + } } - + else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) { + errorCount ++; + } + } + catch (Exception ex) { + logger.error("Error parsing document {}", top, ex); } timer.delay(System.currentTimeMillis() - startTime); - return docOpt; + return fetchedDoc; } private boolean isAllowedProtocol(String proto) { @@ -294,42 +292,11 @@ public class CrawlerRetreiver implements AutoCloseable { || proto.equalsIgnoreCase("https"); } - private Optional fetchUrl(EdgeUrl top, CrawlDelayTimer timer, DocumentWithReference reference) { - try { - var contentTags = reference.getContentTags(); - var fetchedDoc = tryDownload(top, timer, contentTags); - - CrawledDocument doc = reference.replaceOn304(fetchedDoc); - - if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody); - - var parsedDoc = Jsoup.parse(doc.documentBody); - EdgeUrl url = new EdgeUrl(doc.url); - - crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); - findCanonicalUrl(url, parsedDoc) - .ifPresent(canonicalLink -> doc.canonicalUrl = canonicalLink.toString()); - } - - return Optional.of(doc); - } - catch (Exception ex) { - logger.warn("Failed to process document {}", top); - } - - return Optional.empty(); - - } - - @SneakyThrows - private CrawledDocument tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { + private HttpFetchResult tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { for (int i = 0; i < 2; i++) { try { - var doc = fetcher.fetchContent(top, warcRecorder, tags); - doc.recrawlState = "NEW"; - return doc; + return fetcher.fetchContent(top, warcRecorder, tags); } catch (RateLimitException ex) { timer.slowDown(); @@ -339,15 +306,20 @@ public class CrawlerRetreiver implements AutoCloseable { Thread.sleep(delay); } } + catch (Exception ex) { + logger.warn("Failed to fetch {}", top, ex); + return new HttpFetchResult.ResultException(ex); + } } - return CrawledDocumentFactory.createRetryError(top); + return new HttpFetchResult.ResultNone(); } private String createHash(String documentBodyHash) { return hashMethod.hashUnencodedChars(documentBodyHash).toString(); } + // FIXME this does not belong in the crawler private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { baseUrl = baseUrl.domain.toRootUrl(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java index 01bafbe1..1a66c7a5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -1,8 +1,8 @@ package nu.marginalia.crawl.retreival; -import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyExtractor; -import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyResult; -import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; +import 
nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; @@ -87,7 +87,7 @@ public class CrawlerWarcResynchronizer { } private void revisit(WarcRevisit revisit) throws IOException { - if (!WarcRecorder.revisitURI.equals(revisit.profile())) { + if (!WarcRecorder.documentRevisitURN.equals(revisit.profile())) { return; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java index 55f2e633..df070cc5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java @@ -1,6 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; import okhttp3.OkHttpClient; import okhttp3.Request; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java index be815954..70576510 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java @@ -3,12 +3,11 @@ package nu.marginalia.crawl.retreival.fetcher; import com.google.inject.ImplementedBy; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.retreival.RateLimitException; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import java.nio.file.Path; import java.util.List; @ImplementedBy(HttpFetcherImpl.class) @@ -20,7 +19,7 @@ public interface HttpFetcher { FetchResult probeDomain(EdgeUrl url); - CrawledDocument fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException; + HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException; SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 3faffe4a..d7732baa 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -8,30 +8,26 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; -import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyExtractor; -import nu.marginalia.crawl.retreival.fetcher.body.DocumentBodyResult; 
-import nu.marginalia.crawl.retreival.fetcher.socket.*; -import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; -import static nu.marginalia.crawl.retreival.CrawledDocumentFactory.*; +import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory; +import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; -import okhttp3.*; +import okhttp3.ConnectionPool; +import okhttp3.Dispatcher; +import okhttp3.OkHttpClient; +import okhttp3.Request; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import javax.net.ssl.SSLException; import javax.net.ssl.X509TrustManager; -import java.io.EOFException; -import java.io.IOException; -import java.net.*; -import java.nio.charset.IllegalCharsetNameException; -import java.time.LocalDateTime; -import java.util.*; +import java.util.List; +import java.util.Objects; +import java.util.Optional; import java.util.concurrent.TimeUnit; @@ -141,9 +137,9 @@ public class HttpFetcherImpl implements HttpFetcher { @Override @SneakyThrows - public CrawledDocument fetchContent(EdgeUrl url, - WarcRecorder warcRecorder, - ContentTags contentTags) + public HttpFetchResult fetchContent(EdgeUrl url, + WarcRecorder warcRecorder, + ContentTags contentTags) throws RateLimitException { @@ -152,23 +148,21 @@ public class HttpFetcherImpl implements HttpFetcher { if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url); - switch (probeResult) { - case ContentTypeProbeResult.Ok(EdgeUrl redirectUrl) -> { - url = redirectUrl; - } - case ContentTypeProbeResult.BadContentType (String contentType, int statusCode) -> { - return createErrorResponse(url, contentType, statusCode, - CrawlerDocumentStatus.BAD_CONTENT_TYPE, - contentType - ); - } - case ContentTypeProbeResult.Timeout timeout -> { - return createTimeoutErrorRsp(url); - } - case ContentTypeProbeResult.Exception ex -> { - return createErrorFromException(url, ex.ex()); - } - }; + if (probeResult instanceof ContentTypeProbeResult.Ok ok) { + url = ok.resolvedUrl(); + } + else if (probeResult instanceof ContentTypeProbeResult.BadContentType badContentType) { + warcRecorder.flagAsFailedContentTypeProbe(url, badContentType.contentType(), badContentType.statusCode()); + return new HttpFetchResult.ResultNone(); + } + else if (probeResult instanceof ContentTypeProbeResult.BadContentType.Timeout timeout) { + warcRecorder.flagAsTimeout(url); + return new HttpFetchResult.ResultNone(); + } + else if (probeResult instanceof ContentTypeProbeResult.Exception exception) { + warcRecorder.flagAsError(url, exception.ex()); + return new HttpFetchResult.ResultNone(); + } } var getBuilder = new Request.Builder().get(); @@ -181,78 +175,20 @@ public class HttpFetcherImpl implements HttpFetcher { HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build()); - if (result instanceof HttpFetchResult.ResultError err) { - return createErrorFromException(url, err.ex()); - } - else 
if (result instanceof HttpFetchResult.ResultOk ok) { - try { - return extractBody(userAgent, url, ok); + if (result instanceof HttpFetchResult.ResultOk ok) { + if (ok.statusCode() == 429) { + String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000"); + throw new RateLimitException(retryAfter); } - catch (Exception ex) { - return createErrorFromException(url, ex); + if (ok.statusCode() == 304) { + return new HttpFetchResult.ResultSame(); + } + if (ok.statusCode() == 200) { + return ok; } } - else { - throw new IllegalStateException(STR."Unknown result type \{result.getClass()}"); - } - } - private CrawledDocument createErrorFromException(EdgeUrl url, Exception exception) throws RateLimitException { - return switch (exception) { - case RateLimitException rle -> throw rle; - case SocketTimeoutException ex -> createTimeoutErrorRsp(url); - case UnknownHostException ex -> createUnknownHostError(url); - case SocketException ex -> createHardErrorRsp(url, ex); - case ProtocolException ex -> createHardErrorRsp(url, ex); - case IllegalCharsetNameException ex -> createHardErrorRsp(url, ex); - case SSLException ex -> createHardErrorRsp(url, ex); - case EOFException ex -> createHardErrorRsp(url, ex); - default -> { - logger.error("Error during fetching", exception); - yield createHardErrorRsp(url, exception); - } - }; - } - - public static CrawledDocument extractBody(String userAgent, EdgeUrl url, HttpFetchResult.ResultOk rsp) throws IOException, RateLimitException { - - var responseUrl = new EdgeUrl(rsp.uri()); - - if (!Objects.equals(responseUrl.domain, url.domain)) { - return createRedirectResponse(url, rsp, responseUrl); - } - - if (rsp.statusCode() == 429) { - String retryAfter = Objects.requireNonNullElse(rsp.header("Retry-After"), "1000"); - - throw new RateLimitException(retryAfter); - } - - if (!isXRobotsTagsPermitted(rsp.allHeaders("X-Robots-Tag"), userAgent)) { - return CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name()) - .crawlerStatusDesc("X-Robots-Tag") - .url(responseUrl.toString()) - .httpStatus(-1) - .timestamp(LocalDateTime.now().toString()) - .headers(rsp.headers().toString()) - .build(); - } - - return switch(DocumentBodyExtractor.extractBody(rsp)) { - case DocumentBodyResult.Error(CrawlerDocumentStatus status, String why) -> - createErrorResponse(url, rsp, status, why); - case DocumentBodyResult.Ok(String contentType, String body) -> - CrawledDocument.builder() - .crawlerStatus(CrawlerDocumentStatus.OK.name()) - .headers(rsp.headers().toString()) - .contentType(contentType) - .timestamp(LocalDateTime.now().toString()) - .httpStatus(rsp.statusCode()) - .url(responseUrl.toString()) - .documentBody(body) - .build(); - }; + return new HttpFetchResult.ResultNone(); } /** Check X-Robots-Tag header tag to see if we are allowed to index this page. 
@@ -318,17 +254,31 @@ public class HttpFetcherImpl implements HttpFetcher { private Optional fetchRobotsForProto(String proto, WarcRecorder recorder, EdgeDomain domain) { try { var url = new EdgeUrl(proto, domain, null, "/robots.txt", null); - return Optional.of(parseRobotsTxt(fetchContent(url, recorder, ContentTags.empty()))); + + var getBuilder = new Request.Builder().get(); + + getBuilder.url(url.toString()) + .addHeader("Accept-Encoding", "gzip") + .addHeader("User-agent", userAgent); + + HttpFetchResult result = recorder.fetch(client, getBuilder.build()); + + if (result instanceof HttpFetchResult.ResultOk ok) { + return Optional.of(parseRobotsTxt(ok)); + } + else { + return Optional.empty(); + } } catch (Exception ex) { return Optional.empty(); } } - private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { - return robotsParser.parseContent(doc.url, - doc.documentBody.getBytes(), - doc.contentType, + private SimpleRobotRules parseRobotsTxt(HttpFetchResult.ResultOk ok) { + return robotsParser.parseContent(ok.uri().toString(), + ok.bytesRaw(), + ok.header("Content-Type"), userAgent); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java deleted file mode 100644 index 99ae2cae..00000000 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyExtractor.java +++ /dev/null @@ -1,44 +0,0 @@ -package nu.marginalia.crawl.retreival.fetcher.body; - -import nu.marginalia.contenttype.ContentTypeParser; -import nu.marginalia.contenttype.DocumentBodyToString; -import nu.marginalia.crawl.retreival.fetcher.warc.HttpFetchResult; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import org.apache.commons.io.input.BOMInputStream; - -import java.io.IOException; -import java.util.zip.GZIPInputStream; - -public class DocumentBodyExtractor { - private static ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); - - public static DocumentBodyResult extractBody(HttpFetchResult.ResultOk rsp) throws IOException { - var byteStream = rsp.getInputStream(); - - if ("gzip".equals(rsp.header("Content-Encoding"))) { - byteStream = new GZIPInputStream(byteStream); - } - byteStream = new BOMInputStream(byteStream); - - var contentTypeHeader = rsp.header("Content-Type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder - - var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CHARSET, ""); - } - - - return new DocumentBodyResult.Ok(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); - } - -} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java deleted file mode 
100644 index fc5d67ec..00000000 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/body/DocumentBodyResult.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.crawl.retreival.fetcher.body; - -import nu.marginalia.crawling.model.CrawlerDocumentStatus; - -public sealed interface DocumentBodyResult { - record Ok(String contentType, String body) implements DocumentBodyResult { } - record Error(CrawlerDocumentStatus status, String why) implements DocumentBodyResult { } -} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index 3d4b5aaa..b7bb82bd 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -1,6 +1,9 @@ package nu.marginalia.crawl.retreival.fetcher.warc; +import nu.marginalia.crawl.retreival.DomainProber; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import okhttp3.OkHttpClient; import okhttp3.Request; @@ -8,7 +11,6 @@ import org.netpreserve.jwarc.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; @@ -18,9 +20,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.security.NoSuchAlgorithmException; import java.time.Instant; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** Based on JWarc's fetch method, APL 2.0 license *

@@ -29,7 +29,12 @@ import java.util.Map; * be reconstructed. */ public class WarcRecorder implements AutoCloseable { - public static final URI revisitURI = URI.create("urn:marginalia:revisit"); + public static final URI documentRevisitURN = URI.create("urn:marginalia/data/doc/revisit"); + + public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped"); + public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe"); + public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe"); + public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error"); private static final int MAX_TIME = 30_000; private static final int MAX_SIZE = 1024 * 1024 * 10; @@ -37,10 +42,14 @@ public class WarcRecorder implements AutoCloseable { private final Path warcFile; private static final Logger logger = LoggerFactory.getLogger(WarcRecorder.class); - private ThreadLocal bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]); + private final ThreadLocal bufferThreadLocal = ThreadLocal.withInitial(() -> new byte[MAX_SIZE]); private boolean temporaryFile = false; + // Affix a version string in case we need to change the format in the future + // in some way + private final String warcRecorderVersion = "1.0"; + /** * Create a new WarcRecorder that will write to the given file * @@ -48,7 +57,7 @@ public class WarcRecorder implements AutoCloseable { */ public WarcRecorder(Path warcFile) throws IOException { this.warcFile = warcFile; - this.writer = new WarcWriter(this.warcFile); + this.writer = new WarcWriter(warcFile); } /** @@ -170,7 +179,7 @@ public class WarcRecorder implements AutoCloseable { } catch (Exception ex) { logger.warn("Failed to fetch URL {}", uri, ex); - return new HttpFetchResult.ResultError(ex); + return new HttpFetchResult.ResultException(ex); } } @@ -178,55 +187,141 @@ public class WarcRecorder implements AutoCloseable { writer.write(item); } - /** - * Flag the given URL as skipped by the crawler, so that it will not be retried. - * Which URLs were skipped is still important when resynchronizing on the WARC file, - * so that the crawler can avoid re-fetching them. 
- * - * @param url The URL to flag - * @param headers - * @param documentBody - */ - public void flagAsSkipped(EdgeUrl url, String headers, int statusCode, String documentBody) { + private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody) { try { WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); - String header = WarcProtocolReconstructor.getResponseHeader(headers, statusCode); + byte[] bytes = documentBody.getBytes(); + + String fakeHeaders = STR.""" + Content-Type: \{contentType} + Content-Length: \{bytes.length} + Content-Encoding: UTF-8 + """; + + String header = WarcProtocolReconstructor.getResponseHeader(fakeHeaders, statusCode); ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); responseDataBuffer.put(header); responseDigestBuilder.update(header); - try (var inputStream = new ByteArrayInputStream(documentBody.getBytes())) { - int remainingLength; - while ((remainingLength = responseDataBuffer.remaining()) > 0) { - int startPos = responseDataBuffer.pos(); + responseDigestBuilder.update(bytes, bytes.length); + payloadDigestBuilder.update(bytes, bytes.length); + responseDataBuffer.put(bytes, 0, bytes.length); - int n = responseDataBuffer.readFrom(inputStream, remainingLength); - if (n < 0) - break; - - responseDataBuffer.updateDigest(responseDigestBuilder, startPos, n); - responseDataBuffer.updateDigest(payloadDigestBuilder, startPos, n); - } - } - - WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), revisitURI) + WarcXResponseReference reference = new WarcXResponseReference.Builder(url.asURI()) .blockDigest(responseDigestBuilder.build()) .payloadDigest(payloadDigestBuilder.build()) .date(Instant.now()) .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()) .build(); - revisit.http(); // force HTTP header to be parsed before body is consumed so that caller can use it + reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it - writer.write(revisit); + writer.write(reference); } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) { throw new RuntimeException(e); } } + /** + * Flag the given URL as skipped by the crawler, so that it will not be retried. + * Which URLs were skipped is still important when resynchronizing on the WARC file, + * so that the crawler can avoid re-fetching them. + */ + public void flagAsSkipped(EdgeUrl url, String contentType, int statusCode, String documentBody) { + saveOldResponse(url, contentType, statusCode, documentBody); + } + + /** + * Write a reference copy of the given document data. This is used when the crawler provides + * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this + * scenario we want to record the data as it was in the previous crawl, but not re-fetch it. 
+ */ + public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody) { + saveOldResponse(url, contentType, statusCode, documentBody); + } + + public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException { + + Map> fields = new HashMap<>(); + fields.put("ip", List.of(ip)); + fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}")); + fields.put("domain", List.of(domain.toString())); + + switch (result) { + case DomainProber.ProbeResultRedirect redirectDomain: + fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}")); + break; + case DomainProber.ProbeResultError error: + fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}")); + break; + case DomainProber.ProbeResultOk ok: + fields.put("X-WARC-Probe-Status", List.of("OK")); + break; + } + + var warcinfo = new Warcinfo.Builder() + .date(Instant.now()) + .fields(fields) + .recordId(UUID.randomUUID()) + .build(); + + writer.write(warcinfo); + } + + public void flagAsRobotsTxtError(EdgeUrl top) { + try { + WarcRevisit revisit = new WarcRevisit.Builder(top.asURI(), documentRobotsTxtSkippedURN) + .date(Instant.now()) + .build(); + + writer.write(revisit); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) { + try { + WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentBadContentTypeURN) + .date(Instant.now()) + .addHeader("Rejected-Content-Type", contentType) + .addHeader("Http-Status", Integer.toString(status)) + .build(); + + writer.write(revisit); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsError(EdgeUrl url, Exception ex) { + try { + WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentUnspecifiedError) + .date(Instant.now()) + .addHeader("Exception", ex.getClass().getSimpleName()) + .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), "")) + .build(); + + writer.write(revisit); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } + + public void flagAsTimeout(EdgeUrl url) { + try { + WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentProbeTimeout) + .date(Instant.now()) + .build(); + + writer.write(revisit); + } catch (URISyntaxException | IOException e) { + throw new RuntimeException(e); + } + } private class ResponseDataBuffer { private final byte[] data; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index c77af845..70a98310 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -5,15 +5,11 @@ import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDelayTimer; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainCrawlFrontier; -import nu.marginalia.crawl.retreival.CrawledDocumentFactory; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; -import 
nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.EdgeUrl; import org.jsoup.Jsoup; -import java.util.function.Consumer; - /** This class encapsulates the logic for re-visiting a domain that has already been crawled. * We may use information from the previous crawl to inform the next crawl, specifically the * E-Tag and Last-Modified headers. @@ -27,16 +23,13 @@ public class CrawlerRevisitor { private final DomainCrawlFrontier crawlFrontier; - private final Consumer crawledDomainWriter; private final CrawlerRetreiver crawlerRetreiver; private final WarcRecorder warcRecorder; public CrawlerRevisitor(DomainCrawlFrontier crawlFrontier, - Consumer crawledDomainWriter, CrawlerRetreiver crawlerRetreiver, WarcRecorder warcRecorder) { this.crawlFrontier = crawlFrontier; - this.crawledDomainWriter = crawledDomainWriter; this.crawlerRetreiver = crawlerRetreiver; this.warcRecorder = warcRecorder; } @@ -69,7 +62,7 @@ public class CrawlerRevisitor { if (doc.httpStatus != 200) continue; if (!robotsRules.isAllowed(url.toString())) { - crawledDomainWriter.accept(CrawledDocumentFactory.createRobotsError(url)); + warcRecorder.flagAsRobotsTxtError(url); continue; } if (!crawlFrontier.filterLink(url)) @@ -87,7 +80,6 @@ public class CrawlerRevisitor { // fashion to make sure we eventually catch changes over time // and ensure we discover new links - crawledDomainWriter.accept(doc); crawlFrontier.addVisited(url); // Hoover up any links from the document @@ -97,7 +89,7 @@ public class CrawlerRevisitor { } // Add a WARC record so we don't repeat this - warcRecorder.flagAsSkipped(url, doc.headers, doc.httpStatus, doc.documentBody); + warcRecorder.flagAsSkipped(url, doc.contentType, doc.httpStatus, doc.documentBody); continue; } @@ -107,15 +99,14 @@ public class CrawlerRevisitor { // providing etag and last-modified headers, so we can recycle the // document if it hasn't changed without actually downloading it - var fetchedDocOpt = crawlerRetreiver.fetchWriteAndSleep(url, - delayTimer, - new DocumentWithReference(doc, oldCrawlData)); - if (fetchedDocOpt.isEmpty()) continue; + var reference = new DocumentWithReference(doc, oldCrawlData); + var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference); - if (documentWasRetainedTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; - else if (documentWasSameTag.equals(fetchedDocOpt.get().recrawlState)) retained ++; + if (reference.isSame(result)) { + retained++; + } - recrawled ++; + recrawled++; } return recrawled; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index e832541f..03b96760 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -1,12 +1,15 @@ package nu.marginalia.crawl.retreival.revisit; +import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlDataReference; -import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.fetcher.ContentTags; +import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawling.model.CrawledDocument; +import 
nu.marginalia.model.EdgeUrl; import javax.annotation.Nullable; -import java.time.LocalDateTime; public record DocumentWithReference( @Nullable CrawledDocument doc, @@ -18,17 +21,28 @@ public record DocumentWithReference( return emptyInstance; } - public boolean isContentBodySame(CrawledDocument newDoc) { + /** Returns true if the provided document is the same as the reference document, + * or if the result was retained via HTTP 304. + */ + public boolean isSame(HttpFetchResult result) { + if (result instanceof HttpFetchResult.ResultSame) + return true; + if (result instanceof HttpFetchResult.ResultRetained) + return true; + + if (!(result instanceof HttpFetchResult.ResultOk resultOk)) + return false; + if (reference == null) return false; if (doc == null) return false; if (doc.documentBody == null) return false; - if (newDoc.documentBody == null) - return false; - return reference.isContentBodySame(doc, newDoc); + return DocumentBodyExtractor.extractBody(resultOk) + .map((contentType, body) -> reference.isContentBodySame(doc.documentBody, body)) + .orElse(false); } public ContentTags getContentTags() { @@ -60,23 +74,4 @@ public record DocumentWithReference( return doc == null || reference == null; } - /** - * If the provided document has HTTP status 304, and the reference document is provided, - * return the reference document; otherwise return the provided document. - */ - public CrawledDocument replaceOn304(CrawledDocument fetchedDoc) { - - if (doc == null) - return fetchedDoc; - - // HTTP status 304 is NOT MODIFIED, which means the document is the same as it was when - // we fetched it last time. We can recycle the reference document. - if (fetchedDoc.httpStatus != 304) - return fetchedDoc; - - var ret = doc; - ret.recrawlState = CrawlerRevisitor.documentWasRetainedTag; - ret.timestamp = LocalDateTime.now().toString(); - return ret; - } } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index 55f2eebe..4faa2042 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -8,9 +8,7 @@ import okhttp3.Request; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.netpreserve.jwarc.WarcReader; -import org.netpreserve.jwarc.WarcRequest; -import org.netpreserve.jwarc.WarcResponse; +import org.netpreserve.jwarc.*; import java.io.IOException; import java.net.URISyntaxException; @@ -22,6 +20,7 @@ import java.util.Map; import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertInstanceOf; class WarcRecorderTest { Path fileName; @@ -33,7 +32,7 @@ class WarcRecorderTest { .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) .build(); - fileName = Files.createTempFile("test", ".warc.gz"); + fileName = Files.createTempFile("test", ".warc"); client = new WarcRecorder(fileName); } @@ -73,10 +72,7 @@ class WarcRecorderTest { try (var recorder = new WarcRecorder(fileName)) { recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), - """ - Content-type: text/html - X-Cookies: 1 - """, + "text/html", 200, "test"); } @@ -95,5 +91,27 @@ class WarcRecorderTest { new 
GZIPInputStream(Files.newInputStream(fileName)).transferTo(System.out); } + @Test + public void testSaveImport() throws URISyntaxException, IOException { + try (var recorder = new WarcRecorder(fileName)) { + recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), + "text/html", + 200, + "test"); + } + + try (var reader = new WarcReader(fileName)) { + WarcXResponseReference.register(reader); + + for (var record : reader) { + System.out.println(record.type()); + System.out.println(record.getClass().getSimpleName()); + if (record instanceof WarcXResponseReference rsp) { + assertEquals("https://www.marginalia.nu/", rsp.target()); + } + } + } + + } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 2f3076cd..4590dde2 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -4,8 +4,10 @@ import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawl.retreival.logic.ContentTypeLogic; +import nu.marginalia.crawling.body.ContentTypeLogic; import nu.marginalia.model.EdgeUrl; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -33,10 +35,11 @@ class HttpFetcherTest { void fetchUTF8() throws URISyntaxException, RateLimitException, IOException { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); try (var recorder = new WarcRecorder()) { - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); - System.out.println(str.contentType); + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); + if (DocumentBodyExtractor.extractBody(result) instanceof DocumentBodyResult.Ok bodyOk) { + System.out.println(bodyOk.contentType()); + } } - } @Test @@ -44,8 +47,10 @@ class HttpFetcherTest { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); try (var recorder = new WarcRecorder()) { - var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); - System.out.println(str.contentType); + var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); + if (DocumentBodyExtractor.extractBody(result) instanceof DocumentBodyResult.Ok bodyOk) { + System.out.println(bodyOk.contentType()); + } } } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 2a00e6de..b7727022 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -5,6 +5,7 @@ import lombok.SneakyThrows; import 
nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; +import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawlerDocumentStatus; @@ -13,6 +14,7 @@ import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.test.CommonTestData; +import okhttp3.Headers; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; @@ -21,12 +23,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.function.Consumer; +import java.util.*; public class CrawlerMockFetcherTest { @@ -65,9 +62,9 @@ public class CrawlerMockFetcherTest { } - void crawl(CrawlSpecRecord spec, Consumer consumer) throws IOException { + void crawl(CrawlSpecRecord spec) throws IOException { try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder, consumer) + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder) .fetch(); } } @@ -80,9 +77,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html"); registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html"); - crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>()), out::add); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("startrek.website", 10, new ArrayList<>())); } @Test @@ -91,9 +86,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); - crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>()), out::add); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("en.wikipedia.org", 10, new ArrayList<>())); } @Test @@ -104,9 +97,7 @@ public class CrawlerMockFetcherTest { registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html"); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html"); - crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>()), out::add); - - out.forEach(System.out::println); + crawl(new CrawlSpecRecord("community.tt-rss.org", 10, new ArrayList<>())); } class MockFetcher implements HttpFetcher { @@ -126,21 +117,23 @@ public class CrawlerMockFetcherTest { return new FetchResult(FetchResultState.OK, url); } + @SneakyThrows @Override - public CrawledDocument fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { + public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { - return mockData.get(url); - } - else { - return CrawledDocument.builder() - .crawlId("1") - .url(url.toString()) - .contentType("text/html") - .httpStatus(404) - .crawlerStatus(CrawlerDocumentStatus.ERROR.name()) - .build(); + byte[] bodyBytes = 
mockData.get(url).documentBody.getBytes(); + return new HttpFetchResult.ResultOk( + url.asURI(), + 200, + new Headers.Builder().build(), + bodyBytes, + 0, + bodyBytes.length + ); } + + return new HttpFetchResult.ResultNone(); } @Override diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 59bf99f6..286f15f5 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -16,15 +16,14 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; -import org.netpreserve.jwarc.WarcReader; -import org.netpreserve.jwarc.WarcRequest; -import org.netpreserve.jwarc.WarcResponse; +import org.netpreserve.jwarc.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; import java.util.stream.Collectors; +import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -33,6 +32,8 @@ import static org.junit.jupiter.api.Assertions.assertTrue; class CrawlerRetreiverTest { private HttpFetcher httpFetcher; + Path tempFile; + Path tempFile2; @BeforeEach public void setUp() { httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D"); @@ -45,6 +46,15 @@ class CrawlerRetreiverTest { System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); } + @AfterEach + public void tearDown() throws IOException { + if (tempFile != null) { + Files.deleteIfExists(tempFile); + } + if (tempFile2 != null) { + Files.deleteIfExists(tempFile2); + } + } @Test public void testWarcOutput() throws IOException { var specs = CrawlSpecRecord @@ -57,10 +67,8 @@ class CrawlerRetreiverTest { try { tempFile = Files.createTempFile("crawling-process", "warc"); - List data = new ArrayList<>(); - try (var recorder = new WarcRecorder(tempFile)) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } catch (IOException ex) { Assertions.fail(ex); } @@ -93,7 +101,7 @@ class CrawlerRetreiverTest { } } @Test - public void testWithKnownDomains() { + public void testWithKnownDomains() throws IOException { var specs = CrawlSpecRecord .builder() .crawlDepth(5) @@ -103,15 +111,30 @@ class CrawlerRetreiverTest { List data = new ArrayList<>(); - try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); + tempFile = Files.createTempFile("crawling-process", ".warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } catch (IOException ex) { Assertions.fail(ex); } + + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + if (stream.next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } + var fetchedUrls = - data.stream().filter(CrawledDocument.class::isInstance) + data.stream() + 
.peek(System.out::println) + .filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) .map(doc -> doc.url) .collect(Collectors.toSet()); @@ -126,7 +149,7 @@ class CrawlerRetreiverTest { } @Test - public void testEmptySet() { + public void testEmptySet() throws IOException { var specs = CrawlSpecRecord .builder() @@ -135,15 +158,30 @@ class CrawlerRetreiverTest { .urls(List.of()) .build(); + List data = new ArrayList<>(); - try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, data::add).fetch(); + tempFile = Files.createTempFile("crawling-process", ".warc"); + + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } catch (IOException ex) { Assertions.fail(ex); } + + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + if (stream.next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } + + data.stream().filter(CrawledDocument.class::isInstance) .map(CrawledDocument.class::cast) .forEach(doc -> System.out.println(doc.url + "\t" + doc.crawlerStatus + "\t" + doc.httpStatus)); @@ -174,43 +212,70 @@ class CrawlerRetreiverTest { .build(); - Path out = Files.createTempDirectory("crawling-process"); - var writer = new CrawledDomainWriter(out, specs.domain, "idid"); + tempFile = Files.createTempFile("crawling-process", ".warc.gz"); + tempFile2 = Files.createTempFile("crawling-process", ".warc.gz"); + Map, List> data = new HashMap<>(); - try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> { - data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d); - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); - if (Math.random() > 0.5) { - doc.headers = ""; - } - } - writer.accept(d); - }).fetch(); + try (var recorder = new WarcRecorder(tempFile)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } catch (IOException ex) { Assertions.fail(ex); } - - writer.close(); - - var reader = new CrawledDomainReader(); - var stream = reader.createDataStream(out, specs.domain, "idid"); + try (var stream = CrawledDomainReader.createDataStream(tempFile)) { + while (stream.hasNext()) { + var doc = stream.next(); + data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + var stream = CrawledDomainReader.createDataStream(tempFile); CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0); domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList()); - try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder, d -> { - if (d instanceof CrawledDocument doc) { - System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus); - } - }).fetch(new DomainLinks(), new CrawlDataReference(stream)); + try (var recorder = new WarcRecorder(tempFile2)) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(), + new CrawlDataReference(stream)); } catch (IOException ex) { Assertions.fail(ex); } + + new GZIPInputStream(Files.newInputStream(tempFile2)).transferTo(System.out); 
+ + try (var reader = new WarcReader(tempFile2)) { + WarcXResponseReference.register(reader); + + reader.forEach(record -> { + if (record instanceof WarcResponse rsp) { + try { + System.out.println(rsp.type() + ":" + rsp.target() + "/" + rsp.http().status()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + if (record instanceof WarcMetadata rsp) { + System.out.println("meta:" + rsp.target()); + } + }); + } + + try (var ds = CrawledDomainReader.createDataStream(tempFile2)) { + while (ds.hasNext()) { + var doc = ds.next(); + if (doc instanceof CrawledDomain dr) { + System.out.println(dr.domain + "/" + dr.crawlerStatus); + } + else if (doc instanceof CrawledDocument dc) { + System.out.println(dc.url + "/" + dc.crawlerStatus + "/" + dc.httpStatus); + } + } + } catch (Exception e) { + throw new RuntimeException(e); + + } } } \ No newline at end of file diff --git a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java index 0af77acb..353ef965 100644 --- a/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java +++ b/code/services-core/executor-service/src/main/java/nu/marginalia/actor/task/ExportAtagsActor.java @@ -63,8 +63,6 @@ public class ExportAtagsActor extends RecordActorPrototype { Path inputDir = storageService.getStorage(crawlId).asPath(); - var reader = new CrawledDomainReader(); - try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)))); ) { @@ -78,7 +76,7 @@ public class ExportAtagsActor extends RecordActorPrototype { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = reader.createDataStream(crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { exportLinks(tagWriter, stream); } catch (Exception ex) { diff --git a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java index 1a73a952..4322d3fc 100644 --- a/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java +++ b/code/tools/crawl-data-unfcker/src/main/java/nu/marginalia/tools/CrawlDataUnfcker.java @@ -29,13 +29,11 @@ public class CrawlDataUnfcker { return; } - var reader = new CrawledDomainReader(); - try (var wl = new WorkLog(output.resolve("crawler.log"))) { for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) { Path inputPath = input.resolve(inputItem.relPath()); - var domainMaybe = readDomain(reader, inputPath).map(CrawledDomain::getDomain); + var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain); if (domainMaybe.isEmpty()) continue; var domain = domainMaybe.get(); @@ -43,7 +41,7 @@ public class CrawlDataUnfcker { // Generate conformant ID String newId = Integer.toHexString(domain.hashCode()); - var outputPath = CrawlerOutputFile.createOutputPath(output, newId, domain); + var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain); var outputFileName = outputPath.toFile().getName(); System.out.println(inputPath + " -> " + outputPath); @@ -56,13 +54,13 @@ public class CrawlDataUnfcker { } } - static Optional readDomain(CrawledDomainReader reader, Path file) { + static Optional readDomain(Path file) { if (!Files.exists(file)) { 
System.out.println("Missing file " + file); return Optional.empty(); } - try (var stream = reader.createDataStream(file)) { + try (var stream = CrawledDomainReader.createDataStream(file)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDomain domain) { return Optional.of(domain); diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java index 97df4a39..c5751a7a 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -50,10 +50,9 @@ public class ExperimentRunnerMain { experiment.args(Arrays.copyOfRange(args, 2, args.length)); Path basePath = Path.of(args[0]); - var reader = new CrawledDomainReader(); for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) { Path crawlDataPath = basePath.resolve(item.relPath()); - try (var stream = reader.createDataStream(crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { experiment.process(stream); } catch (Exception ex) { diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java index 4e61ffc4..5d7d8d11 100644 --- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java +++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/LegacyExperiment.java @@ -5,12 +5,12 @@ import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import java.io.IOException; -import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; public abstract class LegacyExperiment extends Experiment { public abstract boolean process(CrawledDomain domain); + @Override public boolean process(SerializableCrawlDataStream dataStream) throws IOException { List documentList = new ArrayList<>(); diff --git a/settings.gradle b/settings.gradle index af8a45f5..42ae0f47 100644 --- a/settings.gradle +++ b/settings.gradle @@ -155,7 +155,7 @@ dependencyResolutionManagement { library('duckdb', 'org.duckdb', 'duckdb_jdbc').version('0.9.1') library('okhttp3','com.squareup.okhttp3','okhttp').version('4.11.0') - library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.4') + library('jwarc', 'org.netpreserve', 'jwarc').version('0.28.5') library('httpcomponents.core','org.apache.httpcomponents','httpcore').version('4.4.15') library('httpcomponents.client','org.apache.httpcomponents','httpclient').version('4.5.13') From 787a20cbaaa34d93d7751f8ba5ec3ff296679e89 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 13 Dec 2023 16:22:19 +0100 Subject: [PATCH 11/23] (crawling-model) Implement a parquet format for crawl data This is not hooked into anything yet. The change also makes modifications to the parquet-floor library to support reading and writing of byte[] arrays. This is desirable since we may in the future want to support inputs that are not text-based, and codifying the assumption that each document is a string will definitely cause us grief down the line. 
--- .../crawling-model/build.gradle | 2 + .../parquet/CrawledDocumentParquetRecord.java | 87 +++++++++++++++++++ ...rawledDocumentParquetRecordFileReader.java | 19 ++++ ...rawledDocumentParquetRecordFileWriter.java | 23 +++++ ...edDocumentParquetRecordFileWriterTest.java | 44 ++++++++++ .../blue/strategic/parquet/ParquetReader.java | 7 +- .../blue/strategic/parquet/ParquetWriter.java | 2 +- 7 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java create mode 100644 code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index f1f77a70..03db0de9 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -22,10 +22,12 @@ dependencies { implementation project(':code:common:service-client') implementation project(':code:features-crawl:content-type') implementation project(':code:libraries:language-processing') + implementation project(':third-party:parquet-floor') implementation libs.bundles.slf4j implementation libs.notnull + implementation libs.bundles.parquet implementation libs.jwarc implementation libs.gson diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java new file mode 100644 index 00000000..614be635 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java @@ -0,0 +1,87 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.Dehydrator; +import blue.strategic.parquet.Hydrator; +import blue.strategic.parquet.ValueWriter; +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.NoArgsConstructor; +import lombok.ToString; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.Types; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; + +@AllArgsConstructor +@NoArgsConstructor +@EqualsAndHashCode +@ToString +public class CrawledDocumentParquetRecord { + public String domain; + public String url; + public String ip; + public boolean cookies; + public String contentType; + public byte[] body; + + public static Hydrator newHydrator() { + return new CrawledDocumentParquetRecordHydrator(); + } + + public static Dehydrator newDehydrator() { + return CrawledDocumentParquetRecord::dehydrate; + } + + public static MessageType schema = new MessageType( + CrawledDocumentParquetRecord.class.getSimpleName(), + Types.required(BINARY).as(stringType()).named("domain"), + Types.required(BINARY).as(stringType()).named("url"), + Types.required(BINARY).as(stringType()).named("ip"), + Types.required(BOOLEAN).named("cookies"), + Types.required(BINARY).as(stringType()).named("contentType"), + Types.required(BINARY).named("body") + ); + 
+ + public CrawledDocumentParquetRecord add(String heading, Object value) { + switch (heading) { + case "domain" -> domain = (String) value; + case "url" -> url = (String) value; + case "ip" -> ip = (String) value; + case "cookies" -> cookies = (Boolean) value; + case "contentType" -> contentType = (String) value; + case "body" -> body = (byte[]) value; + default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); + } + return this; + } + + public void dehydrate(ValueWriter valueWriter) { + valueWriter.write("domain", domain); + valueWriter.write("url", url); + valueWriter.write("ip", ip); + valueWriter.write("cookies", cookies); + valueWriter.write("contentType", contentType); + valueWriter.write("body", body); + } +} + +class CrawledDocumentParquetRecordHydrator implements Hydrator { + + @Override + public CrawledDocumentParquetRecord start() { + return new CrawledDocumentParquetRecord(); + } + + @Override + public CrawledDocumentParquetRecord add(CrawledDocumentParquetRecord target, String heading, Object value) { + return target.add(heading, value); + } + + @Override + public CrawledDocumentParquetRecord finish(CrawledDocumentParquetRecord target) { + return target; + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java new file mode 100644 index 00000000..7e8c7501 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java @@ -0,0 +1,19 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.HydratorSupplier; +import blue.strategic.parquet.ParquetReader; +import org.jetbrains.annotations.NotNull; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.stream.Stream; + +public class CrawledDocumentParquetRecordFileReader { + + @NotNull + public static Stream stream(Path path) throws IOException { + return ParquetReader.streamContent(path.toFile(), + HydratorSupplier.constantly(CrawledDocumentParquetRecord.newHydrator())); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java new file mode 100644 index 00000000..f4961c01 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -0,0 +1,23 @@ +package nu.marginalia.crawling.parquet; + +import blue.strategic.parquet.ParquetWriter; + +import java.io.IOException; +import java.nio.file.Path; + +public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { + private final ParquetWriter writer; + + public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException { + writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema, + file.toFile(), CrawledDocumentParquetRecord.newDehydrator()); + } + + public void write(CrawledDocumentParquetRecord domainData) throws IOException { + writer.write(domainData); + } + + public void close() throws IOException { + writer.close(); + } +} diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java 
b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java new file mode 100644 index 00000000..07a27200 --- /dev/null +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -0,0 +1,44 @@ +package nu.marginalia.crawling.parquet; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class CrawledDocumentParquetRecordFileWriterTest { + Path tempFile; + + @BeforeEach + public void setUp() throws IOException { + tempFile = Files.createTempFile("test", ".parquet"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.delete(tempFile); + } + + @Test + void write() throws IOException { + var original = new CrawledDocumentParquetRecord("www.marginalia.nu", + "https://www.marginalia.nu/", + "127.0.0.1", + false, + "text/html", + "hello world".getBytes()); + + try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) { + writer.write(original); + } + + try (var stream = CrawledDocumentParquetRecordFileReader.stream(tempFile)) { + var actual = stream.findFirst().orElseThrow(); + assertEquals(original, actual); + } + } +} \ No newline at end of file diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java index 1ec3e7fb..45718fe8 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetReader.java @@ -13,6 +13,7 @@ import org.apache.parquet.io.DelegatingSeekableInputStream; import org.apache.parquet.io.InputFile; import org.apache.parquet.io.SeekableInputStream; import org.apache.parquet.io.api.GroupConverter; +import org.apache.parquet.schema.LogicalTypeAnnotation; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.PrimitiveType; @@ -144,7 +145,11 @@ public final class ParquetReader implements Spliterator, Closeable { case BINARY: case FIXED_LEN_BYTE_ARRAY: case INT96: - return primitiveType.stringifier().stringify(columnReader.getBinary()); + if (primitiveType.getLogicalTypeAnnotation() == null) { + return columnReader.getBinary().getBytes(); + } else { + return primitiveType.stringifier().stringify(columnReader.getBinary()); + } case BOOLEAN: return columnReader.getBoolean(); case DOUBLE: diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java index 6e53c189..6d9b5734 100644 --- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java +++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java @@ -242,7 +242,7 @@ public final class ParquetWriter implements Closeable { if (type.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) { recordConsumer.addBinary(Binary.fromString((String)value)); } else { - throw new UnsupportedOperationException("We don't support writing logical annotation type " + type.getLogicalTypeAnnotation()); + recordConsumer.addBinary(Binary.fromConstantByteArray((byte[])value)); } break; default: From 1328bc4938b63e9910d2bf9d1eae086ad7811ad5 
Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Dec 2023 16:05:48 +0100 Subject: [PATCH 12/23] (warc) Clean up parquet conversion This commit cleans up the warc->parquet conversion. Records with a http status other than 200 are now included. The commit also fixes a bug where the robots.txt parser would be fed the full HTTP response (and choke), instead of the body. The DocumentBodyExtractor code has also been cleaned up, and now offers a way of just getting the byte[] representation for later processing, as conversion to and from strings is a bit wasteful. --- .../crawling/body/DocumentBodyExtractor.java | 58 ++++++++++++---- .../crawling/body/DocumentBodyResult.java | 30 +++++++-- .../crawling/body/HttpFetchResult.java | 66 +++++++++---------- .../crawling/io/CrawlerOutputFile.java | 40 +++++++---- ...arcReadingSerializableCrawlDataStream.java | 7 +- .../parquet/CrawledDocumentParquetRecord.java | 4 ++ ...rawledDocumentParquetRecordFileWriter.java | 61 +++++++++++++++++ ...edDocumentParquetRecordFileWriterTest.java | 3 + .../java/nu/marginalia/crawl/CrawlerMain.java | 6 +- .../crawl/retreival/CrawlerRetreiver.java | 2 +- .../retreival/CrawlerWarcResynchronizer.java | 33 +++------- .../retreival/fetcher/HttpFetcherImpl.java | 21 +++--- .../warc/WarcProtocolReconstructor.java | 5 +- .../retreival/fetcher/warc/WarcRecorder.java | 3 +- .../revisit/DocumentWithReference.java | 12 ++-- .../retreival/fetcher/WarcRecorderTest.java | 54 +++++++++++---- .../marginalia/crawling/HttpFetcherTest.java | 4 +- .../retreival/CrawlerMockFetcherTest.java | 1 + 18 files changed, 278 insertions(+), 132 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java index 7bb548e5..00ceac86 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java @@ -11,20 +11,54 @@ import java.io.IOException; import java.util.zip.GZIPInputStream; public class DocumentBodyExtractor { - private static ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); + private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class); - public static DocumentBodyResult extractBody(HttpFetchResult result) { - if (result instanceof HttpFetchResult.ResultOk fetchOk) { - return extractBody(fetchOk); + public static DocumentBodyResult asString(HttpFetchResult result) { + if (result instanceof HttpFetchResult.ResultOk ok) { + return asString(ok); } - else { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.ERROR, ""); + else if (result instanceof HttpFetchResult.ResultRetained retained) { + return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body()); + } + + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok"); + } + + public static DocumentBodyResult asBytes(HttpFetchResult result) { + if (result instanceof HttpFetchResult.ResultOk fetchOk) { + return asBytes(fetchOk); + } + else if (result instanceof HttpFetchResult.ResultRetained retained) { + return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes()); + } + + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not 
Ok"); + } + + public static DocumentBodyResult asBytes(HttpFetchResult.ResultOk rsp) { + try { + var byteStream = rsp.getInputStream(); + + if ("gzip".equals(rsp.header("Content-Encoding"))) { + byteStream = new GZIPInputStream(byteStream); + } + byteStream = new BOMInputStream(byteStream); + + var contentTypeHeader = rsp.header("Content-Type"); + + byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder + var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); + + return new DocumentBodyResult.Ok<>(contentType.contentType(), data); + } catch (Exception ex) { + logger.error("Failed to extract body", ex); + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, ""); } } - public static DocumentBodyResult extractBody(HttpFetchResult.ResultOk rsp) { + public static DocumentBodyResult asString(HttpFetchResult.ResultOk rsp) { try { var byteStream = rsp.getInputStream(); @@ -35,25 +69,25 @@ public class DocumentBodyExtractor { var contentTypeHeader = rsp.header("Content-Type"); if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); } byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); } if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return new DocumentBodyResult.Error(CrawlerDocumentStatus.BAD_CHARSET, ""); + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CHARSET, ""); } - return new DocumentBodyResult.Ok(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); + return new DocumentBodyResult.Ok<>(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); } catch (IOException ex) { logger.error("Failed to extract body", ex); - return new DocumentBodyResult.Error(CrawlerDocumentStatus.ERROR, ""); + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, ""); } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java index 1959f844..0f30dc1f 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java @@ -5,19 +5,35 @@ import nu.marginalia.crawling.model.CrawlerDocumentStatus; import java.util.Optional; import java.util.function.BiFunction; -public sealed interface DocumentBodyResult { - record Ok(String contentType, String body) implements DocumentBodyResult { +public sealed interface DocumentBodyResult { + record Ok(String contentType, T body) implements DocumentBodyResult { + @Override - public Optional map(BiFunction fun) { - return Optional.of(fun.apply(contentType, body)); + public Optional mapOpt(BiFunction mapper) { + return Optional.of(mapper.apply(contentType, body)); + } + + @Override + public void ifPresent(ExConsumer consumer) throws Exception { + consumer.accept(contentType, 
body); } } - record Error(CrawlerDocumentStatus status, String why) implements DocumentBodyResult { + record Error(CrawlerDocumentStatus status, String why) implements DocumentBodyResult { @Override - public Optional map(BiFunction fun) { + public Optional mapOpt(BiFunction mapper) { return Optional.empty(); } + + @Override + public void ifPresent(ExConsumer consumer) throws Exception { + } } - Optional map(BiFunction fun); + Optional mapOpt(BiFunction mapper); + + void ifPresent(ExConsumer consumer) throws Exception; + + interface ExConsumer { + void accept(String contentType, T t) throws E; + } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java index 9790e3da..40db21a5 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java @@ -4,12 +4,12 @@ import okhttp3.Headers; import org.jsoup.Jsoup; import org.netpreserve.jwarc.MessageHeaders; import org.netpreserve.jwarc.WarcResponse; -import org.netpreserve.jwarc.WarcRevisit; import org.jsoup.nodes.Document; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.net.InetAddress; import java.net.URI; import java.util.List; import java.util.Optional; @@ -18,44 +18,39 @@ public sealed interface HttpFetchResult { boolean isOk(); - static ResultOk importWarc(WarcResponse response) throws IOException { - var http = response.http(); - try (var body = http.body()) { - byte[] bytes = body.stream().readAllBytes(); + static HttpFetchResult importWarc(WarcResponse response) { + try { + var http = response.http(); - return new ResultOk( - response.targetURI(), - http.status(), - http.headers(), - bytes, - 0, - bytes.length - ); + try (var body = http.body()) { + byte[] bytes = body.stream().readAllBytes(); + + String ipAddress = response + .ipAddress() + .map(InetAddress::getHostAddress) + .orElse(""); + + return new ResultOk( + response.targetURI(), + http.status(), + http.headers(), + ipAddress, + bytes, + 0, + bytes.length + ); + } + } + catch (Exception ex) { + return new ResultException(ex); } } - static ResultOk importWarc(WarcRevisit revisit) throws IOException { - var http = revisit.http(); - try (var body = http.body()) { - byte[] bytes = body.stream().readAllBytes(); - - return new ResultOk( - revisit.targetURI(), - http.status(), - http.headers(), - bytes, - 0, - bytes.length - ); - } - finally { - revisit.body().consume(); - } - } record ResultOk(URI uri, int statusCode, Headers headers, + String ipAddress, byte[] bytesRaw, int bytesStart, int bytesLength @@ -68,10 +63,11 @@ public sealed interface HttpFetchResult { public ResultOk(URI uri, int statusCode, MessageHeaders headers, + String ipAddress, byte[] bytesRaw, int bytesStart, int bytesLength) { - this(uri, statusCode, convertHeaders(headers), bytesRaw, bytesStart, bytesLength); + this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength); } private static Headers convertHeaders(MessageHeaders headers) { @@ -89,8 +85,8 @@ public sealed interface HttpFetchResult { } public Optional parseDocument() throws IOException { - return switch(DocumentBodyExtractor.extractBody(this)) { - case DocumentBodyResult.Ok ok when "text/html".equalsIgnoreCase(ok.contentType()) + return 
switch(DocumentBodyExtractor.asString(this)) { + case DocumentBodyResult.Ok ok when "text/html".equalsIgnoreCase(ok.contentType()) -> Optional.of(Jsoup.parse(ok.body())); default -> Optional.empty(); }; @@ -105,7 +101,7 @@ public sealed interface HttpFetchResult { }; - record ResultRetained(String url, String body) implements HttpFetchResult { + record ResultRetained(String url, String contentType, String body) implements HttpFetchResult { public boolean isOk() { return true; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index 907eb081..2a0029b4 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -10,9 +10,7 @@ public class CrawlerOutputFile { /** Return the Path to a file for the given id and name */ public static Path getLegacyOutputFile(Path base, String id, String name) { - if (id.length() < 4) { - id = Strings.repeat("0", 4 - id.length()) + id; - } + id = padId(id); String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -24,9 +22,7 @@ public class CrawlerOutputFile { /** Return the Path to a file for the given id and name, creating the prerequisite * directory structure as necessary. */ public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException { - if (id.length() < 4) { - id = Strings.repeat("0", 4 - id.length()) + id; - } + id = padId(id); String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -54,9 +50,7 @@ public class CrawlerOutputFile { } public static Path createWarcPath(Path basePath, String id, String domain, WarcFileVersion version) throws IOException { - if (id.length() < 4) { - id = Strings.repeat("0", 4 - id.length()) + id; - } + id = padId(id); String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -68,10 +62,20 @@ public class CrawlerOutputFile { return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}-\{version.suffix}.warc.gz"); } - public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { - if (id.length() < 4) { - id = Strings.repeat("0", 4 - id.length()) + id; + public static Path createParquetPath(Path basePath, String id, String domain) throws IOException { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + if (!Files.exists(destDir)) { + Files.createDirectories(destDir); } + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } + public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { + id = padId(id); String first = id.substring(0, 2); String second = id.substring(2, 4); @@ -80,6 +84,18 @@ public class CrawlerOutputFile { return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.warc\{version.suffix}"); } + /** + * Pads the given ID with leading zeros to ensure it has a length of 4 characters. 
+ */ + private static String padId(String id) { + if (id.length() < 4) { + id = Strings.repeat("0", 4 - id.length()) + id; + } + + return id; + } + + public enum WarcFileVersion { LIVE("open"), TEMP("tmp"), diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java index 9d8d1a63..9c81f0ca 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java @@ -88,10 +88,9 @@ public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, Se if (http.status() != 200) { return; } - CrawledDocument document; - var parsedBody = DocumentBodyExtractor.extractBody(HttpFetchResult.importWarc(response)); - if (parsedBody instanceof DocumentBodyResult.Error error) { + var parsedBody = DocumentBodyExtractor.asString(HttpFetchResult.importWarc(response)); + if (parsedBody instanceof DocumentBodyResult.Error error) { next = new CrawledDocument( "", response.targetURI().toString(), @@ -106,7 +105,7 @@ public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, Se "", "", ""); - } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { + } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { next = new CrawledDocument( "", response.targetURI().toString(), diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java index 614be635..6e0e5a0b 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java @@ -22,6 +22,7 @@ public class CrawledDocumentParquetRecord { public String url; public String ip; public boolean cookies; + public int httpStatus; public String contentType; public byte[] body; @@ -39,6 +40,7 @@ public class CrawledDocumentParquetRecord { Types.required(BINARY).as(stringType()).named("url"), Types.required(BINARY).as(stringType()).named("ip"), Types.required(BOOLEAN).named("cookies"), + Types.required(INT32).named("httpStatus"), Types.required(BINARY).as(stringType()).named("contentType"), Types.required(BINARY).named("body") ); @@ -49,6 +51,7 @@ public class CrawledDocumentParquetRecord { case "domain" -> domain = (String) value; case "url" -> url = (String) value; case "ip" -> ip = (String) value; + case "httpStatus" -> httpStatus = (Integer) value; case "cookies" -> cookies = (Boolean) value; case "contentType" -> contentType = (String) value; case "body" -> body = (byte[]) value; @@ -61,6 +64,7 @@ public class CrawledDocumentParquetRecord { valueWriter.write("domain", domain); valueWriter.write("url", url); valueWriter.write("ip", ip); + valueWriter.write("httpStatus", httpStatus); valueWriter.write("cookies", cookies); valueWriter.write("contentType", contentType); valueWriter.write("body", body); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index f4961c01..d9fea865 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -1,12 +1,37 @@ package nu.marginalia.crawling.parquet; import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; +import nu.marginalia.crawling.body.HttpFetchResult; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRecord; +import org.netpreserve.jwarc.WarcResponse; +import org.netpreserve.jwarc.WarcXResponseReference; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { private final ParquetWriter writer; + private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class); + + public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) throws IOException { + try (var warcReader = new WarcReader(warcInputFile); + var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile) + ) { + WarcXResponseReference.register(warcReader); + + for (var record : warcReader) { + parquetWriter.write(domain, record); + } + } + catch (Exception ex) { + logger.error("Failed to convert WARC file to Parquet", ex); + } + } public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException { writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema, @@ -17,6 +42,42 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { writer.write(domainData); } + public void write(String domain, WarcRecord record) throws IOException { + if (!(record instanceof WarcResponse ref)) { + return; + } + + HttpFetchResult result = HttpFetchResult.importWarc(ref); + if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) { + return; + } + + byte[] bodyBytes; + String contentType; + + var body = DocumentBodyExtractor.asBytes(result); + + if (body instanceof DocumentBodyResult.Ok bodyOk) { + bodyBytes = bodyOk.body(); + contentType = bodyOk.contentType(); + } + else { + bodyBytes = new byte[0]; + contentType = ""; + } + + write(new CrawledDocumentParquetRecord( + domain, + ref.target(), + fetchOk.ipAddress(), + false, // FIXME + fetchOk.statusCode(), + contentType, + bodyBytes) + ); + } + + public void close() throws IOException { writer.close(); } diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index 07a27200..f8661355 100644 --- a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -3,6 +3,7 @@ package nu.marginalia.crawling.parquet; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.netpreserve.jwarc.net.WarcRecorder; 
import java.io.IOException; import java.nio.file.Files; @@ -29,6 +30,7 @@ class CrawledDocumentParquetRecordFileWriterTest { "https://www.marginalia.nu/", "127.0.0.1", false, + 200, "text/html", "hello world".getBytes()); @@ -41,4 +43,5 @@ class CrawledDocumentParquetRecordFileWriterTest { assertEquals(original, actual); } } + } \ No newline at end of file diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index f5b5a10e..5c6241f7 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -19,6 +19,7 @@ import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawlerOutputFile; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.crawlspec.CrawlSpecFileNames; import nu.marginalia.storage.FileStorageService; import nu.marginalia.model.crawlspec.CrawlSpecRecord; @@ -29,7 +30,6 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import nu.marginalia.crawling.io.CrawledDomainWriter; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.util.SimpleBlockingThreadPool; import okhttp3.ConnectionPool; @@ -216,6 +216,7 @@ public class CrawlerMain { Path newWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.LIVE); Path tempFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.TEMP); Path finalWarcFile = CrawlerOutputFile.createWarcPath(outputDir, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + Path parquetFile = CrawlerOutputFile.createParquetPath(outputDir, id, domain); if (Files.exists(newWarcFile)) { Files.move(newWarcFile, tempFile, StandardCopyOption.REPLACE_EXISTING); @@ -245,6 +246,9 @@ public class CrawlerMain { Files.move(newWarcFile, finalWarcFile, StandardCopyOption.REPLACE_EXISTING); + CrawledDocumentParquetRecordFileWriter + .convertWarc(domain, finalWarcFile, parquetFile); + workLog.setJobToFinished(domain, finalWarcFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 514243ee..80d6853b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -251,7 +251,7 @@ public class CrawlerRetreiver implements AutoCloseable { var doc = reference.doc(); if (doc != null) { warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); - fetchedDoc = new HttpFetchResult.ResultRetained(doc.url, doc.documentBody); + fetchedDoc = new HttpFetchResult.ResultRetained(doc.url, doc.contentType, doc.documentBody); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java 
b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java index 1a66c7a5..47b5b2d8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -33,6 +33,8 @@ public class CrawlerWarcResynchronizer { public void run(Path tempFile) { // First pass, enqueue links try (var reader = new WarcReader(tempFile)) { + WarcXResponseReference.register(reader); + for (var item : reader) { accept(item); } @@ -54,8 +56,6 @@ public class CrawlerWarcResynchronizer { try { if (item instanceof WarcResponse rsp) { response(rsp); - } else if (item instanceof WarcRevisit revisit) { - revisit(revisit); } else if (item instanceof WarcRequest req) { request(req); } @@ -76,35 +76,18 @@ public class CrawlerWarcResynchronizer { try { var response = HttpFetchResult.importWarc(rsp); - if (DocumentBodyExtractor.extractBody(response) instanceof DocumentBodyResult.Ok ok) { - var doc = Jsoup.parse(ok.body()); + DocumentBodyExtractor + .asString(response) + .ifPresent((ct, body) -> + { + var doc = Jsoup.parse(body); crawlFrontier.enqueueLinksFromDocument(url, doc); - } + }); } catch (Exception e) { logger.info(STR."Failed to parse response body for \{url}", e); } } - private void revisit(WarcRevisit revisit) throws IOException { - if (!WarcRecorder.documentRevisitURN.equals(revisit.profile())) { - return; - } - - var url = new EdgeUrl(revisit.targetURI()); - - crawlFrontier.addVisited(url); - - try { - var response = HttpFetchResult.importWarc(revisit); - if (DocumentBodyExtractor.extractBody(response) instanceof DocumentBodyResult.Ok ok) { - var doc = Jsoup.parse(ok.body()); - crawlFrontier.enqueueLinksFromDocument(url, doc); - } - } - catch (Exception e) { - logger.info(STR."Failed to parse response body for \{url}", e); - } - } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index d7732baa..f8f11b13 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -11,6 +11,8 @@ import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeR import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; +import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.body.ContentTypeLogic; @@ -263,24 +265,19 @@ public class HttpFetcherImpl implements HttpFetcher { HttpFetchResult result = recorder.fetch(client, getBuilder.build()); - if (result instanceof HttpFetchResult.ResultOk ok) { - return Optional.of(parseRobotsTxt(ok)); - } - else { - return Optional.empty(); - } + return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) -> + robotsParser.parseContent(url.toString(), + body, + contentType, + userAgent) + ); + } catch (Exception ex) { return Optional.empty(); } } 
- private SimpleRobotRules parseRobotsTxt(HttpFetchResult.ResultOk ok) { - return robotsParser.parseContent(ok.uri().toString(), - ok.bytesRaw(), - ok.header("Content-Type"), - userAgent); - } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java index 368bf3c7..2ceb076d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -6,6 +6,8 @@ import okhttp3.Response; import org.apache.commons.lang3.StringUtils; import java.net.URI; +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.Map; import java.util.StringJoiner; @@ -18,7 +20,8 @@ public class WarcProtocolReconstructor { static String getHttpRequestString(Request request, URI uri) { StringBuilder requestStringBuilder = new StringBuilder(); - requestStringBuilder.append(request.method()).append(" ").append(uri.getPath()); + requestStringBuilder.append(request.method()).append(" ").append(URLEncoder.encode(uri.getPath(), StandardCharsets.UTF_8)); + if (uri.getQuery() != null) { requestStringBuilder.append("?").append(uri.getQuery()); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index b7bb82bd..bce1b890 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -29,8 +29,6 @@ import java.util.*; * be reconstructed. 
*/ public class WarcRecorder implements AutoCloseable { - public static final URI documentRevisitURN = URI.create("urn:marginalia/data/doc/revisit"); - public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped"); public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe"); public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe"); @@ -173,6 +171,7 @@ public class WarcRecorder implements AutoCloseable { return new HttpFetchResult.ResultOk(uri, response.code(), response.headers(), + ip, responseDataBuffer.data, dataStart, responseDataBuffer.length() - dataStart); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index 03b96760..31df5e0e 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -1,13 +1,11 @@ package nu.marginalia.crawl.retreival.revisit; -import lombok.SneakyThrows; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.fetcher.ContentTags; -import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.body.DocumentBodyExtractor; +import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.model.EdgeUrl; import javax.annotation.Nullable; @@ -40,9 +38,11 @@ public record DocumentWithReference( if (doc.documentBody == null) return false; - return DocumentBodyExtractor.extractBody(resultOk) - .map((contentType, body) -> reference.isContentBodySame(doc.documentBody, body)) - .orElse(false); + if (!(DocumentBodyExtractor.asString(resultOk) instanceof DocumentBodyResult.Ok bodyOk)) { + return false; + } + + return reference.isContentBodySame(doc.documentBody, bodyOk.body()); } public ContentTags getContentTags() { diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index 4faa2042..e8ba9437 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -2,6 +2,8 @@ package nu.marginalia.crawl.retreival.fetcher; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.model.EdgeUrl; import okhttp3.OkHttpClient; import okhttp3.Request; @@ -20,10 +22,10 @@ import java.util.Map; import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertInstanceOf; class WarcRecorderTest { - Path fileName; + Path fileNameWarc; + Path fileNameParquet; WarcRecorder client; OkHttpClient httpClient; 
@BeforeEach @@ -32,14 +34,16 @@ class WarcRecorderTest { .addNetworkInterceptor(new IpInterceptingNetworkInterceptor()) .build(); - fileName = Files.createTempFile("test", ".warc"); - client = new WarcRecorder(fileName); + fileNameWarc = Files.createTempFile("test", ".warc"); + fileNameParquet = Files.createTempFile("test", ".parquet"); + + client = new WarcRecorder(fileNameWarc); } @AfterEach public void tearDown() throws Exception { client.close(); - Files.delete(fileName); + Files.delete(fileNameWarc); } @Test @@ -49,10 +53,10 @@ class WarcRecorderTest { .addHeader("Accept-Encoding", "gzip") .get().build()); - new GZIPInputStream(Files.newInputStream(fileName)).transferTo(System.out); + new GZIPInputStream(Files.newInputStream(fileNameWarc)).transferTo(System.out); Map sampleData = new HashMap<>(); - try (var warcReader = new WarcReader(fileName)) { + try (var warcReader = new WarcReader(fileNameWarc)) { warcReader.forEach(record -> { if (record instanceof WarcRequest req) { sampleData.put(record.type(), req.target()); @@ -70,14 +74,14 @@ class WarcRecorderTest { @Test public void flagAsSkipped() throws IOException, URISyntaxException { - try (var recorder = new WarcRecorder(fileName)) { + try (var recorder = new WarcRecorder(fileNameWarc)) { recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, "test"); } - try (var reader = new WarcReader(fileName)) { + try (var reader = new WarcReader(fileNameWarc)) { for (var record : reader) { if (record instanceof WarcResponse rsp) { assertEquals("https://www.marginalia.nu/", rsp.target()); @@ -88,19 +92,19 @@ class WarcRecorderTest { } } - new GZIPInputStream(Files.newInputStream(fileName)).transferTo(System.out); + new GZIPInputStream(Files.newInputStream(fileNameWarc)).transferTo(System.out); } @Test public void testSaveImport() throws URISyntaxException, IOException { - try (var recorder = new WarcRecorder(fileName)) { + try (var recorder = new WarcRecorder(fileNameWarc)) { recorder.flagAsSkipped(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, "test"); } - try (var reader = new WarcReader(fileName)) { + try (var reader = new WarcReader(fileNameWarc)) { WarcXResponseReference.register(reader); for (var record : reader) { @@ -114,4 +118,30 @@ class WarcRecorderTest { } + @Test + public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png") + .addHeader("User-agent", "test.marginalia.nu") + .addHeader("Accept-Encoding", "gzip") + .get().build()); + client.close(); + + CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu", fileNameWarc, fileNameParquet); + + var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); + assertEquals(3, urls.size()); + assertEquals("https://www.marginalia.nu/", urls.get(0)); + assertEquals("https://www.marginalia.nu/log/", urls.get(1)); + assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2)); + + } + } \ No newline at end of file diff --git 
a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java index 4590dde2..0873924f 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/HttpFetcherTest.java @@ -36,7 +36,7 @@ class HttpFetcherTest { var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler"); try (var recorder = new WarcRecorder()) { var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty()); - if (DocumentBodyExtractor.extractBody(result) instanceof DocumentBodyResult.Ok bodyOk) { + if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { System.out.println(bodyOk.contentType()); } } @@ -48,7 +48,7 @@ class HttpFetcherTest { try (var recorder = new WarcRecorder()) { var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty()); - if (DocumentBodyExtractor.extractBody(result) instanceof DocumentBodyResult.Ok bodyOk) { + if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) { System.out.println(bodyOk.contentType()); } } diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index b7727022..749b821c 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -127,6 +127,7 @@ public class CrawlerMockFetcherTest { url.asURI(), 200, new Headers.Builder().build(), + "127.0.0.1", bodyBytes, 0, bodyBytes.length From 0889b6d2476d507fcb534ab7dc17d52ddfd9fd8f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 14 Dec 2023 20:39:40 +0100 Subject: [PATCH 13/23] (warc) Clean up parquet conversion This commit further cleans up the warc->parquet conversion. It fixes issues with redirect handling in WarcRecorder, adds support information about redirects and errors due to probe failure. It also refactors the fetch result, body extraction and content type abstractions. 
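As a quick illustration (again not part of the patch) of the content-type abstraction this commit refactors around, here is how the ContentType.parse helper added further down in the diff behaves; the sample inputs are arbitrary.

import nu.marginalia.contenttype.ContentType;

class ContentTypeSketch {
    public static void main(String[] args) {
        // No charset parameter after ';' -> parse() falls back to UTF-8
        ContentType plain = ContentType.parse("text/html");
        System.out.println(plain.contentType());   // text/html
        System.out.println(plain.charset());       // UTF-8

        // is() compares the media type case-insensitively
        System.out.println(plain.is("TEXT/HTML")); // true

        // toString() reconstructs a header-style value
        System.out.println(plain);                 // text/html; charset=UTF-8
    }
}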
--- code/features-crawl/content-type/build.gradle | 1 + .../marginalia/contenttype/ContentType.java | 19 +++ .../nu/marginalia/geoip/GeoIpDictionary.java | 4 +- .../crawling/body/ContentTypeLogic.java | 4 + .../crawling/body/DocumentBodyExtractor.java | 71 ++++------- .../crawling/body/DocumentBodyResult.java | 29 ++++- .../crawling/body/HttpFetchResult.java | 50 ++++++-- .../crawling/io/CrawledDomainReader.java | 16 ++- .../crawling/io/CrawlerOutputFile.java | 9 ++ ...=> LegacySerializableCrawlDataStream.java} | 7 +- .../ParquetSerializableCrawlDataStream.java | 118 ++++++++++++++++++ ...a => WarcSerializableCrawlDataStream.java} | 25 ++-- ...rawledDocumentParquetRecordFileWriter.java | 55 ++++++-- ...edDocumentParquetRecordFileWriterTest.java | 39 +++++- .../converting/processor/DomainProcessor.java | 9 +- .../converting/writer/ConverterWriter.java | 6 +- ...CrawlingThenConvertingIntegrationTest.java | 24 +++- .../java/nu/marginalia/crawl/CrawlerMain.java | 6 +- .../crawl/retreival/CrawlerRetreiver.java | 9 +- .../retreival/fetcher/HttpFetcherImpl.java | 5 +- .../retreival/fetcher/warc/WarcRecorder.java | 22 ++-- .../revisit/DocumentWithReference.java | 4 +- .../retreival/fetcher/WarcRecorderTest.java | 4 - 23 files changed, 403 insertions(+), 133 deletions(-) rename code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/{LegacyFileReadingSerializableCrawlDataStream.java => LegacySerializableCrawlDataStream.java} (86%) create mode 100644 code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java rename code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/{WarcReadingSerializableCrawlDataStream.java => WarcSerializableCrawlDataStream.java} (87%) diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle index 17eaea3f..73a155cb 100644 --- a/code/features-crawl/content-type/build.gradle +++ b/code/features-crawl/content-type/build.gradle @@ -21,6 +21,7 @@ dependencies { testImplementation libs.bundles.slf4j.test implementation libs.jsoup + implementation libs.commons.lang3 testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit diff --git a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java index 374788b4..095497c8 100644 --- a/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java +++ b/code/features-crawl/content-type/src/main/java/nu/marginalia/contenttype/ContentType.java @@ -1,9 +1,28 @@ package nu.marginalia.contenttype; +import org.apache.commons.lang3.StringUtils; + /** Content type and charset of a document * @param contentType The content type, e.g. "text/html" * @param charset The charset, e.g. "UTF-8" */ public record ContentType(String contentType, String charset) { + public static ContentType parse(String contentTypeHeader) { + String[] parts = StringUtils.split(contentTypeHeader, ";", 2); + String contentType = parts[0].trim(); + String charset = parts.length > 1 ? 
parts[1].trim() : "UTF-8"; + return new ContentType(contentType, charset); + } + + public boolean is(String contentType) { + return this.contentType.equalsIgnoreCase(contentType); + } + + public String toString() { + if (charset == null || charset.isBlank()) + return contentType; + + return STR."\{contentType}; charset=\{charset}"; + } } diff --git a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java index 13b982f5..67dd6366 100644 --- a/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java +++ b/code/libraries/geo-ip/src/main/java/nu/marginalia/geoip/GeoIpDictionary.java @@ -37,7 +37,9 @@ public class GeoIpDictionary { throw new RuntimeException(e); } finally { - this.notifyAll(); + synchronized (this) { + this.notifyAll(); + } } }); } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java index d7dfa6d1..d884dbe5 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/ContentTypeLogic.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.model.EdgeUrl; import java.util.List; @@ -37,6 +38,9 @@ public class ContentTypeLogic { return probableBinaryPattern.test(pathLowerCase); } + public boolean isAllowableContentType(ContentType contentType) { + return isAllowableContentType(contentType.contentType()); + } public boolean isAllowableContentType(String contentType) { if (allowAllContentTypes) return true; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java index 00ceac86..a485e5bc 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.contenttype.ContentTypeParser; import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawling.model.CrawlerDocumentStatus; @@ -7,7 +8,6 @@ import org.apache.commons.io.input.BOMInputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; import java.util.zip.GZIPInputStream; public class DocumentBodyExtractor { @@ -15,28 +15,38 @@ public class DocumentBodyExtractor { private static final Logger logger = LoggerFactory.getLogger(DocumentBodyExtractor.class); - public static DocumentBodyResult asString(HttpFetchResult result) { - if (result instanceof HttpFetchResult.ResultOk ok) { - return asString(ok); - } - else if (result instanceof HttpFetchResult.ResultRetained retained) { - return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body()); - } - - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok"); - } - + /** Extract the body from a fetch result as a byte array. 
*/ public static DocumentBodyResult asBytes(HttpFetchResult result) { if (result instanceof HttpFetchResult.ResultOk fetchOk) { return asBytes(fetchOk); } - else if (result instanceof HttpFetchResult.ResultRetained retained) { + else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) { return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes()); } return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok"); } + /** Extract the body from a fetch result as a string. This function performs + * content-type checks to ensure that the content-type is such that this operation + * makes sense. + * + * @see ContentTypeLogic#isAllowableContentType(String) + * */ + public static DocumentBodyResult asString(HttpFetchResult result) { + return asBytes(result).flatMap(DocumentBodyExtractor::toStringResult); + } + + private static DocumentBodyResult toStringResult(ContentType contentType, byte[] bytes) { + if (contentTypeLogic.isAllowableContentType(contentType)) { + return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes)); + } + else { + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } + } + + /** Extract the body from a fetch result as a byte array. */ public static DocumentBodyResult asBytes(HttpFetchResult.ResultOk rsp) { try { var byteStream = rsp.getInputStream(); @@ -51,44 +61,11 @@ public class DocumentBodyExtractor { byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); - return new DocumentBodyResult.Ok<>(contentType.contentType(), data); + return new DocumentBodyResult.Ok<>(contentType, data); } catch (Exception ex) { logger.error("Failed to extract body", ex); return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, ""); } } - public static DocumentBodyResult asString(HttpFetchResult.ResultOk rsp) { - try { - var byteStream = rsp.getInputStream(); - - if ("gzip".equals(rsp.header("Content-Encoding"))) { - byteStream = new GZIPInputStream(byteStream); - } - byteStream = new BOMInputStream(byteStream); - - var contentTypeHeader = rsp.header("Content-Type"); - if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) { - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - byte[] data = byteStream.readAllBytes(); // size is limited by WarcRecorder - - var contentType = ContentTypeParser.parseContentType(contentTypeHeader, data); - if (!contentTypeLogic.isAllowableContentType(contentType.contentType())) { - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); - } - - if ("Shift_JIS".equalsIgnoreCase(contentType.charset())) { - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CHARSET, ""); - } - - return new DocumentBodyResult.Ok<>(contentType.contentType(), DocumentBodyToString.getStringData(contentType, data)); - } - catch (IOException ex) { - logger.error("Failed to extract body", ex); - return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, ""); - } - } - } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java index 0f30dc1f..04e3fedb 100644 --- 
a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyResult.java @@ -1,17 +1,27 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawling.model.CrawlerDocumentStatus; import java.util.Optional; import java.util.function.BiFunction; public sealed interface DocumentBodyResult { - record Ok(String contentType, T body) implements DocumentBodyResult { + record Ok(ContentType contentType, T body) implements DocumentBodyResult { @Override - public Optional mapOpt(BiFunction mapper) { + public Optional mapOpt(BiFunction mapper) { return Optional.of(mapper.apply(contentType, body)); } + @Override + public Optional flatMapOpt(BiFunction> mapper) { + return mapper.apply(contentType, body); + } + + @Override + public DocumentBodyResult flatMap(BiFunction> mapper) { + return mapper.apply(contentType, body); + } @Override public void ifPresent(ExConsumer consumer) throws Exception { @@ -20,20 +30,29 @@ public sealed interface DocumentBodyResult { } record Error(CrawlerDocumentStatus status, String why) implements DocumentBodyResult { @Override - public Optional mapOpt(BiFunction mapper) { + public Optional mapOpt(BiFunction mapper) { return Optional.empty(); } + public Optional flatMapOpt(BiFunction> mapper) { return Optional.empty(); } + + @Override + @SuppressWarnings("unchecked") + public DocumentBodyResult flatMap(BiFunction> mapper) { + return (DocumentBodyResult) this; + } @Override public void ifPresent(ExConsumer consumer) throws Exception { } } - Optional mapOpt(BiFunction mapper); + Optional mapOpt(BiFunction mapper); + Optional flatMapOpt(BiFunction> mapper); + DocumentBodyResult flatMap(BiFunction> mapper); void ifPresent(ExConsumer consumer) throws Exception; interface ExConsumer { - void accept(String contentType, T t) throws E; + void accept(ContentType contentType, T t) throws E; } } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java index 40db21a5..f0db28e8 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/HttpFetchResult.java @@ -1,5 +1,6 @@ package nu.marginalia.crawling.body; +import nu.marginalia.contenttype.ContentType; import okhttp3.Headers; import org.jsoup.Jsoup; import org.netpreserve.jwarc.MessageHeaders; @@ -11,13 +12,15 @@ import java.io.IOException; import java.io.InputStream; import java.net.InetAddress; import java.net.URI; -import java.util.List; import java.util.Optional; +/* FIXME: This interface has a very unfortunate name that is not very descriptive. + */ public sealed interface HttpFetchResult { boolean isOk(); + /** Convert a WarcResponse to a HttpFetchResult */ static HttpFetchResult importWarc(WarcResponse response) { try { var http = response.http(); @@ -47,6 +50,10 @@ public sealed interface HttpFetchResult { } + /** Corresponds to a successful retrieval of a document + * from the remote server. Note that byte[] is only borrowed + * and subsequent calls may overwrite the contents of this buffer. 
+ */ record ResultOk(URI uri, int statusCode, Headers headers, @@ -85,23 +92,29 @@ public sealed interface HttpFetchResult { } public Optional parseDocument() throws IOException { - return switch(DocumentBodyExtractor.asString(this)) { - case DocumentBodyResult.Ok ok when "text/html".equalsIgnoreCase(ok.contentType()) - -> Optional.of(Jsoup.parse(ok.body())); - default -> Optional.empty(); - }; + return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> { + if (contentType.is("text/html")) { + return Optional.of(Jsoup.parse(body)); + } + else { + return Optional.empty(); + } + }); } public String header(String name) { return headers.get(name); } - public List allHeaders(String name) { - return headers.values(name); - } - }; - record ResultRetained(String url, String contentType, String body) implements HttpFetchResult { + + /** This is a special case where the document was not fetched + * because it was already in the database. In this case, we + * replace the original data. + * + * @see Result304Raw for the case where the document has not yet been replaced with the reference data. + */ + record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult { public boolean isOk() { return true; @@ -116,16 +129,29 @@ public sealed interface HttpFetchResult { } } }; + + /** Fetching resulted in an exception */ record ResultException(Exception ex) implements HttpFetchResult { public boolean isOk() { return false; } }; - record ResultSame() implements HttpFetchResult { + + /** Fetching resulted in a HTTP 304, the remote content is identical to + * our reference copy. This will be replaced with a Result304ReplacedWithReference + * at a later stage. + * + * @see Result304ReplacedWithReference + */ + record Result304Raw() implements HttpFetchResult { public boolean isOk() { return false; } }; + + /** No result. This is typically injected at a later stage + * of processing, e.g. after filtering out irrelevant responses. 
+ */ record ResultNone() implements HttpFetchResult { public boolean isOk() { return false; diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java index 0da0b790..eb7ffd75 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,8 +1,9 @@ package nu.marginalia.crawling.io; import com.google.gson.Gson; -import nu.marginalia.crawling.io.format.LegacyFileReadingSerializableCrawlDataStream; -import nu.marginalia.crawling.io.format.WarcReadingSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.LegacySerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.model.gson.GsonFactory; import java.io.*; @@ -19,10 +20,13 @@ public class CrawledDomainReader { public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { String fileName = fullPath.getFileName().toString(); if (fileName.endsWith(".zstd")) { - return new LegacyFileReadingSerializableCrawlDataStream(gson, fullPath.toFile()); + return new LegacySerializableCrawlDataStream(gson, fullPath.toFile()); } else if (fileName.endsWith(".warc") || fileName.endsWith(".warc.gz")) { - return new WarcReadingSerializableCrawlDataStream(fullPath); + return new WarcSerializableCrawlDataStream(fullPath); + } + else if (fileName.endsWith(".parquet")) { + return new ParquetSerializableCrawlDataStream(fullPath); } else { throw new IllegalArgumentException("Unknown file type: " + fullPath); @@ -31,8 +35,12 @@ public class CrawledDomainReader { /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! 
*/ public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { + Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); Path warcPath = CrawlerOutputFile.getWarcPath(basePath, id, domain, CrawlerOutputFile.WarcFileVersion.FINAL); + if (Files.exists(parquetPath)) { + return createDataStream(parquetPath); + } if (Files.exists(warcPath)) { return createDataStream(warcPath); } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index 2a0029b4..ad6b4358 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -74,6 +74,15 @@ public class CrawlerOutputFile { } return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); } + public static Path getParquetPath(Path basePath, String id, String domain) { + id = padId(id); + + String first = id.substring(0, 2); + String second = id.substring(2, 4); + + Path destDir = basePath.resolve(first).resolve(second); + return destDir.resolve(STR."\{id}-\{filesystemSafeName(domain)}.parquet"); + } public static Path getWarcPath(Path basePath, String id, String domain, WarcFileVersion version) { id = padId(id); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java similarity index 86% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java index efff17f3..bfd52b78 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacyFileReadingSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/LegacySerializableCrawlDataStream.java @@ -11,13 +11,16 @@ import nu.marginalia.crawling.model.SerializableCrawlData; import java.io.*; import java.nio.file.Path; -public class LegacyFileReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { +/** This class is used to read the old format of crawl data, which was zstd-compressed JSON + * with type delimiters between records. 
+ */ +public class LegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { private final Gson gson; private final BufferedReader bufferedReader; private SerializableCrawlData next = null; private final Path path; - public LegacyFileReadingSerializableCrawlDataStream(Gson gson, File file) throws IOException { + public LegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { this.gson = gson; bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); path = file.toPath(); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java new file mode 100644 index 00000000..0b852e01 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -0,0 +1,118 @@ +package nu.marginalia.crawling.io.format; + +import lombok.SneakyThrows; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.CrawlerDomainStatus; +import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.model.EdgeUrl; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URISyntaxException; +import java.nio.file.Path; +import java.util.*; + +public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); + + private final Iterator backingIterator; + private Deque nextQ = new ArrayDeque<>(); + private boolean wroteDomainRecord = false; + private final Path path; + + public ParquetSerializableCrawlDataStream(Path file) throws IOException { + path = file; + + backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); + } + + @Override + public Path path() { + return path; + } + + @Override + @SneakyThrows + public boolean hasNext() { + while (backingIterator.hasNext() && nextQ.isEmpty()) { + var nextRecord = backingIterator.next(); + if (!wroteDomainRecord) { + createDomainRecord(nextRecord); + wroteDomainRecord = true; + } + createDocumentRecord(nextRecord); + } + return !nextQ.isEmpty(); + } + + private void createDomainRecord(CrawledDocumentParquetRecord parquetRecord) throws URISyntaxException { + + CrawlerDomainStatus status = CrawlerDomainStatus.OK; + String statusReason = ""; + + String redirectDomain = null; + if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redir")) { + EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); + redirectDomain = crawledUrl.getDomain().toString(); + status = CrawlerDomainStatus.REDIRECT; + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) { + status = CrawlerDomainStatus.BLOCKED; // FIXME we don't write this yet + } + else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) { + status = 
CrawlerDomainStatus.ERROR; + statusReason = new String(parquetRecord.body); + } + + // FIXME -- cookies + nextQ.add(new CrawledDomain( + parquetRecord.domain, + redirectDomain, + status.toString(), + statusReason, + parquetRecord.ip, + new ArrayList<>(), + new ArrayList<>() + )); + } + + private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { + String bodyString = DocumentBodyToString.getStringData( + ContentType.parse(nextRecord.contentType), + nextRecord.body); + + // FIXME -- a lot of these fields are not set properly! + nextQ.add(new CrawledDocument("", + nextRecord.url, + nextRecord.contentType, + "", + nextRecord.httpStatus, + "OK", + "", + "", + bodyString, + "", + nextRecord.url, + null, + "")); + } + + public void close() throws IOException { + } + + @Override + public SerializableCrawlData next() throws IOException { + if (!hasNext()) + throw new NoSuchElementException(); + + return nextQ.poll(); + } + +} diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java similarity index 87% rename from code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java rename to code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java index 9c81f0ca..a766a58d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcReadingSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -14,20 +14,17 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; -import java.util.Iterator; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.StringJoiner; +import java.util.*; -public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private static final Logger logger = LoggerFactory.getLogger(WarcReadingSerializableCrawlDataStream.class); +public class WarcSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { + private static final Logger logger = LoggerFactory.getLogger(WarcSerializableCrawlDataStream.class); private final WarcReader reader; private final Iterator backingIterator; private SerializableCrawlData next = null; private final Path path; - public WarcReadingSerializableCrawlDataStream(Path file) throws IOException { + public WarcSerializableCrawlDataStream(Path file) throws IOException { path = file; reader = new WarcReader(file); WarcXResponseReference.register(reader); @@ -51,17 +48,10 @@ public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, Se else if (nextRecord instanceof Warcinfo warcinfo) { convertWarcinfo(warcinfo); } - else if (nextRecord instanceof WarcMetadata metadata) { - convertMetadata(metadata); - } } return next != null; } - private void convertMetadata(WarcMetadata metadata) { - // Nothing to do here for now - } - private void convertWarcinfo(Warcinfo warcinfo) throws IOException { var headers = warcinfo.fields(); String probeStatus = headers.first("X-WARC-Probe-Status").orElse(""); @@ -79,7 +69,10 @@ public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, Se } // TODO: Fix cookies info somehow - next = 
new CrawledDomain(domain, redirectDomain, status, statusReason, ip, List.of(), List.of()); + next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, + new ArrayList<>(), + new ArrayList<>() + ); } private void convertResponse(WarcResponse response) throws IOException { @@ -109,7 +102,7 @@ public class WarcReadingSerializableCrawlDataStream implements AutoCloseable, Se next = new CrawledDocument( "", response.targetURI().toString(), - ok.contentType(), + ok.contentType().toString(), response.date().toString(), http.status(), "OK", diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index d9fea865..60e0178e 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -4,10 +4,7 @@ import blue.strategic.parquet.ParquetWriter; import nu.marginalia.crawling.body.DocumentBodyExtractor; import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; -import org.netpreserve.jwarc.WarcReader; -import org.netpreserve.jwarc.WarcRecord; -import org.netpreserve.jwarc.WarcResponse; -import org.netpreserve.jwarc.WarcXResponseReference; +import org.netpreserve.jwarc.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -25,7 +22,16 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { WarcXResponseReference.register(warcReader); for (var record : warcReader) { - parquetWriter.write(domain, record); + if (record instanceof WarcResponse response) { + parquetWriter.write(domain, response); + } + else if (record instanceof Warcinfo warcinfo) { + parquetWriter.write(domain, warcinfo); + } + else { + logger.warn("Skipping record of type {}", record.type()); + } + } } catch (Exception ex) { @@ -33,6 +39,34 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } + private void write(String domain, Warcinfo warcinfo) throws IOException { + String selfDomain = warcinfo.fields().first("domain").orElse(""); + String ip = warcinfo.fields().first("ip").orElse(""); + String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse(""); + + if (probeStatus.startsWith("REDIRECT")) { + String redirectDomain = probeStatus.substring("REDIRECT;".length()); + write(new CrawledDocumentParquetRecord(selfDomain, + STR."https://\{redirectDomain}/", + ip, + false, + 0, + "x-marginalia/advisory;state=redirect", + new byte[0] + )); + } + else if (!"OK".equals(probeStatus)) { + write(new CrawledDocumentParquetRecord(selfDomain, + STR."https://\{domain}/", + ip, + false, + 0, + "x-marginalia/advisory;state=error", + probeStatus.getBytes() + )); + } + } + public CrawledDocumentParquetRecordFileWriter(Path file) throws IOException { writer = ParquetWriter.writeFile(CrawledDocumentParquetRecord.schema, file.toFile(), CrawledDocumentParquetRecord.newDehydrator()); @@ -42,12 +76,9 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { writer.write(domainData); } - public void write(String domain, WarcRecord record) throws IOException { - if (!(record instanceof WarcResponse ref)) { - return; - } + public void write(String domain, WarcResponse response) throws 
IOException { - HttpFetchResult result = HttpFetchResult.importWarc(ref); + HttpFetchResult result = HttpFetchResult.importWarc(response); if (!(result instanceof HttpFetchResult.ResultOk fetchOk)) { return; } @@ -59,7 +90,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { if (body instanceof DocumentBodyResult.Ok bodyOk) { bodyBytes = bodyOk.body(); - contentType = bodyOk.contentType(); + contentType = bodyOk.contentType().toString(); } else { bodyBytes = new byte[0]; @@ -68,7 +99,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { write(new CrawledDocumentParquetRecord( domain, - ref.target(), + response.target(), fetchOk.ipAddress(), false, // FIXME fetchOk.statusCode(), diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index f8661355..15338de1 100644 --- a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -1,5 +1,9 @@ package nu.marginalia.crawling.parquet; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -8,6 +12,7 @@ import org.netpreserve.jwarc.net.WarcRecorder; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; import static org.junit.jupiter.api.Assertions.*; @@ -25,7 +30,7 @@ class CrawledDocumentParquetRecordFileWriterTest { } @Test - void write() throws IOException { + void testWriteRead() throws IOException { var original = new CrawledDocumentParquetRecord("www.marginalia.nu", "https://www.marginalia.nu/", "127.0.0.1", @@ -38,10 +43,36 @@ class CrawledDocumentParquetRecordFileWriterTest { writer.write(original); } - try (var stream = CrawledDocumentParquetRecordFileReader.stream(tempFile)) { - var actual = stream.findFirst().orElseThrow(); - assertEquals(original, actual); + var items = new ArrayList(); + + try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) { + while (stream.hasNext()) { + items.add(stream.next()); + } } + + assertEquals(2, items.size()); + + var firstItem = items.get(0); + assertInstanceOf(CrawledDomain.class, firstItem); + var domain = (CrawledDomain) firstItem; + assertEquals("www.marginalia.nu", domain.domain); + assertNull(domain.redirectDomain); + assertEquals("OK", domain.crawlerStatus); + assertEquals("", domain.crawlerStatusDesc); + assertEquals(new ArrayList<>(), domain.doc); + assertEquals(new ArrayList<>(), domain.cookies); + + var secondItem = items.get(1); + assertInstanceOf(CrawledDocument.class, secondItem); + + var document = (CrawledDocument) secondItem; + assertEquals("https://www.marginalia.nu/", document.url); + assertEquals("text/html", document.contentType); + assertEquals("hello world", document.documentBody); + assertEquals(200, document.httpStatus); + assertEquals("https://www.marginalia.nu/", document.canonicalUrl); } + } \ No newline at end of file diff 
--git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index fea8f69a..2f0fc690 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -18,6 +18,7 @@ import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.model.crawl.HtmlFeature; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -53,9 +54,15 @@ public class DomainProcessor { } @SneakyThrows + @Nullable public ProcessedDomain process(SerializableCrawlDataStream dataStream) { + if (!dataStream.hasNext()) { + return null; + } + var ret = new ProcessedDomain(); List docs = new ArrayList<>(); + Set processedUrls = new HashSet<>(); boolean cookies = false; String ip = ""; @@ -90,7 +97,7 @@ public class DomainProcessor { } else if (data instanceof CrawledDocument doc) { try { - if (doc.url == null) + if (doc.url == null || processedUrls.add(doc.url)) continue; fixBadCanonicalTag(doc); diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java index 1ca66ed6..3069c5ed 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/writer/ConverterWriter.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.writer; import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.worklog.BatchingWorkLog; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -41,7 +42,10 @@ public class ConverterWriter implements AutoCloseable { } @SneakyThrows - public void accept(ProcessedDomain domain) { + public void accept(@Nullable ProcessedDomain domain) { + if (null == domain) + return; + domainData.put(domain); } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 67b4f7b6..844062bb 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -11,16 +11,16 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.BeforeEach; -import 
org.junit.jupiter.api.Tag; -import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.*; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.ArrayList; import java.util.List; @@ -30,6 +30,8 @@ public class CrawlingThenConvertingIntegrationTest { private DomainProcessor domainProcessor; private HttpFetcher httpFetcher; + private Path fileName; + @SneakyThrows @BeforeAll public static void setUpAll() { @@ -46,6 +48,12 @@ public class CrawlingThenConvertingIntegrationTest { domainProcessor = injector.getInstance(DomainProcessor.class); httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); + this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(fileName); } @Test @@ -78,10 +86,16 @@ public class CrawlingThenConvertingIntegrationTest { private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException { List data = new ArrayList<>(); - try (var recorder = new WarcRecorder()) { + try (var recorder = new WarcRecorder(fileName)) { new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } + try (var reader = new WarcSerializableCrawlDataStream(fileName)) { + while (reader.hasNext()) { + data.add(reader.next()); + } + } + CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); return domain; diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 5c6241f7..658acfbe 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -244,12 +244,10 @@ public class CrawlerMain { // (mostly a case when migrating from legacy->warc) reference.delete(); - Files.move(newWarcFile, finalWarcFile, StandardCopyOption.REPLACE_EXISTING); - CrawledDocumentParquetRecordFileWriter - .convertWarc(domain, finalWarcFile, parquetFile); + .convertWarc(domain, newWarcFile, parquetFile); - workLog.setJobToFinished(domain, finalWarcFile.toString(), size); + workLog.setJobToFinished(domain, parquetFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); logger.info("Fetched {}", domain); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 80d6853b..668f597a 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -5,6 +5,7 @@ import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; +import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawling.body.HttpFetchResult; @@ -247,11 +248,13 @@ public class CrawlerRetreiver implements AutoCloseable { var contentTags = reference.getContentTags(); 
var fetchedDoc = tryDownload(top, timer, contentTags); - if (fetchedDoc instanceof HttpFetchResult.ResultSame) { + if (fetchedDoc instanceof HttpFetchResult.Result304Raw) { var doc = reference.doc(); if (doc != null) { warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); - fetchedDoc = new HttpFetchResult.ResultRetained(doc.url, doc.contentType, doc.documentBody); + fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, + new ContentType(doc.contentType, "UTF-8"), + doc.documentBody); } } @@ -265,7 +268,7 @@ public class CrawlerRetreiver implements AutoCloseable { crawlFrontier.addVisited(new EdgeUrl(ok.uri())); } } - else if (fetchedDoc instanceof HttpFetchResult.ResultRetained retained) { + else if (fetchedDoc instanceof HttpFetchResult.Result304ReplacedWithReference retained) { var docOpt = retained.parseDocument(); if (docOpt.isPresent()) { var doc = docOpt.get(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index f8f11b13..cc4a195d 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -12,7 +12,6 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; import nu.marginalia.crawling.body.DocumentBodyExtractor; -import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.body.ContentTypeLogic; @@ -183,7 +182,7 @@ public class HttpFetcherImpl implements HttpFetcher { throw new RateLimitException(retryAfter); } if (ok.statusCode() == 304) { - return new HttpFetchResult.ResultSame(); + return new HttpFetchResult.Result304Raw(); } if (ok.statusCode() == 200) { return ok; @@ -268,7 +267,7 @@ public class HttpFetcherImpl implements HttpFetcher { return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) -> robotsParser.parseContent(url.toString(), body, - contentType, + contentType.toString(), userAgent) ); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index bce1b890..a1335eb8 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -69,8 +69,12 @@ public class WarcRecorder implements AutoCloseable { temporaryFile = true; } - public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException { - URI uri = request.url().uri(); + public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException, + IOException, + URISyntaxException, + InterruptedException + { + URI requestUri = request.url().uri(); WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder 
payloadDigestBuilder = new WarcDigestBuilder(); @@ -133,7 +137,11 @@ public class WarcRecorder implements AutoCloseable { } } - WarcResponse.Builder responseBuilder = new WarcResponse.Builder(uri) + // It looks like this might be the same as requestUri, but it's not; + // it's the URI after resolving redirects. + final URI responseUri = response.request().url().uri(); + + WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) .blockDigest(responseDigestBuilder.build()) .date(date) .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); @@ -155,11 +163,11 @@ public class WarcRecorder implements AutoCloseable { WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder(); - String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), uri); + String httpRequestString = WarcProtocolReconstructor.getHttpRequestString(response.request(), requestUri); requestDigestBuilder.update(httpRequestString); - WarcRequest warcRequest = new WarcRequest.Builder(uri) + WarcRequest warcRequest = new WarcRequest.Builder(requestUri) .blockDigest(requestDigestBuilder.build()) .date(date) .body(MediaType.HTTP_REQUEST, httpRequestString.getBytes()) @@ -168,7 +176,7 @@ public class WarcRecorder implements AutoCloseable { warcRequest.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(warcRequest); - return new HttpFetchResult.ResultOk(uri, + return new HttpFetchResult.ResultOk(responseUri, response.code(), response.headers(), ip, @@ -177,7 +185,7 @@ public class WarcRecorder implements AutoCloseable { responseDataBuffer.length() - dataStart); } catch (Exception ex) { - logger.warn("Failed to fetch URL {}", uri, ex); + logger.warn("Failed to fetch URL {}", requestUri, ex); return new HttpFetchResult.ResultException(ex); } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index 31df5e0e..a0559aec 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -23,9 +23,9 @@ public record DocumentWithReference( * or if the result was retained via HTTP 304. 
*/ public boolean isSame(HttpFetchResult result) { - if (result instanceof HttpFetchResult.ResultSame) + if (result instanceof HttpFetchResult.Result304Raw) return true; - if (result instanceof HttpFetchResult.ResultRetained) + if (result instanceof HttpFetchResult.Result304ReplacedWithReference) return true; if (!(result instanceof HttpFetchResult.ResultOk resultOk)) diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index e8ba9437..0375f5cb 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -53,8 +53,6 @@ class WarcRecorderTest { .addHeader("Accept-Encoding", "gzip") .get().build()); - new GZIPInputStream(Files.newInputStream(fileNameWarc)).transferTo(System.out); - Map sampleData = new HashMap<>(); try (var warcReader = new WarcReader(fileNameWarc)) { warcReader.forEach(record -> { @@ -91,8 +89,6 @@ class WarcRecorderTest { } } } - - new GZIPInputStream(Files.newInputStream(fileNameWarc)).transferTo(System.out); } @Test From 9fea22b90d3ffc1c5d4431f628e1cc1863ee21ed Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 15:38:23 +0100 Subject: [PATCH 14/23] (warc) Further tidying This commit includes mostly exception handling, error propagation, a few bug fixes and minor changes to log formatting. The CrawlDelayTimer, HTTP 429 responses and IOException responses are now more accurately handled. A non-standard WarcXEntityRefused WARC record has also been introduced, essentially acting as a rejected 'response' with different semantics. Besides these, several existing features have been refined, such as URL encoding, crawl depth incrementing and usage of Content-Length headers. 
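To make the new record type concrete, a minimal sketch of writing and reading back a WarcXEntityRefused record, using the Builder and profile URNs introduced below. The channel-based WarcWriter construction, the explicit date, and the example URL are assumptions based on ordinary jwarc usage, not code from this patch.

    import org.netpreserve.jwarc.WarcReader;
    import org.netpreserve.jwarc.WarcWriter;
    import org.netpreserve.jwarc.WarcXEntityRefused;

    import java.net.URI;
    import java.nio.channels.FileChannel;
    import java.nio.file.Path;
    import java.nio.file.StandardOpenOption;
    import java.time.Instant;

    class RefusalRecordSketch {
        public static void main(String[] args) throws Exception {
            Path warcFile = Path.of("refusals.warc");

            try (var channel = FileChannel.open(warcFile,
                    StandardOpenOption.CREATE, StandardOpenOption.WRITE);
                 var writer = new WarcWriter(channel)) {
                // Mark a document as skipped because its content type failed the probe
                writer.write(new WarcXEntityRefused.Builder(
                        URI.create("https://www.example.com/file.bin"),
                        WarcXEntityRefused.documentBadContentTypeURN)
                        .date(Instant.now())
                        .build());
            }

            try (var reader = new WarcReader(warcFile)) {
                WarcXEntityRefused.register(reader); // teach the reader the custom record type
                for (var record : reader) {
                    if (record instanceof WarcXEntityRefused refused) {
                        System.out.println(refused.target() + " refused: " + refused.profile());
                    }
                }
            }
        }
    }

When the WARC data is converted to parquet, these refusal records are mapped to x-marginalia/advisory content types, as the writer changes below show.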
--- .../WarcSerializableCrawlDataStream.java | 1 + ...rawledDocumentParquetRecordFileWriter.java | 101 ++++++++++++++---- .../netpreserve/jwarc/WarcXEntityRefused.java | 45 ++++++++ .../jwarc/WarcXResponseReference.java | 4 +- .../crawl/retreival/CrawlDataReference.java | 6 +- .../crawl/retreival/CrawlDelayTimer.java | 24 +++-- .../crawl/retreival/CrawlerRetreiver.java | 100 ++++++++--------- .../retreival/CrawlerWarcResynchronizer.java | 14 +++ .../crawl/retreival/DomainCrawlFrontier.java | 7 +- .../warc/WarcProtocolReconstructor.java | 23 +++- .../retreival/fetcher/warc/WarcRecorder.java | 24 ++--- .../retreival/revisit/CrawlerRevisitor.java | 10 +- 12 files changed, 245 insertions(+), 114 deletions(-) create mode 100644 code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java index a766a58d..02aefb6d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -28,6 +28,7 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa path = file; reader = new WarcReader(file); WarcXResponseReference.register(reader); + WarcXEntityRefused.register(reader); backingIterator = reader.iterator(); } diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 60e0178e..5a993fda 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -9,29 +9,34 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.net.URI; import java.nio.file.Path; public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { private final ParquetWriter writer; private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class); - public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) throws IOException { + public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) { try (var warcReader = new WarcReader(warcInputFile); var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile) ) { WarcXResponseReference.register(warcReader); + WarcXEntityRefused.register(warcReader); for (var record : warcReader) { if (record instanceof WarcResponse response) { + // this also captures WarcXResponseReference, which inherits from WarcResponse + // and is used to store old responses from previous crawls; in this part of the logic + // we treat them the same as a normal response + parquetWriter.write(domain, response); } + else if (record instanceof WarcXEntityRefused refused) { + parquetWriter.write(domain, refused); + } else if (record instanceof Warcinfo warcinfo) { - parquetWriter.write(domain, warcinfo); + parquetWriter.write(warcinfo); 
} - else { - logger.warn("Skipping record of type {}", record.type()); - } - } } catch (Exception ex) { @@ -39,31 +44,40 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } - private void write(String domain, Warcinfo warcinfo) throws IOException { + private void write(String domain, WarcXEntityRefused refused) throws IOException { + URI profile = refused.profile(); + + String meta; + if (profile.equals(WarcXEntityRefused.documentRobotsTxtSkippedURN)) { + meta = "x-marginalia/advisory;state=robots-txt-skipped"; + } + else if (profile.equals(WarcXEntityRefused.documentBadContentTypeURN)) { + meta = "x-marginalia/advisory;state=content-type-failed-probe"; + } + else if (profile.equals(WarcXEntityRefused.documentProbeTimeout)) { + meta = "x-marginalia/advisory;state=timeout-probe"; + } + else if (profile.equals(WarcXEntityRefused.documentUnspecifiedError)) { + meta = "x-marginalia/advisory;state=doc-error"; + } + else { + meta = "x-marginalia/advisory;state=unknown"; + } + + write(forDocError(domain, refused.target(), meta)); + } + + private void write(Warcinfo warcinfo) throws IOException { String selfDomain = warcinfo.fields().first("domain").orElse(""); String ip = warcinfo.fields().first("ip").orElse(""); String probeStatus = warcinfo.fields().first("X-WARC-Probe-Status").orElse(""); if (probeStatus.startsWith("REDIRECT")) { String redirectDomain = probeStatus.substring("REDIRECT;".length()); - write(new CrawledDocumentParquetRecord(selfDomain, - STR."https://\{redirectDomain}/", - ip, - false, - 0, - "x-marginalia/advisory;state=redirect", - new byte[0] - )); + write(forDomainRedirect(selfDomain, redirectDomain)); } else if (!"OK".equals(probeStatus)) { - write(new CrawledDocumentParquetRecord(selfDomain, - STR."https://\{domain}/", - ip, - false, - 0, - "x-marginalia/advisory;state=error", - probeStatus.getBytes() - )); + write(forDomainError(selfDomain, ip, probeStatus)); } } @@ -83,6 +97,15 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return; } + // We don't want to store robots.txt files, as they are not + // interesting for the analysis we want to do. This is important + // since txt-files in general are interesting, and we don't want to + // exclude them as a class. 
+ + if (fetchOk.uri().getPath().equals("/robots.txt")) { + return; + } + byte[] bodyBytes; String contentType; @@ -112,4 +135,36 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { public void close() throws IOException { writer.close(); } + + private CrawledDocumentParquetRecord forDomainRedirect(String domain, String redirectDomain) { + return new CrawledDocumentParquetRecord(domain, + STR."https://\{redirectDomain}/", + "", + false, + 0, + "x-marginalia/advisory;state=redirect", + new byte[0] + ); + } + private CrawledDocumentParquetRecord forDomainError(String domain, String ip, String errorStatus) { + return new CrawledDocumentParquetRecord(domain, + STR."https://\{domain}/", + ip, + false, + 0, + "x-marginalia/advisory;state=error", + errorStatus.getBytes() + ); + } + + private CrawledDocumentParquetRecord forDocError(String domain, String url, String errorStatus) { + return new CrawledDocumentParquetRecord(domain, + url, + "", + false, + 0, + "x-marginalia/advisory;state=error", + errorStatus.getBytes() + ); + } } diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java new file mode 100644 index 00000000..4480115e --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXEntityRefused.java @@ -0,0 +1,45 @@ +package org.netpreserve.jwarc; + +import java.io.IOException; +import java.net.URI; + +/** This defines a non-standard extension to WARC for storing old HTTP responses, + * essentially a 'response' with different semantics + */ +public class WarcXEntityRefused extends WarcRevisit { + private static final String TYPE_NAME = "x-entity-refused"; + + public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped"); + public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe"); + public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe"); + public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error"); + + WarcXEntityRefused(MessageVersion version, MessageHeaders headers, MessageBody body) { + super(version, headers, body); + } + + public static void register(WarcReader reader) { + reader.registerType(TYPE_NAME, WarcXEntityRefused::new); + } + + public static class Builder extends AbstractBuilder { + public Builder(URI targetURI, URI profile) { + this(targetURI.toString(), profile.toString()); + } + + public Builder(String targetURI, String profileURI) { + super(TYPE_NAME); + setHeader("WARC-Target-URI", targetURI); + setHeader("WARC-Profile", profileURI); + } + + public Builder body(HttpResponse httpResponse) throws IOException { + return body(MediaType.HTTP_RESPONSE, httpResponse); + } + + @Override + public WarcXEntityRefused build() { + return build(WarcXEntityRefused::new); + } + } +} diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java index 7e02d936..19a5a00f 100644 --- a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXResponseReference.java @@ -4,9 +4,7 @@ import java.io.IOException; import java.net.URI; 
/** This defines a non-standard extension to WARC for storing old HTTP responses, - * essentially a 'revisit' with a full body, which is not something that is - * expected by the jwarc parser, and goes against the semantics of the revisit - * records a fair bit. + * essentially a 'response' with different semantics. *

* An x-response-reference record is a response record with a full body, where * the data is a reconstructed HTTP response from a previous crawl. diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 9088ebb4..65e1529b 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -5,6 +5,8 @@ import com.google.common.hash.Hashing; import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.lsh.EasyLSH; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import javax.annotation.Nullable; import java.io.IOException; @@ -15,6 +17,7 @@ import java.nio.file.Path; public class CrawlDataReference implements AutoCloseable { private final SerializableCrawlDataStream data; + private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class); public CrawlDataReference(SerializableCrawlDataStream data) { this.data = data; @@ -43,8 +46,9 @@ public class CrawlDataReference implements AutoCloseable { } } catch (IOException ex) { - ex.printStackTrace(); + logger.error("Failed to read next document", ex); } + return null; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java index ca2494dc..e52b73b6 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDelayTimer.java @@ -20,8 +20,18 @@ public class CrawlDelayTimer { this.delayTime = delayTime; } + /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then + * set a flag that slows down the main crawl delay as well. */ + public void waitRetryDelay(RateLimitException ex) throws InterruptedException { + slowDown = true; + + int delay = ex.retryAfter(); + + Thread.sleep(Math.clamp(delay, 100, 5000)); + } + @SneakyThrows - public void delay(long spentTime) { + public void waitFetchDelay(long spentTime) { long sleepTime = delayTime; if (sleepTime >= 1) { @@ -30,10 +40,6 @@ public class CrawlDelayTimer { Thread.sleep(min(sleepTime - spentTime, 5000)); } - else if (slowDown) { - // Additional delay when the server is signalling it wants slower requests - Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); - } else { // When no crawl delay is specified, lean toward twice the fetch+process time, // within sane limits. 
This means slower servers get slower crawling, and faster @@ -48,10 +54,10 @@ public class CrawlDelayTimer { Thread.sleep(sleepTime - spentTime); } - } - /** Increase the delay between requests if the server is signalling it wants slower requests with HTTP 429 */ - public void slowDown() { - slowDown = true; + if (slowDown) { + // Additional delay when the server is signalling it wants slower requests + Thread.sleep( DEFAULT_CRAWL_DELAY_MIN_MS); + } } } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 668f597a..35f5bcd0 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -3,7 +3,6 @@ package nu.marginalia.crawl.retreival; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import crawlercommons.robots.SimpleRobotRules; -import lombok.SneakyThrows; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawl.retreival.fetcher.ContentTags; @@ -19,6 +18,7 @@ import nu.marginalia.ip_blocklist.UrlBlocklist; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawlspec.CrawlSpecRecord; +import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,6 +32,7 @@ import java.util.*; public class CrawlerRetreiver implements AutoCloseable { private static final int MAX_ERRORS = 20; + private static final int HTTP_429_RETRY_LIMIT = 1; // Retry 429s once private final HttpFetcher fetcher; @@ -40,7 +41,6 @@ public class CrawlerRetreiver implements AutoCloseable { private static final LinkParser linkParser = new LinkParser(); private static final Logger logger = LoggerFactory.getLogger(CrawlerRetreiver.class); - private static final HashFunction hashMethod = Hashing.murmur3_128(0); private static final UrlBlocklist urlBlocklist = new UrlBlocklist(); private static final LinkFilterSelector linkFilterSelector = new LinkFilterSelector(); @@ -104,7 +104,7 @@ public class CrawlerRetreiver implements AutoCloseable { resync.run(warcFile); } - private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException { + private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException { String ip = findIp(domain); EdgeUrl rootUrl; @@ -124,7 +124,7 @@ public class CrawlerRetreiver implements AutoCloseable { final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); - sniffRootDocument(delayTimer, rootUrl); + sniffRootDocument(rootUrl); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer); @@ -181,8 +181,14 @@ public class CrawlerRetreiver implements AutoCloseable { continue; - if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) { - fetchedCount++; + try { + if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) { + fetchedCount++; + } + } + catch 
(InterruptedException ex) { + Thread.currentThread().interrupt(); + break; } } @@ -192,17 +198,17 @@ public class CrawlerRetreiver implements AutoCloseable { } /** Using the old crawl data, fetch the documents comparing etags and last-modified */ - private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) { + private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException { return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); } - private void sniffRootDocument(CrawlDelayTimer delayTimer, EdgeUrl rootUrl) { + private void sniffRootDocument(EdgeUrl rootUrl) { try { logger.debug("Configuring link filter"); var url = rootUrl.withPathAndParam("/", null); - var result = tryDownload(url, delayTimer, ContentTags.empty()); + var result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty()); if (!(result instanceof HttpFetchResult.ResultOk ok)) return; @@ -239,22 +245,28 @@ public class CrawlerRetreiver implements AutoCloseable { } public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, - CrawlDelayTimer timer, - DocumentWithReference reference) { + CrawlDelayTimer timer, + DocumentWithReference reference) throws InterruptedException + { logger.debug("Fetching {}", top); + HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone(); + long startTime = System.currentTimeMillis(); - var contentTags = reference.getContentTags(); - var fetchedDoc = tryDownload(top, timer, contentTags); - if (fetchedDoc instanceof HttpFetchResult.Result304Raw) { - var doc = reference.doc(); - if (doc != null) { - warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); - fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, - new ContentType(doc.contentType, "UTF-8"), - doc.documentBody); + // Fetch the document, retrying if we get a rate limit exception + for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { + try { + fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags); + break; + } + catch (RateLimitException ex) { + timer.waitRetryDelay(ex); + } + catch (Exception ex) { + logger.warn("Failed to fetch {}", top, ex); + fetchedDoc = new HttpFetchResult.ResultException(ex); } } @@ -268,14 +280,19 @@ public class CrawlerRetreiver implements AutoCloseable { crawlFrontier.addVisited(new EdgeUrl(ok.uri())); } } - else if (fetchedDoc instanceof HttpFetchResult.Result304ReplacedWithReference retained) { - var docOpt = retained.parseDocument(); - if (docOpt.isPresent()) { - var doc = docOpt.get(); + else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { + var doc = reference.doc(); - crawlFrontier.enqueueLinksFromDocument(top, doc); - EdgeUrl.parse(retained.url()).ifPresent(crawlFrontier::addVisited); - } + warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody); + + fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, + new ContentType(doc.contentType, "UTF-8"), + doc.documentBody); + + var parsed = Jsoup.parse(doc.documentBody); + + crawlFrontier.enqueueLinksFromDocument(top, parsed); + crawlFrontier.addVisited(top); } else if (fetchedDoc instanceof HttpFetchResult.ResultException ex) { errorCount ++; @@ -285,7 +302,7 @@ public class CrawlerRetreiver implements AutoCloseable { logger.error("Error parsing document {}", top, ex); } - timer.delay(System.currentTimeMillis() - startTime); + 
timer.waitFetchDelay(System.currentTimeMillis() - startTime); return fetchedDoc; } @@ -295,33 +312,6 @@ public class CrawlerRetreiver implements AutoCloseable { || proto.equalsIgnoreCase("https"); } - @SneakyThrows - private HttpFetchResult tryDownload(EdgeUrl top, CrawlDelayTimer timer, ContentTags tags) { - for (int i = 0; i < 2; i++) { - try { - return fetcher.fetchContent(top, warcRecorder, tags); - } - catch (RateLimitException ex) { - timer.slowDown(); - - int delay = ex.retryAfter(); - if (delay > 0 && delay < 5000) { - Thread.sleep(delay); - } - } - catch (Exception ex) { - logger.warn("Failed to fetch {}", top, ex); - return new HttpFetchResult.ResultException(ex); - } - } - - return new HttpFetchResult.ResultNone(); - } - - private String createHash(String documentBodyHash) { - return hashMethod.hashUnencodedChars(documentBodyHash).toString(); - } - // FIXME this does not belong in the crawler private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { baseUrl = baseUrl.domain.toRootUrl(); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java index 47b5b2d8..52ebe2f3 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java @@ -34,6 +34,7 @@ public class CrawlerWarcResynchronizer { // First pass, enqueue links try (var reader = new WarcReader(tempFile)) { WarcXResponseReference.register(reader); + WarcXEntityRefused.register(reader); for (var item : reader) { accept(item); @@ -58,13 +59,26 @@ public class CrawlerWarcResynchronizer { response(rsp); } else if (item instanceof WarcRequest req) { request(req); + } else if (item instanceof WarcXEntityRefused refused) { + refused(refused); } + } catch (Exception ex) { logger.info(STR."Failed to process warc record \{item}", ex); } } + private void refused(WarcXEntityRefused refused) { + // In general, we don't want to re-crawl urls that were refused, + // but to permit circumstances to change over time, we'll + // allow for a small chance of re-probing these entries + + if (Math.random() > 0.1) { + crawlFrontier.addVisited(new EdgeUrl(refused.targetURI())); + } + } + private void request(WarcRequest request) { EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java index 6d868fdf..46446fee 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/DomainCrawlFrontier.java @@ -50,9 +50,14 @@ public class DomainCrawlFrontier { } } + /** Increase the depth of the crawl by a factor. If the current depth is smaller + * than the number of already visited documents, the base depth will be adjusted + * to the visited count first. 
+ */ public void increaseDepth(double depthIncreaseFactor) { - depth = (int)(depth * depthIncreaseFactor); + depth = (int)(Math.max(visited.size(), depth) * depthIncreaseFactor); } + public void setLinkFilter(Predicate linkFilter) { this.linkFilter = linkFilter; } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java index 2ceb076d..ad29056f 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -20,7 +20,10 @@ public class WarcProtocolReconstructor { static String getHttpRequestString(Request request, URI uri) { StringBuilder requestStringBuilder = new StringBuilder(); - requestStringBuilder.append(request.method()).append(" ").append(URLEncoder.encode(uri.getPath(), StandardCharsets.UTF_8)); + + final String encodedURL = encodeURLKeepSlashes(uri.getPath()); + + requestStringBuilder.append(request.method()).append(" ").append(encodedURL); if (uri.getQuery() != null) { requestStringBuilder.append("?").append(uri.getQuery()); @@ -37,6 +40,19 @@ public class WarcProtocolReconstructor { return requestStringBuilder.toString(); } + /** Java's URLEncoder will URLEncode slashes, which is not desirable + * when sanitizing a URL for HTTP protocol purposes + */ + + private static String encodeURLKeepSlashes(String URL) { + String[] parts = StringUtils.split(URL,"/"); + StringJoiner joiner = new StringJoiner("/"); + for (String part : parts) { + joiner.add(URLEncoder.encode(part, StandardCharsets.UTF_8)); + } + return joiner.toString(); + } + static String getResponseHeader(String headersAsString, int code) { String version = "1.1"; @@ -131,6 +147,11 @@ public class WarcProtocolReconstructor { if (headerCapitalized.startsWith("X-Marginalia")) return; + // Omit Transfer-Encoding header, as we'll be using Content-Length + // instead in the warc file, despite what the server says + if (headerCapitalized.startsWith("Transfer-Encoding")) + return; + for (var value : values) { joiner.add(headerCapitalized + ": " + value); } diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index a1335eb8..5ccfacb5 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -29,11 +29,6 @@ import java.util.*; * be reconstructed. 
*/ public class WarcRecorder implements AutoCloseable { - public static final URI documentRobotsTxtSkippedURN = URI.create("urn:marginalia/meta/doc/robots-txt-skipped"); - public static final URI documentBadContentTypeURN = URI.create("urn:marginalia/meta/doc/content-type-failed-probe"); - public static final URI documentProbeTimeout = URI.create("urn:marginalia/meta/doc/timeout-probe"); - public static final URI documentUnspecifiedError = URI.create("urn:marginalia/meta/doc/error"); - private static final int MAX_TIME = 30_000; private static final int MAX_SIZE = 1024 * 1024 * 10; private final WarcWriter writer; @@ -91,6 +86,8 @@ public class WarcRecorder implements AutoCloseable { ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); + boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty(); + try (var response = call.execute()) { var body = response.body(); InputStream inputStream; @@ -143,6 +140,7 @@ public class WarcRecorder implements AutoCloseable { WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) .blockDigest(responseDigestBuilder.build()) + .addHeader("X-Has-Cookies", hasCookies ? "1" : "0") .date(date) .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); @@ -280,11 +278,11 @@ public class WarcRecorder implements AutoCloseable { public void flagAsRobotsTxtError(EdgeUrl top) { try { - WarcRevisit revisit = new WarcRevisit.Builder(top.asURI(), documentRobotsTxtSkippedURN) + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(top.asURI(), WarcXEntityRefused.documentRobotsTxtSkippedURN) .date(Instant.now()) .build(); - writer.write(revisit); + writer.write(refusal); } catch (URISyntaxException | IOException e) { throw new RuntimeException(e); } @@ -292,13 +290,13 @@ public class WarcRecorder implements AutoCloseable { public void flagAsFailedContentTypeProbe(EdgeUrl url, String contentType, int status) { try { - WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentBadContentTypeURN) + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentBadContentTypeURN) .date(Instant.now()) .addHeader("Rejected-Content-Type", contentType) .addHeader("Http-Status", Integer.toString(status)) .build(); - writer.write(revisit); + writer.write(refusal); } catch (URISyntaxException | IOException e) { throw new RuntimeException(e); } @@ -306,13 +304,13 @@ public class WarcRecorder implements AutoCloseable { public void flagAsError(EdgeUrl url, Exception ex) { try { - WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentUnspecifiedError) + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentUnspecifiedError) .date(Instant.now()) .addHeader("Exception", ex.getClass().getSimpleName()) .addHeader("ErrorMessage", Objects.requireNonNullElse(ex.getMessage(), "")) .build(); - writer.write(revisit); + writer.write(refusal); } catch (URISyntaxException | IOException e) { throw new RuntimeException(e); } @@ -320,11 +318,11 @@ public class WarcRecorder implements AutoCloseable { public void flagAsTimeout(EdgeUrl url) { try { - WarcRevisit revisit = new WarcRevisit.Builder(url.asURI(), documentProbeTimeout) + WarcXEntityRefused refusal = new WarcXEntityRefused.Builder(url.asURI(), WarcXEntityRefused.documentProbeTimeout) .date(Instant.now()) .build(); - writer.write(revisit); + writer.write(refusal); } catch (URISyntaxException | IOException e) { throw new RuntimeException(e); } diff --git 
a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index 70a98310..91c21d65 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -15,13 +15,6 @@ import org.jsoup.Jsoup; * E-Tag and Last-Modified headers. */ public class CrawlerRevisitor { - /** recrawlState tag for documents that had a HTTP status 304 */ - public static final String documentWasRetainedTag = "RETAINED/304"; - - /** recrawlState tag for documents that had a 200 status but were identical to a previous version */ - public static final String documentWasSameTag = "SAME-BY-COMPARISON"; - - private final DomainCrawlFrontier crawlFrontier; private final CrawlerRetreiver crawlerRetreiver; private final WarcRecorder warcRecorder; @@ -37,7 +30,8 @@ public class CrawlerRevisitor { /** Performs a re-crawl of old documents, comparing etags and last-modified */ public int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, - CrawlDelayTimer delayTimer) { + CrawlDelayTimer delayTimer) + throws InterruptedException { int recrawled = 0; int retained = 0; From fa81e5b8ee7bd0907821e5966882f017e40f9670 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 16:37:53 +0100 Subject: [PATCH 15/23] (warc) Use a non-standard WARC header to convey information about whether a website uses cookies This information is then propagated to the parquet file as a boolean. For documents that are copied from the reference, use whatever value we last saw. This isn't 100% deterministic and may result in false negatives, but it permits websites that used cookies but have since stopped to 'repent' and have the change reflected in the search engine more quickly.
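As a rough reader-side illustration (not part of the patch itself), a consumer of these WARC files could check the flag along the following lines. The helper name is hypothetical, but the header name and the "1"/"0" values follow the convention introduced below, and the headers().contains(name, value) call mirrors the hasCookies() helper added in this change:

    import org.netpreserve.jwarc.WarcRecord;

    class CookieFlagExample {
        // Sketch: reader-side check for the non-standard "X-Has-Cookies" header
        static boolean usesCookies(WarcRecord record) {
            return record.headers().contains("X-Has-Cookies", "1");
        }
    }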
--- ...rawledDocumentParquetRecordFileWriter.java | 2 +- .../jwarc/WarcXCookieInformationHeader.java | 35 +++++++++++++++++++ .../retreival/fetcher/warc/WarcRecorder.java | 20 ++++++++--- 3 files changed, 51 insertions(+), 6 deletions(-) create mode 100644 code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 5a993fda..26ba8fe2 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -124,7 +124,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { domain, response.target(), fetchOk.ipAddress(), - false, // FIXME + WarcXCookieInformationHeader.hasCookies(response), fetchOk.statusCode(), contentType, bodyBytes) diff --git a/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java new file mode 100644 index 00000000..7d983580 --- /dev/null +++ b/code/process-models/crawling-model/src/main/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java @@ -0,0 +1,35 @@ +package org.netpreserve.jwarc; + +import okhttp3.HttpUrl; +import okhttp3.OkHttpClient; + +/** Encapsulates out-of-band information about whether a website uses cookies, + * using a non-standard WARC header "X-Has-Cookies". + */ +public class WarcXCookieInformationHeader { + private boolean hasCookies = false; + private static final String headerName = "X-Has-Cookies"; + + public void update(OkHttpClient client, HttpUrl url) { + if (!hasCookies) { + hasCookies = !client.cookieJar().loadForRequest(url).isEmpty(); + } + } + + public boolean hasCookies() { + return hasCookies; + } + + public void paint(WarcResponse.Builder builder) { + builder.addHeader(headerName, hasCookies ? "1" : "0"); + } + public void paint(WarcXResponseReference.Builder builder) { + builder.addHeader(headerName, hasCookies ? "1" : "0"); + } + + public static boolean hasCookies(WarcRecord record) { + return record.headers().contains(headerName, "1"); + } + + +} diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index 5ccfacb5..e31585ef 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -43,6 +43,11 @@ public class WarcRecorder implements AutoCloseable { // in some way private final String warcRecorderVersion = "1.0"; + // We need to know if the site uses cookies so this can be reported among the search results + // -- flip this to true if we see any cookies. This information will also be painted on any + // revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough. 
+ private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader(); + /** * Create a new WarcRecorder that will write to the given file * @@ -86,7 +91,7 @@ public class WarcRecorder implements AutoCloseable { ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(); - boolean hasCookies = !client.cookieJar().loadForRequest(request.url()).isEmpty(); + cookieInformation.update(client, request.url()); try (var response = call.execute()) { var body = response.body(); @@ -140,10 +145,11 @@ public class WarcRecorder implements AutoCloseable { WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri) .blockDigest(responseDigestBuilder.build()) - .addHeader("X-Has-Cookies", hasCookies ? "1" : "0") .date(date) .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); + cookieInformation.paint(responseBuilder); + if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip)); responseBuilder.payloadDigest(payloadDigestBuilder.build()); @@ -215,16 +221,20 @@ public class WarcRecorder implements AutoCloseable { payloadDigestBuilder.update(bytes, bytes.length); responseDataBuffer.put(bytes, 0, bytes.length); - WarcXResponseReference reference = new WarcXResponseReference.Builder(url.asURI()) + WarcXResponseReference.Builder builder = new WarcXResponseReference.Builder(url.asURI()) .blockDigest(responseDigestBuilder.build()) .payloadDigest(payloadDigestBuilder.build()) .date(Instant.now()) - .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()) - .build(); + .body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes()); + + cookieInformation.paint(builder); + + var reference = builder.build(); reference.http(); // force HTTP header to be parsed before body is consumed so that caller can use it writer.write(reference); + } catch (URISyntaxException | IOException | NoSuchAlgorithmException e) { throw new RuntimeException(e); } From cf935a533110adda000a74064052e66fbc4558bc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 18:09:53 +0100 Subject: [PATCH 16/23] (converter) Read cookie information Add an optional new field to CrawledDocument containing information about whether the domain has cookies. This was previously on the CrawledDomain object, but since the WarcFormat requires us to write a WarcInfo object at the start of a crawl rather than at the end, this information is unobtainable when creating the CrawledDomain object. Also fix a bug in the deduplication logic in the DomainProcessor class that caused a test to break. 
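Since the flag may arrive either on the legacy CrawledDomain.cookies list (older crawl data) or on the new per-document field (WARC-derived data), a consumer that wants a single per-domain answer can merge the two sources. A minimal sketch, assuming the field names used in this patch and a hypothetical helper name:

    // Sketch: derive one per-domain cookie flag from both the old and the new location
    static boolean domainUsesCookies(CrawledDomain domain) {
        if (domain.cookies != null && !domain.cookies.isEmpty())
            return true; // legacy signal, still present in older crawl data
        return domain.doc.stream()
                .anyMatch(doc -> Boolean.TRUE.equals(doc.hasCookies)); // new per-document signal
    }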
--- .../io/format/ParquetSerializableCrawlDataStream.java | 3 ++- .../io/format/WarcSerializableCrawlDataStream.java | 8 +++++--- .../nu/marginalia/crawling/model/CrawledDocument.java | 4 ++++ .../nu/marginalia/crawling/model/CrawledDomain.java | 3 +++ .../converting/processor/DomainProcessor.java | 6 +++++- .../converting/sideload/SideloaderProcessing.java | 3 ++- .../converting/ConvertingIntegrationTest.java | 4 +++- .../CrawlingThenConvertingIntegrationTest.java | 10 +++++++++- 8 files changed, 33 insertions(+), 8 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 0b852e01..e31913fd 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -101,7 +101,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial "", nextRecord.url, null, - "")); + "", + nextRecord.cookies)); } public void close() throws IOException { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java index 02aefb6d..2cdb7af1 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/WarcSerializableCrawlDataStream.java @@ -69,7 +69,6 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa redirectDomain = statusReason; } - // TODO: Fix cookies info somehow next = new CrawledDomain(domain, redirectDomain, status, statusReason, ip, new ArrayList<>(), new ArrayList<>() @@ -98,7 +97,9 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa response.payloadDigest().map(WarcDigest::base64).orElse(""), "", "", - ""); + "", + WarcXCookieInformationHeader.hasCookies(response) + ); } else if (parsedBody instanceof DocumentBodyResult.Ok ok) { next = new CrawledDocument( "", @@ -113,7 +114,8 @@ public class WarcSerializableCrawlDataStream implements AutoCloseable, Serializa response.payloadDigest().map(WarcDigest::base64).orElse(""), "", "", - ""); + "", + WarcXCookieInformationHeader.hasCookies(response)); } else { // unreachable throw new IllegalStateException("Unknown body type: " + parsedBody); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 143c775b..7d85bdfd 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -30,6 +30,10 @@ public class CrawledDocument implements SerializableCrawlData { public String recrawlState; + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDomain instead */ + public Boolean hasCookies = false; + public static final String SERIAL_IDENTIFIER = "// 
DOCUMENT"; @Override public String getSerialIdentifier() { diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java index 55ec27a6..3add3b8d 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -17,6 +17,9 @@ public class CrawledDomain implements SerializableCrawlData { public String ip; public List doc; + + /** This is not guaranteed to be set in all versions of the format, + * information may come in CrawledDocument instead */ public List cookies; public int size() { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index 2f0fc690..f86b6bfe 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -97,11 +97,15 @@ public class DomainProcessor { } else if (data instanceof CrawledDocument doc) { try { - if (doc.url == null || processedUrls.add(doc.url)) + if (doc.url == null || !processedUrls.add(doc.url)) continue; fixBadCanonicalTag(doc); + if (Boolean.TRUE.equals(doc.hasCookies)) { + cookies = true; + } + // This case should never be reachable, as we should have initiated // the externalDomainLinks variable above if we made it past the // doc.url == null check; but we'll leave it here just in case diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 65f0bd41..16a1ae7c 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -50,7 +50,8 @@ public class SideloaderProcessing { Integer.toHexString(url.hashCode()), url, "", - "SIDELOAD" + "SIDELOAD", + false ); var ret = new ProcessedDocument(); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java index ce0d8f4a..eaa9d813 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -65,6 +65,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNu() throws IOException { var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet())); + assertNotNull(ret); assertEquals(ret.state, DomainIndexingState.ACTIVE); assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu")); @@ -114,7 +115,8 @@ public class ConvertingIntegrationTest { Double.toString(Math.random()), "https://memex.marginalia.nu/" + file, null, - "" + "", + false ); docs.add(doc); } diff --git 
a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 844062bb..51667b3a 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -11,10 +11,13 @@ import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; @@ -31,6 +34,7 @@ public class CrawlingThenConvertingIntegrationTest { private HttpFetcher httpFetcher; private Path fileName; + private Path fileName2; @SneakyThrows @BeforeAll @@ -49,11 +53,13 @@ public class CrawlingThenConvertingIntegrationTest { domainProcessor = injector.getInstance(DomainProcessor.class); httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); + this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz"); } @AfterEach public void tearDown() throws IOException { Files.deleteIfExists(fileName); + Files.deleteIfExists(fileName2); } @Test @@ -90,7 +96,9 @@ public class CrawlingThenConvertingIntegrationTest { new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); } - try (var reader = new WarcSerializableCrawlDataStream(fileName)) { + CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2); + + try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) { while (reader.hasNext()) { data.add(reader.next()); } From 2e536e3141de72517400c1fd55a8646fd1a634a3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 20:23:27 +0100 Subject: [PATCH 17/23] (crawler) Add timestamp to CrawledDocument records This update includes the addition of timestamps to the parquet format for crawl data, as extracted from the Warc stream. The parquet format stores the timestamp as a 64 bit long, seconds since unix epoch, without a logical type. This is to avoid having to do format conversions when writing and reading the data. This parquet field populates the timestamp field in CrawledDocument. 
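The round trip is plain epoch-second arithmetic; a small sketch of the dehydrate/hydrate conversion, mirroring the writer and reader changes below (Instant.now() stands in for whatever timestamp the WARC record supplies):

    import java.time.Instant;

    // Writing: store seconds since the Unix epoch as a plain 64-bit long (no logical type)
    long epochSeconds = Instant.now().getEpochSecond();

    // Reading: rebuild the Instant from the stored long, as the hydrator does
    Instant timestamp = Instant.ofEpochSecond(epochSeconds);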
--- .../ParquetSerializableCrawlDataStream.java | 2 +- .../parquet/CrawledDocumentParquetRecord.java | 8 +++++++- .../CrawledDocumentParquetRecordFileWriter.java | 17 +++++++++++------ ...wledDocumentParquetRecordFileWriterTest.java | 3 ++- .../CrawlingThenConvertingIntegrationTest.java | 6 +++++- 5 files changed, 26 insertions(+), 10 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index e31913fd..d4ad4ed4 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -92,7 +92,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial nextQ.add(new CrawledDocument("", nextRecord.url, nextRecord.contentType, - "", + nextRecord.timestamp.toString(), nextRecord.httpStatus, "OK", "", diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java index 6e0e5a0b..c96aeb25 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java @@ -10,7 +10,9 @@ import lombok.ToString; import org.apache.parquet.schema.MessageType; import org.apache.parquet.schema.Types; -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; +import java.time.Instant; + +import static org.apache.parquet.schema.LogicalTypeAnnotation.*; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; @AllArgsConstructor @@ -23,6 +25,7 @@ public class CrawledDocumentParquetRecord { public String ip; public boolean cookies; public int httpStatus; + public Instant timestamp; public String contentType; public byte[] body; @@ -41,6 +44,7 @@ public class CrawledDocumentParquetRecord { Types.required(BINARY).as(stringType()).named("ip"), Types.required(BOOLEAN).named("cookies"), Types.required(INT32).named("httpStatus"), + Types.required(INT64).named("epochSeconds"), Types.required(BINARY).as(stringType()).named("contentType"), Types.required(BINARY).named("body") ); @@ -55,6 +59,7 @@ public class CrawledDocumentParquetRecord { case "cookies" -> cookies = (Boolean) value; case "contentType" -> contentType = (String) value; case "body" -> body = (byte[]) value; + case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value); default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); } return this; @@ -64,6 +69,7 @@ public class CrawledDocumentParquetRecord { valueWriter.write("domain", domain); valueWriter.write("url", url); valueWriter.write("ip", ip); + valueWriter.write("epochSeconds", timestamp.getEpochSecond()); valueWriter.write("httpStatus", httpStatus); valueWriter.write("cookies", cookies); valueWriter.write("contentType", contentType); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java 
b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 26ba8fe2..40830299 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -11,6 +11,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URI; import java.nio.file.Path; +import java.time.Instant; public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { private final ParquetWriter writer; @@ -64,7 +65,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { meta = "x-marginalia/advisory;state=unknown"; } - write(forDocError(domain, refused.target(), meta)); + write(forDocError(domain, refused.date(), refused.target(), meta)); } private void write(Warcinfo warcinfo) throws IOException { @@ -74,10 +75,10 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { if (probeStatus.startsWith("REDIRECT")) { String redirectDomain = probeStatus.substring("REDIRECT;".length()); - write(forDomainRedirect(selfDomain, redirectDomain)); + write(forDomainRedirect(selfDomain, warcinfo.date(), redirectDomain)); } else if (!"OK".equals(probeStatus)) { - write(forDomainError(selfDomain, ip, probeStatus)); + write(forDomainError(selfDomain, warcinfo.date(), ip, probeStatus)); } } @@ -126,6 +127,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { fetchOk.ipAddress(), WarcXCookieInformationHeader.hasCookies(response), fetchOk.statusCode(), + response.date(), contentType, bodyBytes) ); @@ -136,33 +138,36 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { writer.close(); } - private CrawledDocumentParquetRecord forDomainRedirect(String domain, String redirectDomain) { + private CrawledDocumentParquetRecord forDomainRedirect(String domain, Instant date, String redirectDomain) { return new CrawledDocumentParquetRecord(domain, STR."https://\{redirectDomain}/", "", false, 0, + date, "x-marginalia/advisory;state=redirect", new byte[0] ); } - private CrawledDocumentParquetRecord forDomainError(String domain, String ip, String errorStatus) { + private CrawledDocumentParquetRecord forDomainError(String domain, Instant date, String ip, String errorStatus) { return new CrawledDocumentParquetRecord(domain, STR."https://\{domain}/", ip, false, 0, + date, "x-marginalia/advisory;state=error", errorStatus.getBytes() ); } - private CrawledDocumentParquetRecord forDocError(String domain, String url, String errorStatus) { + private CrawledDocumentParquetRecord forDocError(String domain, Instant date, String url, String errorStatus) { return new CrawledDocumentParquetRecord(domain, url, "", false, 0, + date, "x-marginalia/advisory;state=error", errorStatus.getBytes() ); diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index 15338de1..450d147b 100644 --- a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java 
@@ -7,11 +7,11 @@ import nu.marginalia.crawling.model.SerializableCrawlData; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.netpreserve.jwarc.net.WarcRecorder; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Instant; import java.util.ArrayList; import static org.junit.jupiter.api.Assertions.*; @@ -36,6 +36,7 @@ class CrawledDocumentParquetRecordFileWriterTest { "127.0.0.1", false, 200, + Instant.now(), "text/html", "hello world".getBytes()); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 51667b3a..59912f8b 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -104,7 +104,11 @@ public class CrawlingThenConvertingIntegrationTest { } } - CrawledDomain domain = data.stream().filter(CrawledDomain.class::isInstance).map(CrawledDomain.class::cast).findFirst().get(); + CrawledDomain domain = data.stream() + .filter(CrawledDomain.class::isInstance) + .map(CrawledDomain.class::cast) + .findFirst() + .get(); data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); return domain; } From 532996815592328ebb53545388fc72c9d0181fbc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 21:04:06 +0100 Subject: [PATCH 18/23] (crawler) Update CrawlingThenConvertingIntegrationTest This commit updates CrawlingThenConvertingIntegrationTest with additional tests for invalid, redirecting, and blocked domains. Improvements have also been made to filter out irrelevant entries in ParquetSerializableCrawlDataStream. 
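The filtering keys off the synthetic advisory content types written by the earlier patches: such records describe crawler state (redirects, errors, refusals) rather than fetched content, so the stream drops them instead of emitting a document for each. In essence, a condensed sketch of the check added below:

    // Skip advisory records when emitting CrawledDocument entries
    if (nextRecord.contentType.startsWith("x-marginalia/advisory")) {
        return;
    }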
--- .../ParquetSerializableCrawlDataStream.java | 6 +- ...CrawlingThenConvertingIntegrationTest.java | 117 ++++++++++++++++-- 2 files changed, 111 insertions(+), 12 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index d4ad4ed4..0dcd4625 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -58,7 +58,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial String statusReason = ""; String redirectDomain = null; - if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redir")) { + if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) { EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); redirectDomain = crawledUrl.getDomain().toString(); status = CrawlerDomainStatus.REDIRECT; @@ -84,6 +84,10 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial } private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { + if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { + return; + } + String bodyString = DocumentBodyToString.getStringData( ContentType.parse(nextRecord.contentType), nextRecord.body); diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 59912f8b..14d2e528 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -4,35 +4,44 @@ import com.google.inject.Guice; import com.google.inject.Injector; import lombok.SneakyThrows; import nu.marginalia.WmsaHome; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; -import nu.marginalia.crawling.io.format.WarcSerializableCrawlDataStream; import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.crawling.model.SerializableCrawlData; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import org.junit.jupiter.api.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import 
java.util.function.Predicate; -/* This is mostly a debugging utility */ +import static org.junit.jupiter.api.Assertions.*; + +/** Tests for the crawler and converter integration. These are pretty slow and potentially + * a bit flaky, since they attempt to fetch real websites. + */ @Tag("slow") public class CrawlingThenConvertingIntegrationTest { private DomainProcessor domainProcessor; private HttpFetcher httpFetcher; + private static final Logger logger = LoggerFactory.getLogger(CrawlingThenConvertingIntegrationTest.class); + private Path fileName; private Path fileName2; @@ -63,7 +72,69 @@ public class CrawlingThenConvertingIntegrationTest { } @Test - public void crawlThenProcess() throws IOException { + public void testInvalidDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("invalid.invalid.invalid") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs); + + assertEquals("ERROR", crawlData.crawlerStatus); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void testRedirectingDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("memex.marginalia.nu") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs); + + assertEquals("REDIRECT", crawlData.crawlerStatus); + assertEquals("www.marginalia.nu", crawlData.redirectDomain); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void testBlockedDomain() throws IOException { + // Attempt to fetch an invalid domain + var specs = CrawlSpecRecord.builder() + .domain("search.marginalia.nu") + .crawlDepth(10) + .urls(List.of()) // add specific URLs to crawl here + .build(); + + CrawledDomain crawlData = crawl(specs, d->false); // simulate blocking by blacklisting everything + + assertEquals("ERROR", crawlData.crawlerStatus); + assertEquals("BLOCKED;IP not allowed", crawlData.crawlerStatusDesc); + assertTrue(crawlData.doc.isEmpty()); + + var processedData = process(); + + assertNotNull(processedData); + assertTrue(processedData.documents.isEmpty()); + } + + @Test + public void crawlSunnyDay() throws IOException { var specs = CrawlSpecRecord.builder() .domain("www.marginalia.nu") .crawlDepth(10) @@ -71,12 +142,20 @@ public class CrawlingThenConvertingIntegrationTest { .build(); CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("www.marginalia.nu", domain.domain); - List data = new ArrayList<>(); - data.add(domain); - data.addAll(domain.doc); + boolean hasRobotsTxt = domain.doc.stream().map(doc -> doc.url).anyMatch(url -> url.endsWith("/robots.txt")); + assertFalse(hasRobotsTxt, "Robots.txt should not leave the crawler"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); - var output = domainProcessor.process(SerializableCrawlDataStream.fromIterator(data.iterator())); for (var doc : output.documents) { if (doc.isOk()) { @@ -89,18 +168,33 @@ public class 
CrawlingThenConvertingIntegrationTest { } + private ProcessedDomain process() { + try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { + return domainProcessor.process(stream); + } + catch (Exception e) { + Assertions.fail(e); + return null; // unreachable + } + } private CrawledDomain crawl(CrawlSpecRecord specs) throws IOException { + return crawl(specs, domain -> true); + } + + private CrawledDomain crawl(CrawlSpecRecord specs, Predicate domainBlacklist) throws IOException { List data = new ArrayList<>(); try (var recorder = new WarcRecorder(fileName)) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(); + new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch(); } CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2); try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) { while (reader.hasNext()) { - data.add(reader.next()); + var next = reader.next(); + logger.info("{}", next); + data.add(next); } } @@ -109,6 +203,7 @@ public class CrawlingThenConvertingIntegrationTest { .map(CrawledDomain.class::cast) .findFirst() .get(); + data.stream().filter(CrawledDocument.class::isInstance).map(CrawledDocument.class::cast).forEach(domain.doc::add); return domain; } From 2e7db61808e0a33aa60d1cec26889dab5774f8af Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 21:31:16 +0100 Subject: [PATCH 19/23] (warc) More accurate filtering of advisory records We want to mute some of these records so that they don't produce documents, but in some cases we want a document to be produced for accounting purposes. Added improved tests that reach for known resources on www.marginalia.nu to test the behavior when encountering bad content type and 404s. The commit also adds some safety try-catches around the charset handling, as it may sometimes explode when fed incorrect data, and we do be guessing...
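The charset guard amounts to wrapping the body decoding in a try-catch and downgrading failures to a status code instead of letting the exception escape; a condensed sketch of the pattern applied in the hunks below:

    // Charset detection is a guess, so decoding can throw on malformed input;
    // fall back to an error status rather than aborting the whole stream.
    String bodyString = "";
    CrawlerDocumentStatus status = CrawlerDocumentStatus.OK;
    try {
        bodyString = DocumentBodyToString.getStringData(
                ContentType.parse(nextRecord.contentType),
                nextRecord.body);
    }
    catch (Exception ex) {
        status = CrawlerDocumentStatus.BAD_CHARSET;
    }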
--- .../crawling-model/build.gradle | 1 + .../crawling/body/DocumentBodyExtractor.java | 7 ++- .../ParquetSerializableCrawlDataStream.java | 39 ++++++++++------- ...rawledDocumentParquetRecordFileWriter.java | 4 +- ...CrawlingThenConvertingIntegrationTest.java | 43 +++++++++++++++++++ 5 files changed, 76 insertions(+), 18 deletions(-) diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index 03db0de9..c933ea55 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -23,6 +23,7 @@ dependencies { implementation project(':code:features-crawl:content-type') implementation project(':code:libraries:language-processing') implementation project(':third-party:parquet-floor') + implementation project(':third-party:commons-codec') implementation libs.bundles.slf4j diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java index a485e5bc..019aa761 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java @@ -39,7 +39,12 @@ public class DocumentBodyExtractor { private static DocumentBodyResult toStringResult(ContentType contentType, byte[] bytes) { if (contentTypeLogic.isAllowableContentType(contentType)) { - return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes)); + try { + return new DocumentBodyResult.Ok<>(contentType, DocumentBodyToString.getStringData(contentType, bytes)); + } + catch (Exception ex) { + return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); + } } else { return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.BAD_CONTENT_TYPE, ""); diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 0dcd4625..85b06157 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -4,12 +4,10 @@ import lombok.SneakyThrows; import nu.marginalia.contenttype.ContentType; import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.CrawlerDomainStatus; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.crawling.model.*; import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord; import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.model.EdgeUrl; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -22,8 +20,9 @@ import java.util.*; public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); + 
private final MurmurHash3_128 hash = new MurmurHash3_128(); private final Iterator backingIterator; - private Deque nextQ = new ArrayDeque<>(); + private final Deque nextQ = new ArrayDeque<>(); private boolean wroteDomainRecord = false; private final Path path; @@ -64,14 +63,13 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial status = CrawlerDomainStatus.REDIRECT; } else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=blocked")) { - status = CrawlerDomainStatus.BLOCKED; // FIXME we don't write this yet + status = CrawlerDomainStatus.BLOCKED; } else if (parquetRecord.contentType.equals("x-marginalia/advisory;state=error")) { status = CrawlerDomainStatus.ERROR; statusReason = new String(parquetRecord.body); } - // FIXME -- cookies nextQ.add(new CrawledDomain( parquetRecord.domain, redirectDomain, @@ -84,25 +82,36 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial } private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { - if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { + String bodyString = ""; + CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; + + if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) { + status = CrawlerDocumentStatus.BAD_CONTENT_TYPE; + } + else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want return; } + else { + try { + bodyString = DocumentBodyToString.getStringData( + ContentType.parse(nextRecord.contentType), + nextRecord.body); + } catch (Exception ex) { + logger.error("Failed to convert body to string", ex); + status = CrawlerDocumentStatus.BAD_CHARSET; + } + } - String bodyString = DocumentBodyToString.getStringData( - ContentType.parse(nextRecord.contentType), - nextRecord.body); - - // FIXME -- a lot of these fields are not set properly! nextQ.add(new CrawledDocument("", nextRecord.url, nextRecord.contentType, nextRecord.timestamp.toString(), nextRecord.httpStatus, - "OK", + status.toString(), "", "", bodyString, - "", + Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? 
nextRecord.url, null, "", diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 40830299..edfbc6b1 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -168,8 +168,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { false, 0, date, - "x-marginalia/advisory;state=error", - errorStatus.getBytes() + errorStatus, + new byte[0] ); } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 14d2e528..e19aa79c 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -28,7 +28,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; +import java.util.Set; import java.util.function.Predicate; +import java.util.stream.Collectors; import static org.junit.jupiter.api.Assertions.*; @@ -168,6 +170,47 @@ public class CrawlingThenConvertingIntegrationTest { } + + + @Test + public void crawlContentTypes() throws IOException { + var specs = CrawlSpecRecord.builder() + .domain("www.marginalia.nu") + .crawlDepth(5) + .urls(List.of( + "https://www.marginalia.nu/sanic.png", + "https://www.marginalia.nu/invalid" + )) + .build(); + + CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("www.marginalia.nu", domain.domain); + + Set allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet()); + assertTrue(allUrls.contains("https://www.marginalia.nu/sanic.png"), "Should have record for image despite blocked content type"); + assertTrue(allUrls.contains("https://www.marginalia.nu/invalid"), "Should have record for invalid URL"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("www.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); + + + for (var doc : output.documents) { + if (doc.isOk()) { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title); + } + else { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason); + } + } + + } + private ProcessedDomain process() { try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { return domainProcessor.process(stream);

From 0f9cd9c87dfe5a2fbd6b47b7ab05cf7e2c3ae0b3 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 15 Dec 2023 21:37:02 +0100
Subject: [PATCH 20/23] (warc) More accurate filtering of advisory records

Further, create records for resources that were blocked due to robots.txt, and add tests to verify this happens.
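A compact sketch of the advisory-record policy these two commits converge on, with made-up class and enum names standing in for the real ones: most x-marginalia advisory records are muted, while a few states are surfaced as status-only documents for accounting.

import java.util.Optional;

class AdvisoryRecordPolicy {

    enum Status { BAD_CONTENT_TYPE, ROBOTS_TXT }

    /** Empty means "mute the record entirely"; a present value means
     *  "emit a document that carries only this status". */
    static Optional<Status> classify(String contentType) {
        if (contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe"))
            return Optional.of(Status.BAD_CONTENT_TYPE);
        if (contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped"))
            return Optional.of(Status.ROBOTS_TXT);
        if (contentType.startsWith("x-marginalia/advisory"))
            return Optional.empty(); // other advisory states produce no document here
        throw new IllegalArgumentException("Not an advisory record: " + contentType);
    }
}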
--- .../ParquetSerializableCrawlDataStream.java | 3 ++ ...CrawlingThenConvertingIntegrationTest.java | 37 +++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index 85b06157..d3e54a07 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -88,6 +88,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=content-type-failed-probe")) { status = CrawlerDocumentStatus.BAD_CONTENT_TYPE; } + else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) { + status = CrawlerDocumentStatus.ROBOTS_TXT; + } else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want return; } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index e19aa79c..266670fd 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -211,6 +211,43 @@ public class CrawlingThenConvertingIntegrationTest { } + + @Test + public void crawlRobotsTxt() throws IOException { + var specs = CrawlSpecRecord.builder() + .domain("search.marginalia.nu") + .crawlDepth(5) + .urls(List.of( + "https://search.marginalia.nu/search?q=hello+world" + )) + .build(); + + CrawledDomain domain = crawl(specs); + assertFalse(domain.doc.isEmpty()); + assertEquals("OK", domain.crawlerStatus); + assertEquals("search.marginalia.nu", domain.domain); + + Set allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet()); + assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden"); + + var output = process(); + + assertNotNull(output); + assertFalse(output.documents.isEmpty()); + assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain); + assertEquals(DomainIndexingState.ACTIVE, output.state); + + for (var doc : output.documents) { + if (doc.isOk()) { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.details.title); + } + else { + System.out.println(doc.url + "\t" + doc.state + "\t" + doc.stateReason); + } + } + + } + private ProcessedDomain process() { try (var stream = new ParquetSerializableCrawlDataStream(fileName2)) { return domainProcessor.process(stream); From 2001d0f707d7f1fe2d6885b2a7a66178c545f643 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 15 Dec 2023 21:42:00 +0100 Subject: [PATCH 21/23] (converter) Add @Deprecated annotation to a few fields that should no longer be used. 
--- .../crawling/model/CrawledDocument.java | 4 ++++ ...edDocumentParquetRecordFileWriterTest.java | 1 - .../processor/DocumentProcessor.java | 7 ------- .../converting/processor/DomainProcessor.java | 21 ------------------- 4 files changed, 4 insertions(+), 29 deletions(-) diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java index 7d85bdfd..6b9ba1be 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -23,11 +23,15 @@ public class CrawledDocument implements SerializableCrawlData { public String headers; public String documentBody; + + @Deprecated public String documentBodyHash; + @Deprecated public String canonicalUrl; public String redirectUrl; + @Deprecated public String recrawlState; /** This is not guaranteed to be set in all versions of the format, diff --git a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index 450d147b..c79154a4 100644 --- a/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/src/test/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -72,7 +72,6 @@ class CrawledDocumentParquetRecordFileWriterTest { assertEquals("text/html", document.contentType); assertEquals("hello world", document.documentBody); assertEquals(200, document.httpStatus); - assertEquals("https://www.marginalia.nu/", document.canonicalUrl); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java index 8e8841a0..4b5d9173 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -105,13 +105,6 @@ public class DocumentProcessor { private EdgeUrl getDocumentUrl(CrawledDocument crawledDocument) throws URISyntaxException { - if (crawledDocument.canonicalUrl != null) { - try { - return new EdgeUrl(crawledDocument.canonicalUrl); - } - catch (URISyntaxException ex) { /* fallthrough */ } - } - return new EdgeUrl(crawledDocument.url); } diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java index f86b6bfe..e9794aad 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -100,8 +100,6 @@ public class DomainProcessor { if (doc.url == null || !processedUrls.add(doc.url)) continue; - fixBadCanonicalTag(doc); - if (Boolean.TRUE.equals(doc.hasCookies)) { cookies = true; } @@ -172,25 +170,6 @@ public class DomainProcessor { return false; } - private void 
fixBadCanonicalTag(CrawledDocument doc) { - // Some sites have a canonical tag that points to a different domain, - // but our loader can not support this, so we point these back to the - // original url. - - var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl); - if (canonicalOpt.isEmpty()) return; - - var urlOpt = EdgeUrl.parse(doc.url); - if (urlOpt.isEmpty()) return; - - var urlActual = urlOpt.get(); - var canonicalActual = canonicalOpt.get(); - - if (!Objects.equals(urlActual.domain, canonicalActual.domain)) { - doc.canonicalUrl = doc.url; - } - } - private void calculateStatistics(ProcessedDomain ret, DomainLinks externalDomainLinks) { LinkGraph linkGraph = new LinkGraph(); TopKeywords topKeywords = new TopKeywords();

From 54ed3b86bae11bcfe6fb8daa09bff4bd15968dbd Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Fri, 15 Dec 2023 21:49:35 +0100
Subject: [PATCH 22/23] (minor) Remove dead code.

--- .../marginalia/crawl/retreival/CrawlerRetreiver.java | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 35f5bcd0..18035d52 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -312,17 +312,6 @@ public class CrawlerRetreiver implements AutoCloseable { || proto.equalsIgnoreCase("https"); } - // FIXME this does not belong in the crawler - private Optional findCanonicalUrl(EdgeUrl baseUrl, Document parsed) { - baseUrl = baseUrl.domain.toRootUrl(); - - for (var link : parsed.select("link[rel=canonical]")) { - return linkParser.parseLink(baseUrl, link); - } - - return Optional.empty(); - } - private String findIp(String domain) { try { return InetAddress.getByName(domain).getHostAddress();

From 3113b5a5514fe6b146737c331f2c25f4e944bf9d Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Sat, 16 Dec 2023 15:57:10 +0100
Subject: [PATCH 23/23] (warc) Filter WarcResponses based on X-Robots-Tags

There really is no fantastic place to put this logic, but we need to remove entries whose X-Robots-Tag header indicates the page doesn't want to be indexed by Marginalia.
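For a concrete sense of the X-Robots-Tag policy, here is an illustrative standalone use of the isXRobotsTagsPermitted helper this commit moves into CrawledDocumentParquetRecordFileWriter (assuming the patched crawling-model module is on the classpath; the expected results mirror the assertions in the updated test further down):

import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;

import java.util.List;

class XRobotsTagExample {
    public static void main(String[] args) {
        String ua = "search.marginalia.nu";

        // A blanket noindex applies to every crawler, so the page is dropped
        System.out.println(CrawledDocumentParquetRecordFileWriter
                .isXRobotsTagsPermitted(List.of("noindex"), ua));            // false

        // A directive aimed at another bot is ignored
        System.out.println(CrawledDocumentParquetRecordFileWriter
                .isXRobotsTagsPermitted(List.of("googlebot: noindex"), ua)); // true

        // An explicit "all" for our user agent overrides a general "none"
        System.out.println(CrawledDocumentParquetRecordFileWriter
                .isXRobotsTagsPermitted(List.of("none", ua + ": all"), ua)); // true
    }
}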
--- .../crawling-model/build.gradle | 2 + ...rawledDocumentParquetRecordFileWriter.java | 92 +++++++++++++++++-- ...CrawlingThenConvertingIntegrationTest.java | 5 +- .../java/nu/marginalia/crawl/CrawlerMain.java | 4 +- .../retreival/fetcher/HttpFetcherImpl.java | 48 ---------- .../fetcher/HttpFetcherImplTest.java | 39 ++++---- .../retreival/fetcher/WarcRecorderTest.java | 8 +- 7 files changed, 117 insertions(+), 81 deletions(-) diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index c933ea55..ab4e8a8a 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -15,6 +15,7 @@ java { dependencies { implementation project(':code:common:model') implementation project(':code:common:db') + implementation project(':code:common:config') implementation project(':code:common:process') implementation project(':code:libraries:big-string') implementation project(':code:api:index-api') @@ -33,6 +34,7 @@ dependencies { implementation libs.jwarc implementation libs.gson implementation libs.commons.io + implementation libs.commons.lang3 implementation libs.okhttp3 implementation libs.jsoup implementation libs.snakeyaml diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index edfbc6b1..9245156f 100644 --- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -1,9 +1,11 @@ package nu.marginalia.crawling.parquet; import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.UserAgent; import nu.marginalia.crawling.body.DocumentBodyExtractor; import nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.crawling.body.HttpFetchResult; +import org.apache.commons.lang3.StringUtils; import org.netpreserve.jwarc.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -12,24 +14,35 @@ import java.io.IOException; import java.net.URI; import java.nio.file.Path; import java.time.Instant; +import java.util.List; +import java.util.Objects; public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { private final ParquetWriter writer; private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class); - public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) { + public static void convertWarc(String domain, + UserAgent userAgent, + Path warcInputFile, + Path parquetOutputFile) { try (var warcReader = new WarcReader(warcInputFile); var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile) ) { WarcXResponseReference.register(warcReader); WarcXEntityRefused.register(warcReader); + String uaString = userAgent.uaString(); + for (var record : warcReader) { if (record instanceof WarcResponse response) { // this also captures WarcXResponseReference, which inherits from WarcResponse // and is used to store old responses from previous crawls; in this part of the logic // we treat them the same as a normal response + if (!filterResponse(uaString, response)) { + continue; + } + parquetWriter.write(domain, response); } else if (record instanceof WarcXEntityRefused 
refused) { @@ -45,6 +58,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } + /** Return true if the WarcResponse should be kept and converted, false if it should be skipped */ + private static boolean filterResponse(String uaString, WarcResponse response) throws IOException { + + // We don't want to store robots.txt files, as they are not + // interesting for the analysis we want to do. This is important + // since txt-files in general are interesting, and we don't want to + // exclude them as a class. + + if (response.targetURI().getPath().equals("/robots.txt")) { + return false; + } + + var robotsTags = response.http().headers().all("X-Robots-Tag"); + if (!isXRobotsTagsPermitted(robotsTags, uaString)) { + return false; + } + + return true; + } + private void write(String domain, WarcXEntityRefused refused) throws IOException { URI profile = refused.profile(); @@ -98,15 +131,6 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return; } - // We don't want to store robots.txt files, as they are not - // interesting for the analysis we want to do. This is important - // since txt-files in general are interesting, and we don't want to - // exclude them as a class. - - if (fetchOk.uri().getPath().equals("/robots.txt")) { - return; - } - byte[] bodyBytes; String contentType; @@ -172,4 +196,52 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { new byte[0] ); } + + + /** Check the X-Robots-Tag header to see if we are allowed to index this page. + *

+ * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag + * + * @param xRobotsHeaderTags List of X-Robots-Tag values + * @param userAgent User agent string + * @return true if we are allowed to index this page + */ + // Visible for tests + public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { + boolean isPermittedGeneral = true; + boolean isPermittedMarginalia = false; + boolean isForbiddenMarginalia = false; + + for (String header : xRobotsHeaderTags) { + if (header.indexOf(':') >= 0) { + String[] parts = StringUtils.split(header, ":", 2); + + if (parts.length < 2) + continue; + + // Is this relevant to us? + if (!Objects.equals(parts[0].trim(), userAgent)) + continue; + + if (parts[1].contains("noindex")) + isForbiddenMarginalia = true; + else if (parts[1].contains("none")) + isForbiddenMarginalia = true; + else if (parts[1].contains("all")) + isPermittedMarginalia = true; + } + else { + if (header.contains("noindex")) + isPermittedGeneral = false; + if (header.contains("none")) + isPermittedGeneral = false; + } + } + + if (isPermittedMarginalia) + return true; + if (isForbiddenMarginalia) + return false; + return isPermittedGeneral; + } } diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index 266670fd..535eac31 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -3,6 +3,7 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; import lombok.SneakyThrows; +import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DomainProcessor; @@ -268,7 +269,9 @@ public class CrawlingThenConvertingIntegrationTest { new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch(); } - CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2); + CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, + new UserAgent("test"), + fileName, fileName2); try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) { while (reader.hasNext()) { diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java index 658acfbe..c3864868 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/CrawlerMain.java @@ -51,6 +51,7 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX; public class CrawlerMain { private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class); + private final UserAgent userAgent; private final ProcessHeartbeatImpl heartbeat; private final MessageQueueFactory messageQueueFactory; private final DomainProber domainProber; @@ -78,6 +79,7 @@ public class CrawlerMain { DbCrawlSpecProvider dbCrawlSpecProvider, AnchorTagsSourceFactory anchorTagsSourceFactory, Gson gson) { + this.userAgent = userAgent; this.heartbeat = heartbeat; this.messageQueueFactory = 
messageQueueFactory; this.domainProber = domainProber; @@ -245,7 +247,7 @@ public class CrawlerMain { reference.delete(); CrawledDocumentParquetRecordFileWriter - .convertWarc(domain, newWarcFile, parquetFile); + .convertWarc(domain, userAgent, newWarcFile, parquetFile); workLog.setJobToFinished(domain, parquetFile.toString(), size); heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks); diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index cc4a195d..ef6b48cb 100644 --- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -192,54 +192,6 @@ public class HttpFetcherImpl implements HttpFetcher { return new HttpFetchResult.ResultNone(); } - /** Check X-Robots-Tag header tag to see if we are allowed to index this page. - *

- * Reference: https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag - * - * @param xRobotsHeaderTags List of X-Robots-Tag values - * @param userAgent User agent string - * @return true if we are allowed to index this page - */ - // Visible for tests - public static boolean isXRobotsTagsPermitted(List xRobotsHeaderTags, String userAgent) { - boolean isPermittedGeneral = true; - boolean isPermittedMarginalia = false; - boolean isForbiddenMarginalia = false; - - for (String header : xRobotsHeaderTags) { - if (header.indexOf(':') >= 0) { - String[] parts = StringUtils.split(header, ":", 2); - - if (parts.length < 2) - continue; - - // Is this relevant to us? - if (!Objects.equals(parts[0].trim(), userAgent)) - continue; - - if (parts[1].contains("noindex")) - isForbiddenMarginalia = true; - else if (parts[1].contains("none")) - isForbiddenMarginalia = true; - else if (parts[1].contains("all")) - isPermittedMarginalia = true; - } - else { - if (header.contains("noindex")) - isPermittedGeneral = false; - if (header.contains("none")) - isPermittedGeneral = false; - } - } - - if (isPermittedMarginalia) - return true; - if (isForbiddenMarginalia) - return false; - return isPermittedGeneral; - } - - @Override public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) { return fetchRobotsForProto("https", recorder, domain) diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java index 27b55760..e5673a6a 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImplTest.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher; +import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import org.junit.jupiter.api.Test; import java.util.List; @@ -7,30 +8,30 @@ import java.util.List; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -class HttpFetcherImplTest { +class CrawledDocumentParquetRecordFileWriterTest { @Test public void testXRobotsTag() { - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); - 
assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); - assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); - assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu")); + assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu")); + assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu")); } } \ No newline at end of file diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java 
b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index 0375f5cb..cdc10bd2 100644 --- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher; +import nu.marginalia.UserAgent; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; @@ -19,7 +20,6 @@ import java.nio.file.Path; import java.security.NoSuchAlgorithmException; import java.util.HashMap; import java.util.Map; -import java.util.zip.GZIPInputStream; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -130,7 +130,11 @@ class WarcRecorderTest { .get().build()); client.close(); - CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu", fileNameWarc, fileNameParquet); + CrawledDocumentParquetRecordFileWriter.convertWarc( + "www.marginalia.nu", + new UserAgent("test"), + fileNameWarc, + fileNameParquet); var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); assertEquals(3, urls.size());