MarginaliaSearch/code/processes/crawling-process/build.gradle
Viktor Lofgren 072b5fcd12 Implement Warc-recording wrapper for OkHttp3 client
This is a first step toward using WARC as an intermediate, flight-recorder-style stage in the crawler, ultimately aimed at being able to resume crawls if the crawler is restarted. This component is not yet hooked into anything.

The OkHttp3 client wrapper class 'WarcRecordingFetcherClient' was implemented for web archiving. This allows for the recording of HTTP requests and responses. New classes were introduced, 'WarcDigestBuilder', 'IpInterceptingNetworkInterceptor', and 'WarcProtocolReconstructor'.

The JWarc dependency was added to the build.gradle file, and relevant unit tests were also introduced. Some HttpFetcher-adjacent structural changes were also made for better organization.
2023-12-08 13:49:16 +01:00

69 lines
2.1 KiB
Groovy

// Gradle plugins applied to this module: Java compilation, the runnable
// application packaging configured further down, and the JVM test suite plugin.
plugins {
    id("java")
    id("application")
    id("jvm-test-suite")
}
// Pin the Java toolchain used for this module to language version 21.
java {
    toolchain {
        languageVersion = JavaLanguageVersion.of(21)
    }
}
// Application plugin settings: the name given to the packaged application
// and the fully qualified class whose main method is its entry point.
application {
    applicationName = 'crawler-process'
    mainClass.set('nu.marginalia.crawl.CrawlerMain')
}
// Disable the zip distribution task for this module. The task is looked up
// lazily with tasks.named so it is not created at configuration time merely
// to flip this flag; it is only configured if something actually requires it.
tasks.named('distZip') {
    enabled = false
}
// Dependencies of the crawler process. Declaration order is kept as-is so the
// resulting classpath ordering does not change.
dependencies {
    // Modules from this repository, referenced by Gradle project path.
    implementation(project(':code:common:process'))
    implementation(project(':code:common:db'))
    implementation(project(':code:common:model'))
    implementation(project(':code:common:config'))
    implementation(project(':code:common:service'))
    implementation(project(':code:libraries:big-string'))
    implementation(project(':code:libraries:blocking-thread-pool'))
    implementation(project(':code:api:index-api'))
    implementation(project(':code:api:process-mqapi'))
    implementation(project(':code:common:service-discovery'))
    implementation(project(':code:common:service-client'))
    implementation(project(':code:libraries:message-queue'))
    implementation(project(':code:libraries:language-processing'))
    implementation(project(':code:libraries:easy-lsh'))
    implementation(project(':code:process-models:crawling-model'))
    implementation(project(':code:process-models:crawl-spec'))
    implementation(project(':code:features-convert:anchor-keywords'))
    implementation(project(':code:features-crawl:crawl-blocklist'))
    implementation(project(':code:features-crawl:link-parser'))
    implementation(project(':code:features-crawl:content-type'))

    // External libraries, referenced through the `libs` accessor
    // (presumably the project's version catalog; defined outside this file).
    implementation(libs.bundles.slf4j)
    implementation(libs.notnull)
    implementation(libs.guice)
    implementation(libs.gson)
    implementation(libs.zstd)
    implementation(libs.jwarc)
    implementation(libs.crawlercommons)
    implementation(libs.okhttp3)
    implementation(libs.jsoup)
    implementation(libs.opencsv)
    implementation(libs.rxjava)
    implementation(libs.fastutil)
    implementation(libs.bundles.mariadb)

    // Test-only dependencies.
    testImplementation(libs.bundles.slf4j.test)
    testImplementation(libs.bundles.junit)
    testImplementation(libs.mockito)
    testImplementation(project(':code:processes:test-data'))
}