mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 13:19:02 +00:00
![Viktor Lofgren](s3://crabby-images/c765d/c765d5283f4176ac41b612e7ae83ed62e7ddf9a1)
This is a first step toward using WARC as an intermediate, flight-recorder-style step in the crawler, ultimately aimed at being able to resume crawls if the crawler is restarted. This component is currently not hooked into anything. The OkHttp3 client wrapper class 'WarcRecordingFetcherClient' was implemented for web archiving; it allows HTTP requests and responses to be recorded. New classes were introduced: 'WarcDigestBuilder', 'IpInterceptingNetworkInterceptor', and 'WarcProtocolReconstructor'. The JWarc dependency was added to the build.gradle file, and relevant unit tests were also introduced. Some HttpFetcher-adjacent structural changes were also made for better organization.
69 lines
2.1 KiB
Groovy
69 lines
2.1 KiB
Groovy
// Gradle plugins applied to this module.
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}
|
|
|
|
// Java toolchain configuration: requests a Java 21 toolchain for this module.
java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(21))
    }
}
|
|
|
|
// Settings for the 'application' plugin: the entry-point class and the
// name given to the generated application.
application {
    mainClass = 'nu.marginalia.crawl.CrawlerMain'
    applicationName = 'crawler-process'
}
|
|
|
|
// Turn the distZip task off so it is skipped during builds.
tasks.distZip.setEnabled(false)
|
|
|
|
// Dependencies of the crawler process module.
dependencies {
    // Sibling modules from this multi-project build.
    implementation project(':code:common:process')

    implementation project(':code:common:db')
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:libraries:big-string')
    implementation project(':code:libraries:blocking-thread-pool')
    implementation project(':code:api:index-api')
    implementation project(':code:api:process-mqapi')
    implementation project(':code:common:service-discovery')
    implementation project(':code:common:service-client')
    implementation project(':code:libraries:message-queue')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:process-models:crawl-spec')


    // Feature modules (conversion and crawling helpers).
    implementation project(':code:features-convert:anchor-keywords')
    implementation project(':code:features-crawl:crawl-blocklist')
    implementation project(':code:features-crawl:link-parser')
    implementation project(':code:features-crawl:content-type')

    // External libraries, referenced through the `libs` accessor
    // (presumably the project's version catalog -- confirm in settings).
    implementation libs.bundles.slf4j

    implementation libs.notnull
    implementation libs.guice
    implementation libs.gson
    implementation libs.zstd
    implementation libs.jwarc
    implementation libs.crawlercommons
    implementation libs.okhttp3
    implementation libs.jsoup
    implementation libs.opencsv
    implementation libs.rxjava
    implementation libs.fastutil

    implementation libs.bundles.mariadb

    // Test-only dependencies.
    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation project(':code:processes:test-data')
}
|
|
|