Merge pull request #150 from MarginaliaSearch/httpclient-in-crawler

Reduce the use of third-party code in the crawler: replace OkHttp with the JDK's built-in java.net.http.HttpClient.
Viktor 2025-01-20 19:35:30 +01:00 committed by GitHub
commit 2c67f50a43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
27 changed files with 433 additions and 468 deletions
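
The change set below swaps OkHttp for the JDK's built-in client throughout the crawler. For orientation, a minimal, self-contained sketch of the java.net.http request pattern the new code is built around; the URL and user agent are placeholder values, and the builder options mirror the ones configured in HttpFetcherImpl further down.

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;

class HttpClientSketch {
    public static void main(String[] args) throws Exception {
        // Build once, reuse for many requests; redirects follow the same
        // NORMAL policy the fetcher below configures.
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
                .build();

        HttpRequest request = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.marginalia.nu/"))
                .header("User-agent", "test.marginalia.nu")
                .timeout(Duration.ofSeconds(10))
                .build();

        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode());
    }
}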

View File

@ -106,11 +106,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return false;
var url = new EdgeUrl(warcResponse.target());
if (!Objects.equals(url.getDomain(), domain)) {
return false;
}
return true;
return Objects.equals(url.getDomain(), domain);
} catch (Exception e) {
logger.warn("Failed to process response", e);
}

View File

@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -200,23 +201,23 @@ public class CrawlingThenConvertingIntegrationTest {
@Test
public void crawlRobotsTxt() throws Exception {
var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
List.of("https://search.marginalia.nu/search?q=hello+world")
var specs = new CrawlerMain.CrawlSpecRecord("marginalia-search.com", 5,
List.of("https://marginalia-search.com/search?q=hello+world")
);
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("search.marginalia.nu", domain.domain);
assertEquals("marginalia-search.com", domain.domain);
Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
assertTrue(allUrls.contains("https://marginalia-search.com/search"), "We expect a record for entities that are forbidden");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
assertEquals(new EdgeDomain("marginalia-search.com"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
for (var doc : output.documents) {
@ -246,7 +247,7 @@ public class CrawlingThenConvertingIntegrationTest {
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName);
try (var recorder = new WarcRecorder(fileName, new Cookies());
var db = new DomainStateDb(dbTempFile))
{
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();

View File

@ -55,7 +55,6 @@ dependencies {
implementation libs.zstd
implementation libs.jwarc
implementation libs.crawlercommons
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.opencsv
implementation libs.fastutil

View File

@ -33,8 +33,6 @@ import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.util.SimpleBlockingThreadPool;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -85,6 +83,7 @@ public class CrawlerMain extends ProcessMainClass {
@Inject
public CrawlerMain(UserAgent userAgent,
HttpFetcherImpl httpFetcher,
ProcessHeartbeatImpl heartbeat,
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
FileStorageService fileStorageService,
@ -98,6 +97,7 @@ public class CrawlerMain extends ProcessMainClass {
super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
this.userAgent = userAgent;
this.fetcher = httpFetcher;
this.heartbeat = heartbeat;
this.domainProber = domainProber;
this.fileStorageService = fileStorageService;
@ -111,10 +111,6 @@ public class CrawlerMain extends ProcessMainClass {
Integer.getInteger("crawler.poolSize", 256),
1);
fetcher = new HttpFetcherImpl(userAgent,
new Dispatcher(),
new ConnectionPool(5, 10, TimeUnit.SECONDS)
);
// Wait for the blacklist to be loaded before starting the crawl
blacklist.waitUntilLoaded();
@ -132,6 +128,10 @@ public class CrawlerMain extends ProcessMainClass {
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
// Set the maximum number of connections to keep alive in the connection pool
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
// We don't want to use too much memory caching sessions for https
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
@ -364,9 +364,9 @@ public class CrawlerMain extends ProcessMainClass {
Files.deleteIfExists(tempFile);
}
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
CrawlDataReference reference = getReference();
CrawlDataReference reference = getReference()
)
{
// Resume the crawl if it was aborted
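
A small sketch of the startup ordering this implies: the properties are set before the first HttpClient is constructed, since the JDK typically reads them only once. The property names are the ones set in this commit; everything else is illustrative.

import java.net.http.HttpClient;

class CrawlerHttpBootstrapSketch {
    public static void main(String[] args) {
        // Set before any HttpClient exists in the process.
        System.setProperty("jdk.httpclient.idleTimeout", "15");        // seconds
        System.setProperty("jdk.httpclient.connectionPoolSize", "256");

        HttpClient client = HttpClient.newBuilder().build();
        System.out.println(client.version()); // client built after the pool settings are in place
    }
}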

View File

@ -1,6 +1,6 @@
package nu.marginalia.crawl.fetcher;
import okhttp3.Request;
import java.net.http.HttpRequest;
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {
@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
}
/** Paints the tags onto the request builder. */
public void paint(Request.Builder getBuilder) {
public void paint(HttpRequest.Builder getBuilder) {
if (etag != null) {
getBuilder.addHeader("If-None-Match", etag);
getBuilder.header("If-None-Match", etag);
}
if (lastMod != null) {
getBuilder.addHeader("If-Modified-Since", lastMod);
getBuilder.header("If-Modified-Since", lastMod);
}
}
}
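
A short usage sketch of the reworked ContentTags against HttpRequest.Builder; the ETag and date values are invented, only the header names come from the record above.

import java.net.URI;
import java.net.http.HttpRequest;
import java.time.Duration;

class ConditionalGetSketch {
    // Hypothetical caller: builds a conditional GET for a previously seen resource.
    static HttpRequest conditionalGet(ContentTags tags) {
        var builder = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.marginalia.nu/"))
                .timeout(Duration.ofSeconds(10));

        tags.paint(builder); // adds If-None-Match / If-Modified-Since only for non-null values

        return builder.build();
    }

    // e.g. conditionalGet(new ContentTags("\"33a64df5\"", "Sat, 04 Jan 2025 10:00:00 GMT"))
}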

View File

@ -1,33 +1,14 @@
package nu.marginalia.crawl.fetcher;
import okhttp3.Cookie;
import okhttp3.CookieJar;
import okhttp3.HttpUrl;
import java.util.Collections;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class Cookies {
final ThreadLocal<ConcurrentHashMap<String, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public CookieJar getJar() {
return new CookieJar() {
@Override
public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
if (!cookies.isEmpty()) {
cookieJar.get().put(url.host(), cookies);
}
}
@Override
public List<Cookie> loadForRequest(HttpUrl url) {
return cookieJar.get().getOrDefault(url.host(), Collections.emptyList());
}
};
}
public class Cookies extends CookieHandler {
final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public void clear() {
cookieJar.get().clear();
@ -38,6 +19,16 @@ public class Cookies {
}
public List<String> getCookies() {
return cookieJar.get().values().stream().flatMap(List::stream).map(Cookie::toString).toList();
return cookieJar.get().values().stream().flatMap(List::stream).toList();
}
@Override
public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
return cookieJar.get();
}
@Override
public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
cookieJar.get().putAll(responseHeaders);
}
}
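
The rewritten cookie jar plugs into the client rather than into each request; createClient() further down registers it the same way. A minimal sketch of the wiring:

// The JDK client drives the handler itself: get() is consulted before each
// request and put() is called with each response's headers. The crawler can
// then ask the jar whether the site set cookies, as WarcRecorder does below.
var cookies = new Cookies();

var client = java.net.http.HttpClient.newBuilder()
        .cookieHandler(cookies)
        .build();

// later: cookies.hasCookies(), cookies.getCookies(), cookies.clear()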

View File

@ -3,6 +3,7 @@ package nu.marginalia.crawl.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -11,10 +12,10 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import java.util.List;
@ImplementedBy(HttpFetcherImpl.class)
public interface HttpFetcher {
public interface HttpFetcher extends AutoCloseable {
void setAllowAllContentTypes(boolean allowAllContentTypes);
List<String> getCookies();
Cookies getCookies();
void clearCookies();
DomainProbeResult probeDomain(EdgeUrl url);
@ -27,7 +28,9 @@ public interface HttpFetcher {
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;
ProbeType probeType) throws Exception;
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
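
Since the interface now extends AutoCloseable, callers can scope the fetcher, and with it the underlying HttpClient, using try-with-resources. A sketch with a placeholder user agent:

// close() is inherited from AutoCloseable and may throw, hence throws Exception.
static void probeSketch() throws Exception {
    try (HttpFetcher fetcher = new HttpFetcherImpl("test.marginalia.nu")) {
        var result = fetcher.probeDomain(EdgeUrl.parse("https://www.marginalia.nu/").orElseThrow());
        System.out.println(result);
    }
}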

View File

@ -1,35 +1,41 @@
package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.io.InterruptedIOException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpTimeoutException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.zip.GZIPInputStream;
@Singleton
public class HttpFetcherImpl implements HttpFetcher {
private final Logger logger = LoggerFactory.getLogger(getClass());
@ -40,39 +46,28 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10);
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
}
private final OkHttpClient client;
private final HttpClient client;
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder();
if (dispatcher != null) {
builder.dispatcher(dispatcher);
}
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.socketFactory(ftSocketFactory)
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.connectionPool(pool)
.cookieJar(cookies.getJar())
.followRedirects(true)
.followSslRedirects(true)
.connectTimeout(8, TimeUnit.SECONDS)
.readTimeout(10, TimeUnit.SECONDS)
.writeTimeout(10, TimeUnit.SECONDS)
private HttpClient createClient() {
return HttpClient.newBuilder()
.sslContext(NoSecuritySSL.buildSslContext())
.cookieHandler(cookies)
.followRedirects(HttpClient.Redirect.NORMAL)
.connectTimeout(Duration.ofSeconds(8))
.executor(Executors.newCachedThreadPool())
.build();
}
@Override
public List<String> getCookies() {
return cookies.getCookies();
public Cookies getCookies() {
return cookies;
}
@Override
@ -81,26 +76,24 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
public HttpFetcherImpl(UserAgent userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
public HttpFetcherImpl(UserAgent userAgent)
{
this.client = createClient(dispatcher, connectionPool);
this.client = createClient();
this.userAgentString = userAgent.uaString();
this.userAgentIdentifier = userAgent.uaIdentifier();
}
public HttpFetcherImpl(String userAgent) {
this.client = createClient(null, new ConnectionPool());
this.client = createClient();
this.userAgentString = userAgent;
this.userAgentIdentifier = userAgent;
}
// Not necessary in prod, but useful in test
public void close() {
client.dispatcher().executorService().shutdown();
client.connectionPool().evictAll();
client.close();
}
/**
* Probe the domain to see if it is reachable, attempting to identify which schema to use,
* and if there are any redirects. This is done by one or more HEAD requests.
@ -110,19 +103,26 @@ public class HttpFetcherImpl implements HttpFetcher {
*/
@Override
public DomainProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
HttpRequest head;
try {
head = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.build();
var call = client.newCall(head);
try (var rsp = call.execute()) {
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(requestUrl.domain, url.domain)) {
return new DomainProbeResult.Redirect(requestUrl.domain);
} catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
}
return new DomainProbeResult.Ok(requestUrl);
try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
if (!Objects.equals(rspUri.domain, url.domain)) {
return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
@ -140,21 +140,25 @@ public class HttpFetcherImpl implements HttpFetcher {
WarcRecorder warcRecorder,
ContentTags tags) throws RateLimitException {
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.url(url.toString());
var head = headBuilder.build();
var call = client.newCall(head);
try {
var headBuilder = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("Accept-Encoding", "gzip")
.timeout(requestTimeout)
;
try (var rsp = call.execute()) {
var contentTypeHeader = rsp.header("Content-type");
var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
var headers = rsp.headers();
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.code());
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
}
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing
@ -168,27 +172,27 @@ public class HttpFetcherImpl implements HttpFetcher {
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
var redirectUrl = new EdgeUrl(rsp.uri());
EdgeUrl ret;
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
else ret = url;
// Intercept rate limiting
if (rsp.code() == 429) {
throw new HttpFetcherImpl.RateLimitException(Objects.requireNonNullElse(rsp.header("Retry-After"), "1"));
if (rsp.statusCode() == 429) {
throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
}
return new ContentTypeProbeResult.Ok(ret);
}
catch (HttpTimeoutException ex) {
warcRecorder.flagAsTimeout(url);
return new ContentTypeProbeResult.Timeout(ex);
}
catch (RateLimitException ex) {
throw ex;
}
catch (InterruptedIOException ex) {
warcRecorder.flagAsTimeout(url);
return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) {
catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
warcRecorder.flagAsError(url, ex);
@ -210,13 +214,15 @@ public class HttpFetcherImpl implements HttpFetcher {
ProbeType probeType)
throws Exception
{
var getBuilder = new Request.Builder().get();
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept-Language", "en,*;q=0.5")
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
.addHeader("User-agent", userAgentString);
var getBuilder = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("Accept-Encoding", "gzip")
.header("Accept-Language", "en,*;q=0.5")
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
.timeout(requestTimeout)
;
contentTags.paint(getBuilder);
@ -242,6 +248,122 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
List<EdgeUrl> ret = new ArrayList<>();
Set<String> seenUrls = new HashSet<>();
Set<String> seenSitemaps = new HashSet<>();
Deque<EdgeUrl> sitemapQueue = new LinkedList<>();
EdgeUrl rootSitemapUrl = new EdgeUrl(root);
sitemapQueue.add(rootSitemapUrl);
int fetchedSitemaps = 0;
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
if (seenUrls.add(url)) {
EdgeUrl.parse(url)
.filter(u -> u.domain.equals(rootSitemapUrl.domain))
.ifPresent(ret::add);
}
}
}
case SitemapResult.SitemapReferences(List<String> refs) -> {
for (var ref : refs) {
if (seenSitemaps.add(ref)) {
EdgeUrl.parse(ref)
.filter(url -> url.domain.equals(rootSitemapUrl.domain))
.ifPresent(sitemapQueue::addFirst);
}
}
}
case SitemapResult.SitemapError() -> {}
}
delayTimer.waitFetchDelay();
}
return ret;
}
catch (Exception ex) {
logger.error("Error while fetching sitemaps via " + root, ex);
return List.of();
}
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.build();
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
if (response.statusCode() != 200) {
return new SitemapResult.SitemapError();
}
try (InputStream inputStream = response.body()) {
InputStream parserStream;
if (sitemapUrl.path.endsWith(".gz")) {
parserStream = new GZIPInputStream(inputStream);
}
else {
parserStream = inputStream;
}
Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
}
}
private sealed interface SitemapResult {
record SitemapUrls(List<String> urls) implements SitemapResult {}
record SitemapReferences(List<String> sitemapRefs) implements SitemapResult {}
record SitemapError() implements SitemapResult {}
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
var ret = fetchAndParseRobotsTxt(new EdgeUrl("https", domain, null, "/robots.txt", null), recorder);
@ -257,14 +379,15 @@ public class HttpFetcherImpl implements HttpFetcher {
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
try {
var getBuilder = new Request.Builder().get();
var getRequest = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.timeout(requestTimeout);
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-agent", userAgentString);
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
HttpFetchResult result = recorder.fetch(client, getRequest.build());
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
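
The fetchContent hunk above is cut off after contentTags.paint(getBuilder); judging by the robots.txt path at the end of this file, the painted request is presumably handed to the WarcRecorder the same way. A hedged reconstruction, not part of the diff:

contentTags.paint(getBuilder);

// assumption: fetchContent records the exchange via the same WarcRecorder.fetch
// entry point that fetchAndParseRobotsTxt uses above
HttpFetchResult result = warcRecorder.fetch(client, getBuilder.build());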

View File

@ -1,31 +0,0 @@
package nu.marginalia.crawl.fetcher.socket;
import okhttp3.Interceptor;
import okhttp3.Response;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
/** An interceptor that intercepts network requests and adds the remote IP address as
* a header in the response. This is used to pass the remote IP address to the Warc
* writer, as this information is not available in the response.
*/
public class IpInterceptingNetworkInterceptor implements Interceptor {
private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";
@NotNull
@Override
public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
String IP = chain.connection().socket().getInetAddress().getHostAddress();
return chain.proceed(chain.request())
.newBuilder()
.addHeader(pseudoHeaderName, IP)
.build();
}
public static String getIpFromResponse(Response response) {
return response.header(pseudoHeaderName);
}
}
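
With the OkHttp network interceptor deleted, the remote IP is no longer read off the socket. WarcRecorder further down instead resolves the host of the final response URI when writing the record; in isolation:

import java.net.InetAddress;
import java.net.URI;

class RemoteIpSketch {
    public static void main(String[] args) throws Exception {
        // Mirrors the replacement in WarcRecorder: resolve the response host again,
        // rather than capturing the address of the socket the bytes arrived on.
        URI responseUri = URI.create("https://www.marginalia.nu/");
        InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
        System.out.println(inetAddress.getHostAddress()); // value stored in the WARC response record
    }
}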

View File

@ -27,7 +27,7 @@ public class NoSecuritySSL {
}
};
public static SSLSocketFactory buildSocketFactory() {
public static SSLContext buildSslContext() {
try {
// Install the all-trusting trust manager
final SSLContext sslContext = SSLContext.getInstance("TLS");
@ -40,14 +40,11 @@ public class NoSecuritySSL {
clientSessionContext.setSessionCacheSize(2048);
// Create a ssl socket factory with our all-trusting manager
return sslContext.getSocketFactory();
return sslContext;
}
catch (Exception e) {
throw new RuntimeException(e);
}
}
public static HostnameVerifier buildHostnameVerifyer() {
return (hn, session) -> true;
}
}

View File

@ -1,14 +1,14 @@
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Headers;
import okhttp3.Response;
import org.apache.commons.io.input.BOMInputStream;
import org.netpreserve.jwarc.WarcTruncationReason;
import java.io.*;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/** Input buffer for temporary storage of a HTTP response
@ -17,8 +17,9 @@ import java.util.zip.GZIPInputStream;
* */
public abstract class WarcInputBuffer implements AutoCloseable {
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
protected Headers headers;
WarcInputBuffer(Headers headers) {
protected HttpHeaders headers;
WarcInputBuffer(HttpHeaders headers) {
this.headers = headers;
}
@ -30,7 +31,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
public final WarcTruncationReason truncationReason() { return truncationReason; }
public final Headers headers() { return headers; }
public final HttpHeaders headers() { return headers; }
/** Create a buffer for a response.
* If the response is small and not compressed, it will be stored in memory.
@ -38,26 +39,27 @@ public abstract class WarcInputBuffer implements AutoCloseable {
* and suppressed from the headers.
* If an error occurs, a buffer will be created with no content and an error status.
*/
static WarcInputBuffer forResponse(Response rsp) {
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
if (rsp == null)
return new ErrorBuffer();
try {
String contentLengthHeader = Objects.requireNonNullElse(rsp.header("Content-Length"), "-1");
int contentLength = Integer.parseInt(contentLengthHeader);
String contentEncoding = rsp.header("Content-Encoding");
var headers = rsp.headers();
try (var is = rsp.body()) {
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
// If the content is small and not compressed, we can just read it into memory
return new MemoryBuffer(rsp, contentLength);
return new MemoryBuffer(headers, is, contentLength);
}
else {
// Otherwise, we unpack it into a file and read it from there
return new FileBuffer(rsp);
return new FileBuffer(headers, is);
}
}
catch (Exception ex) {
return new ErrorBuffer(rsp);
return new ErrorBuffer();
}
}
@ -99,12 +101,8 @@ public abstract class WarcInputBuffer implements AutoCloseable {
/** Pseudo-buffer for when we have an error */
class ErrorBuffer extends WarcInputBuffer {
public ErrorBuffer() {
super(Headers.of());
truncationReason = WarcTruncationReason.UNSPECIFIED;
}
super(HttpHeaders.of(Map.of(), (k,v)->false));
public ErrorBuffer(Response rsp) {
super(rsp.headers());
truncationReason = WarcTruncationReason.UNSPECIFIED;
}
@ -125,12 +123,12 @@ class ErrorBuffer extends WarcInputBuffer {
/** Buffer for when we have the response in memory */
class MemoryBuffer extends WarcInputBuffer {
byte[] data;
public MemoryBuffer(Response response, int size) {
super(response.headers());
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
super(headers);
var outputStream = new ByteArrayOutputStream(size);
copy(response.body().byteStream(), outputStream);
copy(responseStream, outputStream);
data = outputStream.toByteArray();
}
@ -154,19 +152,15 @@ class MemoryBuffer extends WarcInputBuffer {
class FileBuffer extends WarcInputBuffer {
private final Path tempFile;
public FileBuffer(Response response) throws IOException {
super(suppressContentEncoding(response.headers()));
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
super(suppressContentEncoding(headers));
this.tempFile = Files.createTempFile("rsp", ".html");
if (response.body() == null) {
truncationReason = WarcTruncationReason.DISCONNECT;
return;
}
if ("gzip".equals(response.header("Content-Encoding"))) {
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
try (var out = Files.newOutputStream(tempFile)) {
copy(new GZIPInputStream(response.body().byteStream()), out);
copy(new GZIPInputStream(responseStream), out);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;
@ -174,7 +168,7 @@ class FileBuffer extends WarcInputBuffer {
}
else {
try (var out = Files.newOutputStream(tempFile)) {
copy(response.body().byteStream(), out);
copy(responseStream, out);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;
@ -182,22 +176,13 @@ class FileBuffer extends WarcInputBuffer {
}
}
private static Headers suppressContentEncoding(Headers headers) {
var builder = new Headers.Builder();
headers.toMultimap().forEach((k, values) -> {
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
return HttpHeaders.of(headers.map(), (k, v) -> {
if ("Content-Encoding".equalsIgnoreCase(k)) {
return;
}
if ("Transfer-Encoding".equalsIgnoreCase(k)) {
return;
}
for (var value : values) {
builder.add(k, value);
return false;
}
return !"Transfer-Encoding".equalsIgnoreCase(k);
});
return builder.build();
}
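
The header suppression above leans on HttpHeaders.of taking a BiPredicate over (name, value) pairs, so no rebuild loop is needed. A self-contained illustration of that JDK API:

import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;

class HeaderFilterDemo {
    public static void main(String[] args) {
        HttpHeaders original = HttpHeaders.of(Map.of(
                "Content-Type", List.of("text/html"),
                "Content-Encoding", List.of("gzip")), (k, v) -> true);

        // Keep everything except the encoding headers, as suppressContentEncoding does.
        HttpHeaders filtered = HttpHeaders.of(original.map(), (k, v) ->
                !"Content-Encoding".equalsIgnoreCase(k)
             && !"Transfer-Encoding".equalsIgnoreCase(k));

        System.out.println(filtered.map()); // {Content-Type=[text/html]}
    }
}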

View File

@ -1,11 +1,12 @@
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Protocol;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;
@ -75,13 +76,13 @@ public class WarcProtocolReconstructor {
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
static String getResponseHeader(Response response, long size) {
String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
static String getResponseHeader(HttpResponse<?> response, long size) {
String version = response.version() == HttpClient.Version.HTTP_1_1 ? "1.1" : "2.0";
String statusCode = String.valueOf(response.code());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
String statusCode = String.valueOf(response.statusCode());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.statusCode(), "Unknown");
String headerString = getHeadersAsString(response, size);
String headerString = getHeadersAsString(response.headers(), size);
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
@ -148,10 +149,10 @@ public class WarcProtocolReconstructor {
return joiner.toString();
}
static private String getHeadersAsString(Response response, long responseSize) {
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
StringJoiner joiner = new StringJoiner("\r\n");
response.headers().toMultimap().forEach((k, values) -> {
headers.map().forEach((k, values) -> {
String headerCapitalized = capitalizeHeader(k);
// Omit pseudoheaders injected by the crawler itself
@ -179,8 +180,8 @@ public class WarcProtocolReconstructor {
return joiner.toString();
}
// okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake...
// okhttp gave us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake... (do we still need this, mr chesterton?)
static private String capitalizeHeader(String k) {
return Arrays.stream(StringUtils.split(k, '-'))
.map(StringUtils::capitalize)

View File

@ -1,13 +1,11 @@
package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
@ -18,6 +16,7 @@ import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
@ -28,7 +27,7 @@ import java.util.*;
/** Based on JWarc's fetch method, APL 2.0 license
* <p></p>
* This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
* This class wraps HttpClient and records the HTTP request and response in a WARC file,
* as best is possible given not all the data is available at the same time and needs to
* be reconstructed.
*/
@ -48,20 +47,22 @@ public class WarcRecorder implements AutoCloseable {
// Affix a version string in case we need to change the format in the future
// in some way
private final String warcRecorderVersion = "1.0";
// We need to know if the site uses cookies so this can be reported among the search results
// -- flip this to true if we see any cookies. This information will also be painted on any
// revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
private final Cookies cookies;
/**
* Create a new WarcRecorder that will write to the given file
*
* @param warcFile The file to write to
*/
public WarcRecorder(Path warcFile) throws IOException {
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = fetcher.getCookies();
}
public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = cookies;
}
/**
@ -71,38 +72,41 @@ public class WarcRecorder implements AutoCloseable {
public WarcRecorder() throws IOException {
this.warcFile = Files.createTempFile("warc", ".warc.gz");
this.writer = new WarcWriter(this.warcFile);
this.cookies = new Cookies();
temporaryFile = true;
}
public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
IOException,
URISyntaxException,
InterruptedException
public HttpFetchResult fetch(HttpClient client,
java.net.http.HttpRequest request)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
URI requestUri = request.url().uri();
URI requestUri = request.uri();
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
String ip;
Instant date = Instant.now();
var call = client.newCall(request);
var response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
cookieInformation.update(client, request.url());
Map<String, List<String>> extraHeaders = new HashMap<>();
try (var response = call.execute();
WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
extraHeaders.putAll(request.headers().map());
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
{
if (cookies.hasCookies()) {
extraHeaders.put("X-Has-Cookies", List.of("1"));
}
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
InputStream inputStream = inputBuffer.read();
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
responseDataBuffer.put(responseHeaders);
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
@ -125,17 +129,15 @@ public class WarcRecorder implements AutoCloseable {
// It looks like this might be the same as requestUri, but it's not;
// it's the URI after resolving redirects.
final URI responseUri = response.request().url().uri();
final URI responseUri = response.uri();
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.date(date)
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(responseBuilder);
if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
responseBuilder.ipAddress(inetAddress);
responseBuilder.payloadDigest(payloadDigestBuilder.build());
responseBuilder.truncated(inputBuffer.truncationReason());
@ -152,8 +154,8 @@ public class WarcRecorder implements AutoCloseable {
byte[] httpRequestString = WarcProtocolReconstructor
.getHttpRequestString(
response.request().method(),
response.request().headers().toMultimap(),
request.headers().toMultimap(),
response.request().headers().map(),
extraHeaders,
requestUri)
.getBytes();
@ -171,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048
&& !request.url().encodedPath().endsWith("robots.txt")) // don't bail on robots.txt
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
{
// Fast detection and mitigation of crawler traps that respond with slow
// small responses, with a high branching factor
@ -189,9 +191,9 @@ public class WarcRecorder implements AutoCloseable {
}
return new HttpFetchResult.ResultOk(responseUri,
response.code(),
response.statusCode(),
inputBuffer.headers(),
ip,
inetAddress.getHostAddress(),
responseDataBuffer.data,
dataStart,
responseDataBuffer.length() - dataStart);
@ -267,7 +269,9 @@ public class WarcRecorder implements AutoCloseable {
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(builder);
if (cookies.hasCookies()) {
builder.addHeader("X-Has-Cookies", "1");
}
var reference = builder.build();
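
Putting the new constructor and fetch signature together, roughly as the tests further down do; the WARC path is a placeholder, and the surrounding method would declare throws Exception for the recorder's checked exceptions.

var cookies = new Cookies();
var httpClient = HttpClient.newBuilder().cookieHandler(cookies).build();

try (var recorder = new WarcRecorder(Path.of("crawl-data.warc.gz"), cookies)) {
    recorder.fetch(httpClient, HttpRequest.newBuilder()
            .uri(URI.create("https://www.marginalia.nu/"))
            .header("User-agent", "test.marginalia.nu")
            .header("Accept-Encoding", "gzip")
            .GET().build());
}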

View File

@ -12,7 +12,6 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
@ -53,7 +52,6 @@ public class CrawlerRetreiver implements AutoCloseable {
private final WarcRecorder warcRecorder;
private final CrawlerRevisitor crawlerRevisitor;
private final SitemapFetcher sitemapFetcher;
int errorCount = 0;
public CrawlerRetreiver(HttpFetcher fetcher,
@ -71,7 +69,6 @@ public class CrawlerRetreiver implements AutoCloseable {
crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls(), specs.crawlDepth());
crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
// We must always crawl the index page first, this is assumed when fingerprinting the server
var fst = crawlFrontier.peek();
@ -145,9 +142,11 @@ public class CrawlerRetreiver implements AutoCloseable {
// Add external links to the crawl frontier
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
// Add links from the sitemap to the crawl frontier
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
// Fetch sitemaps
for (var sitemap : robotsRules.getSitemaps()) {
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
}
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()
@ -271,10 +270,7 @@ public class CrawlerRetreiver implements AutoCloseable {
}
// Download the sitemap if available
if (feedLink.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
timer.waitFetchDelay(0);
}
feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
// Grab the favicon if it exists
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());

View File

@ -1,72 +0,0 @@
package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
public class SitemapFetcher {
private final DomainCrawlFrontier crawlFrontier;
private final SitemapRetriever sitemapRetriever;
private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);
public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
this.crawlFrontier = crawlFrontier;
this.sitemapRetriever = sitemapRetriever;
}
public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
List<String> urls = robotsRules.getSitemaps();
if (urls.isEmpty()) {
urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
}
downloadSitemaps(urls);
}
public void downloadSitemaps(List<String> urls) {
Set<String> checkedSitemaps = new HashSet<>();
for (var rawUrl : urls) {
Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
if (parsedUrl.isEmpty()) {
continue;
}
EdgeUrl url = parsedUrl.get();
// Let's not download sitemaps from other domains for now
if (!crawlFrontier.isSameDomain(url)) {
continue;
}
if (checkedSitemaps.contains(url.path))
continue;
var sitemap = sitemapRetriever.fetchSitemap(url);
if (sitemap.isEmpty()) {
continue;
}
// ensure we don't try to download this sitemap again
// (don't move this up, as we may want to check the same
// path with different protocols until we find one that works)
checkedSitemaps.add(url.path);
crawlFrontier.addAllToQueue(sitemap);
}
logger.debug("Queue is now {}", crawlFrontier.queueSize());
}
}

View File

@ -36,7 +36,6 @@ dependencies {
implementation libs.gson
implementation libs.commons.io
implementation libs.commons.lang3
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.snakeyaml
implementation libs.zstd

View File

@ -1,17 +1,17 @@
package nu.marginalia.model.body;
import nu.marginalia.contenttype.ContentType;
import okhttp3.Headers;
import org.jetbrains.annotations.Nullable;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.netpreserve.jwarc.MessageHeaders;
import org.netpreserve.jwarc.WarcResponse;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.http.HttpHeaders;
import java.util.Optional;
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
@ -56,42 +56,26 @@ public sealed interface HttpFetchResult {
*/
record ResultOk(URI uri,
int statusCode,
Headers headers,
HttpHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength
) implements HttpFetchResult {
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
}
public boolean isOk() {
return statusCode >= 200 && statusCode < 300;
}
public ResultOk(URI uri,
int statusCode,
MessageHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength) {
this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
}
private static Headers convertHeaders(MessageHeaders headers) {
var ret = new Headers.Builder();
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
ret.add(header.getKey(), value);
}
}
return ret.build();
}
public InputStream getInputStream() {
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
}
public Optional<Document> parseDocument() throws IOException {
public Optional<Document> parseDocument() {
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
if (contentType.is("text/html")) {
return Optional.of(Jsoup.parse(body));
@ -102,8 +86,9 @@ public sealed interface HttpFetchResult {
});
}
@Nullable
public String header(String name) {
return headers.get(name);
return headers.firstValue(name).orElse(null);
}
}

View File

@ -165,27 +165,28 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
contentType = "";
}
String headersStr = null;
StringJoiner headersStrBuilder = new StringJoiner("\n");
for (var header : headers) {
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
headersStrBuilder.add(header.getKey() + ": " + value);
}
headersStr = headersStrBuilder.toString();
}
String headersStr = headersStrBuilder.toString();
write(new CrawledDocumentParquetRecord(
domain,
response.target(),
fetchOk.ipAddress(),
WarcXCookieInformationHeader.hasCookies(response),
headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
fetchOk.statusCode(),
response.date(),
contentType,
bodyBytes,
headersStr,
headers.get("ETag"),
headers.get("Last-Modified"))
);
headers.firstValue("ETag").orElse(null),
headers.firstValue("Last-Modified").orElse(null)
));
}

View File

@ -1,35 +0,0 @@
package org.netpreserve.jwarc;
import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;
/** Encapsulates out-of-band information about whether a website uses cookies,
* using a non-standard WARC header "X-Has-Cookies".
*/
public class WarcXCookieInformationHeader {
private boolean hasCookies = false;
private static final String headerName = "X-Has-Cookies";
public void update(OkHttpClient client, HttpUrl url) {
if (!hasCookies) {
hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
}
}
public boolean hasCookies() {
return hasCookies;
}
public void paint(WarcResponse.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public void paint(WarcXResponseReference.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public static boolean hasCookies(WarcRecord record) {
return record.headers().contains(headerName, "1");
}
}

View File

@ -1,11 +1,9 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -15,6 +13,8 @@ import org.netpreserve.jwarc.WarcResponse;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
@ -27,11 +27,10 @@ import static org.junit.jupiter.api.Assertions.fail;
class CrawlerWarcResynchronizerTest {
Path fileName;
Path outputFile;
OkHttpClient httpClient;
HttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = new OkHttpClient.Builder()
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
httpClient = HttpClient.newBuilder()
.build();
fileName = Files.createTempFile("test", ".warc.gz");
@ -46,7 +45,7 @@ class CrawlerWarcResynchronizerTest {
@Test
void run() throws IOException, URISyntaxException {
try (var oldRecorder = new WarcRecorder(fileName)) {
try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
fetchUrl(oldRecorder, "https://www.marginalia.nu/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@ -56,7 +55,7 @@ class CrawlerWarcResynchronizerTest {
var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);
try (var newRecorder = new WarcRecorder(outputFile)) {
try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
}
@ -79,10 +78,11 @@ class CrawlerWarcResynchronizerTest {
}
void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
var req = new Request.Builder().url(url)
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build();
var req = HttpRequest.newBuilder()
.uri(new java.net.URI(url))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build();
recorder.fetch(httpClient, req);
}
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -79,7 +80,7 @@ class ContentTypeProberTest {
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();
fetcher = new HttpFetcherImpl("test");
recorder = new WarcRecorder(warcFile);
recorder = new WarcRecorder(warcFile, new Cookies());
}
@AfterEach

View File

@ -2,13 +2,11 @@ package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -19,6 +17,8 @@ import org.netpreserve.jwarc.WarcXResponseReference;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
@ -31,17 +31,16 @@ class WarcRecorderTest {
Path fileNameWarc;
Path fileNameParquet;
WarcRecorder client;
OkHttpClient httpClient;
HttpClient httpClient;
@BeforeEach
public void setUp() throws Exception {
httpClient = new OkHttpClient.Builder()
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.build();
httpClient = HttpClient.newBuilder().build();
fileNameWarc = Files.createTempFile("test", ".warc");
fileNameParquet = Files.createTempFile("test", ".parquet");
client = new WarcRecorder(fileNameWarc);
client = new WarcRecorder(fileNameWarc, new Cookies());
}
@AfterEach
@ -52,10 +51,13 @@ class WarcRecorderTest {
@Test
void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient,
HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build()
);
Map<String, String> sampleData = new HashMap<>();
try (var warcReader = new WarcReader(fileNameWarc)) {
@ -76,7 +78,7 @@ class WarcRecorderTest {
@Test
public void flagAsSkipped() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
@ -100,7 +102,7 @@ class WarcRecorderTest {
@Test
public void flagAsSkippedNullBody() throws IOException, URISyntaxException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
@ -112,7 +114,7 @@ class WarcRecorderTest {
@Test
public void testSaveImport() throws URISyntaxException, IOException {
try (var recorder = new WarcRecorder(fileNameWarc)) {
try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
"text/html",
200,
@ -136,19 +138,23 @@ class WarcRecorderTest {
@Test
public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
.addHeader("User-agent", "test.marginalia.nu")
.addHeader("Accept-Encoding", "gzip")
.get().build());
client.close();
client.fetch(httpClient, HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build());
client.fetch(httpClient, HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/log/"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build());
client.fetch(httpClient, HttpRequest.newBuilder()
.uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
.header("User-agent", "test.marginalia.nu")
.header("Accept-Encoding", "gzip")
.GET().build());
CrawledDocumentParquetRecordFileWriter.convertWarc(
"www.marginalia.nu",

View File

@ -4,6 +4,7 @@ import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
@ -37,6 +38,12 @@ class HttpFetcherTest {
}
}
@Test
void testSitemapMarginalia() {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(1)).forEach(System.out::println);
}
@Test
void fetchText() throws Exception {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

View File

@ -3,11 +3,9 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.fetcher.*;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.model.EdgeDomain;
@ -17,7 +15,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.test.CommonTestData;
import okhttp3.Headers;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -27,6 +24,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpHeaders;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -122,7 +120,7 @@ public class CrawlerMockFetcherTest {
public void setAllowAllContentTypes(boolean allowAllContentTypes) {}
@Override
public List<String> getCookies() { return List.of();}
public Cookies getCookies() { return new Cookies();}
@Override
public void clearCookies() {}
@ -149,7 +147,7 @@ public class CrawlerMockFetcherTest {
return new HttpFetchResult.ResultOk(
url.asURI(),
200,
new Headers.Builder().build(),
HttpHeaders.of(Map.of(), (k,v)->true),
"127.0.0.1",
bodyBytes,
0,
@ -164,6 +162,11 @@ public class CrawlerMockFetcherTest {
return new HttpFetchResult.ResultNone();
}
@Override
public List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer) {
return List.of();
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
return new SimpleRobotRules();
@ -174,5 +177,9 @@ public class CrawlerMockFetcherTest {
return Mockito.mock(SitemapRetriever.class);
}
@Override
public void close() {
}
}
}

View File

@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -180,7 +181,7 @@ class CrawlerRetreiverTest {
new EdgeDomain("www.marginalia.nu"),
List.of(), 100);
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
new WarcRecorder(tempFileWarc2)
new WarcRecorder(tempFileWarc2, new Cookies())
);
// truncate the size of the file to simulate a crash
@ -458,7 +459,7 @@ class CrawlerRetreiverTest {
List.of(), 100);
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
new WarcRecorder(tempFileWarc3)
new WarcRecorder(tempFileWarc3, new Cookies())
);
// truncate the size of the file to simulate a crash
@ -509,7 +510,7 @@ class CrawlerRetreiverTest {
}
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
try (var recorder = new WarcRecorder(tempFileWarc2);
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
var db = new DomainStateDb(tempFileDb)
) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
@ -522,7 +523,7 @@ class CrawlerRetreiverTest {
@NotNull
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
try (var recorder = new WarcRecorder(tempFileWarc1);
try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
var db = new DomainStateDb(tempFileDb)
) {
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);

View File

@ -56,7 +56,6 @@ dependencies {
implementation libs.zstd
implementation libs.jwarc
implementation libs.crawlercommons
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.opencsv
implementation libs.fastutil

View File

@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
@ -120,7 +121,7 @@ public class IntegrationTest {
public void run() throws Exception {
/** CREATE WARC */
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));