Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
Merge pull request #150 from MarginaliaSearch/httpclient-in-crawler
Reduce the use of 3rd party code in the crawler
This commit is contained in: commit 2c67f50a43
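The diff below swaps the crawler's OkHttp usage for the JDK's built-in java.net.http.HttpClient. As a reference point, here is a minimal, self-contained sketch of the client/request/send pattern the changed files adopt; the URL, user-agent string, and timeout values are illustrative only, not taken from the repository.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;

class JdkHttpClientSketch {
    public static void main(String[] args) throws Exception {
        // Build one shared client; the JDK client pools connections internally.
        HttpClient client = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(Duration.ofSeconds(8))
                .build();

        // Per-request settings (method, headers, timeout) live on the request builder.
        HttpRequest request = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.example.com/"))  // illustrative URL
                .header("User-agent", "example-crawler")      // illustrative UA string
                .timeout(Duration.ofSeconds(10))
                .build();

        HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode());
    }
}
```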
@@ -106,11 +106,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return false;
var url = new EdgeUrl(warcResponse.target());
if (!Objects.equals(url.getDomain(), domain)) {
return false;
}
return true;
return Objects.equals(url.getDomain(), domain);
} catch (Exception e) {
logger.warn("Failed to process response", e);
}
@@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;

@@ -200,23 +201,23 @@ public class CrawlingThenConvertingIntegrationTest {
@Test
public void crawlRobotsTxt() throws Exception {
var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
List.of("https://search.marginalia.nu/search?q=hello+world")
var specs = new CrawlerMain.CrawlSpecRecord("marginalia-search.com", 5,
List.of("https://marginalia-search.com/search?q=hello+world")
);
CrawledDomain domain = crawl(specs);
assertFalse(domain.doc.isEmpty());
assertEquals("OK", domain.crawlerStatus);
assertEquals("search.marginalia.nu", domain.domain);
assertEquals("marginalia-search.com", domain.domain);
Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
assertTrue(allUrls.contains("https://marginalia-search.com/search"), "We expect a record for entities that are forbidden");
var output = process();
assertNotNull(output);
assertFalse(output.documents.isEmpty());
assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
assertEquals(new EdgeDomain("marginalia-search.com"), output.domain);
assertEquals(DomainIndexingState.ACTIVE, output.state);
for (var doc : output.documents) {

@@ -246,7 +247,7 @@ public class CrawlingThenConvertingIntegrationTest {
private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName);
try (var recorder = new WarcRecorder(fileName, new Cookies());
var db = new DomainStateDb(dbTempFile))
{
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
@@ -55,7 +55,6 @@ dependencies {
implementation libs.zstd
implementation libs.jwarc
implementation libs.crawlercommons
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.opencsv
implementation libs.fastutil
@@ -33,8 +33,6 @@ import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.util.SimpleBlockingThreadPool;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@@ -85,6 +83,7 @@ public class CrawlerMain extends ProcessMainClass {
@Inject
public CrawlerMain(UserAgent userAgent,
HttpFetcherImpl httpFetcher,
ProcessHeartbeatImpl heartbeat,
MessageQueueFactory messageQueueFactory, DomainProber domainProber,
FileStorageService fileStorageService,

@@ -98,6 +97,7 @@ public class CrawlerMain extends ProcessMainClass {
super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
this.userAgent = userAgent;
this.fetcher = httpFetcher;
this.heartbeat = heartbeat;
this.domainProber = domainProber;
this.fileStorageService = fileStorageService;

@@ -111,10 +111,6 @@ public class CrawlerMain extends ProcessMainClass {
Integer.getInteger("crawler.poolSize", 256),
1);
fetcher = new HttpFetcherImpl(userAgent,
new Dispatcher(),
new ConnectionPool(5, 10, TimeUnit.SECONDS)
);
// Wait for the blacklist to be loaded before starting the crawl
blacklist.waitUntilLoaded();

@@ -132,6 +128,10 @@ public class CrawlerMain extends ProcessMainClass {
System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
System.setProperty("sun.net.client.defaultReadTimeout", "30000");
// Set the maximum number of connections to keep alive in the connection pool
System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
System.setProperty("jdk.httpclient.connectionPoolSize", "256");
// We don't want to use too much memory caching sessions for https
System.setProperty("javax.net.ssl.sessionCacheSize", "2048");

@@ -364,9 +364,9 @@ public class CrawlerMain extends ProcessMainClass {
Files.deleteIfExists(tempFile);
}
try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
CrawlDataReference reference = getReference();
CrawlDataReference reference = getReference()
)
{
// Resume the crawl if it was aborted
@@ -1,6 +1,6 @@
package nu.marginalia.crawl.fetcher;
import okhttp3.Request;
import java.net.http.HttpRequest;
/** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
public record ContentTags(String etag, String lastMod) {

@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
}
/** Paints the tags onto the request builder. */
public void paint(Request.Builder getBuilder) {
public void paint(HttpRequest.Builder getBuilder) {
if (etag != null) {
getBuilder.addHeader("If-None-Match", etag);
getBuilder.header("If-None-Match", etag);
}
if (lastMod != null) {
getBuilder.addHeader("If-Modified-Since", lastMod);
getBuilder.header("If-Modified-Since", lastMod);
}
}
}
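For context, a hypothetical usage of the ContentTags record after this change: paint() now writes the conditional-request headers onto a java.net.http.HttpRequest.Builder instead of an okhttp3.Request.Builder. The ETag and Last-Modified values and the URL below are made up for illustration.

```java
import nu.marginalia.crawl.fetcher.ContentTags; // the record shown in the hunk above
import java.net.URI;
import java.net.http.HttpRequest;

class ContentTagsUsageSketch {
    static HttpRequest conditionalGet() {
        // Illustrative revalidation values only.
        var tags = new ContentTags("\"abc123\"", "Wed, 01 Jan 2025 00:00:00 GMT");
        var builder = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.example.com/page"));
        tags.paint(builder); // adds If-None-Match / If-Modified-Since only when the tag is non-null
        return builder.build();
    }
}
```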
@@ -1,33 +1,14 @@
package nu.marginalia.crawl.fetcher;
import okhttp3.Cookie;
import okhttp3.CookieJar;
import okhttp3.HttpUrl;
import java.util.Collections;
import java.io.IOException;
import java.net.CookieHandler;
import java.net.URI;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public class Cookies {
final ThreadLocal<ConcurrentHashMap<String, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public CookieJar getJar() {
return new CookieJar() {
@Override
public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
if (!cookies.isEmpty()) {
cookieJar.get().put(url.host(), cookies);
}
}
@Override
public List<Cookie> loadForRequest(HttpUrl url) {
return cookieJar.get().getOrDefault(url.host(), Collections.emptyList());
}
};
}
public class Cookies extends CookieHandler {
final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
public void clear() {
cookieJar.get().clear();

@@ -38,6 +19,16 @@ public class Cookies {
}
public List<String> getCookies() {
return cookieJar.get().values().stream().flatMap(List::stream).map(Cookie::toString).toList();
return cookieJar.get().values().stream().flatMap(List::stream).toList();
}
@Override
public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
return cookieJar.get();
}
@Override
public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
cookieJar.get().putAll(responseHeaders);
}
}
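The rewritten Cookies class plugs into the JDK client as a java.net.CookieHandler; the createClient() change further down the diff wires it in via cookieHandler(cookies). A standalone sketch of that contract, independent of the repository's class — the per-host storage and the literal "Set-Cookie"/"Cookie" key handling here are deliberately simplistic assumptions, and real code should match header names case-insensitively.

```java
import java.io.IOException;
import java.net.CookieHandler;
import java.net.URI;
import java.net.http.HttpClient;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Minimal CookieHandler sketch: remembers Set-Cookie values per host and
// replays them as a Cookie request header on later requests to that host.
class SimpleCookieStore extends CookieHandler {
    private final Map<String, List<String>> byHost = new ConcurrentHashMap<>();

    @Override
    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
        List<String> cookies = byHost.getOrDefault(uri.getHost(), List.of());
        // The returned map's entries are added to the outgoing request's headers.
        return cookies.isEmpty() ? Map.of() : Map.of("Cookie", cookies);
    }

    @Override
    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
        // Note: header-name casing can vary (e.g. "set-cookie" over HTTP/2).
        List<String> setCookies = responseHeaders.getOrDefault("Set-Cookie", List.of());
        if (!setCookies.isEmpty()) {
            byHost.put(uri.getHost(), setCookies);
        }
    }

    // The handler is attached once, when the shared client is built.
    static HttpClient newClientWithCookies() {
        return HttpClient.newBuilder()
                .cookieHandler(new SimpleCookieStore())
                .build();
    }
}
```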
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;

@@ -11,10 +12,10 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import java.util.List;
@ImplementedBy(HttpFetcherImpl.class)
public interface HttpFetcher {
public interface HttpFetcher extends AutoCloseable {
void setAllowAllContentTypes(boolean allowAllContentTypes);
List<String> getCookies();
Cookies getCookies();
void clearCookies();
DomainProbeResult probeDomain(EdgeUrl url);

@@ -27,7 +28,9 @@ public interface HttpFetcher {
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;
ProbeType probeType) throws Exception;
List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
@@ -1,35 +1,41 @@
package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.io.InterruptedIOException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URLDecoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpTimeoutException;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.zip.GZIPInputStream;
@Singleton
public class HttpFetcherImpl implements HttpFetcher {
private final Logger logger = LoggerFactory.getLogger(getClass());

@@ -40,39 +46,28 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
private final Duration requestTimeout = Duration.ofSeconds(10);
@Override
public void setAllowAllContentTypes(boolean allowAllContentTypes) {
contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
}
private final OkHttpClient client;
private final HttpClient client;
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder();
if (dispatcher != null) {
builder.dispatcher(dispatcher);
}
return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
.socketFactory(ftSocketFactory)
.hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
.addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
.connectionPool(pool)
.cookieJar(cookies.getJar())
.followRedirects(true)
.followSslRedirects(true)
.connectTimeout(8, TimeUnit.SECONDS)
.readTimeout(10, TimeUnit.SECONDS)
.writeTimeout(10, TimeUnit.SECONDS)
private HttpClient createClient() {
return HttpClient.newBuilder()
.sslContext(NoSecuritySSL.buildSslContext())
.cookieHandler(cookies)
.followRedirects(HttpClient.Redirect.NORMAL)
.connectTimeout(Duration.ofSeconds(8))
.executor(Executors.newCachedThreadPool())
.build();
}
@Override
public List<String> getCookies() {
return cookies.getCookies();
public Cookies getCookies() {
return cookies;
}
@Override

@@ -81,26 +76,24 @@ public class HttpFetcherImpl implements HttpFetcher {
}
@Inject
public HttpFetcherImpl(UserAgent userAgent,
Dispatcher dispatcher,
ConnectionPool connectionPool)
public HttpFetcherImpl(UserAgent userAgent)
{
this.client = createClient(dispatcher, connectionPool);
this.client = createClient();
this.userAgentString = userAgent.uaString();
this.userAgentIdentifier = userAgent.uaIdentifier();
}
public HttpFetcherImpl(String userAgent) {
this.client = createClient(null, new ConnectionPool());
this.client = createClient();
this.userAgentString = userAgent;
this.userAgentIdentifier = userAgent;
}
// Not necessary in prod, but useful in test
public void close() {
client.dispatcher().executorService().shutdown();
client.connectionPool().evictAll();
client.close();
}
/**
* Probe the domain to see if it is reachable, attempting to identify which schema to use,
* and if there are any redirects. This is done by one or more HEAD requests.

@@ -110,19 +103,26 @@ public class HttpFetcherImpl implements HttpFetcher {
*/
@Override
public DomainProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
HttpRequest head;
try {
head = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.build();
var call = client.newCall(head);
try (var rsp = call.execute()) {
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(requestUrl.domain, url.domain)) {
return new DomainProbeResult.Redirect(requestUrl.domain);
} catch (URISyntaxException e) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
}
return new DomainProbeResult.Ok(requestUrl);
try {
var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
EdgeUrl rspUri = new EdgeUrl(rsp.uri());
if (!Objects.equals(rspUri.domain, url.domain)) {
return new DomainProbeResult.Redirect(rspUri.domain);
}
return new DomainProbeResult.Ok(rspUri);
}
catch (Exception ex) {
return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
@@ -140,21 +140,25 @@ public class HttpFetcherImpl implements HttpFetcher {
WarcRecorder warcRecorder,
ContentTags tags) throws RateLimitException {
if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
var headBuilder = new Request.Builder().head()
.addHeader("User-agent", userAgentString)
.addHeader("Accept-Encoding", "gzip")
.url(url.toString());
var head = headBuilder.build();
var call = client.newCall(head);
try {
var headBuilder = HttpRequest.newBuilder()
.HEAD()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("Accept-Encoding", "gzip")
.timeout(requestTimeout)
;
try (var rsp = call.execute()) {
var contentTypeHeader = rsp.header("Content-type");
var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
var headers = rsp.headers();
var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.code());
warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
}
// Update the URL to the final URL of the HEAD request, otherwise we might end up doing

@@ -168,27 +172,27 @@ public class HttpFetcherImpl implements HttpFetcher {
// too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
// that it looks like the traffic makes sense, as opposed to looking like a broken bot.
var redirectUrl = new EdgeUrl(rsp.request().url().toString());
var redirectUrl = new EdgeUrl(rsp.uri());
EdgeUrl ret;
if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
else ret = url;
// Intercept rate limiting
if (rsp.code() == 429) {
throw new HttpFetcherImpl.RateLimitException(Objects.requireNonNullElse(rsp.header("Retry-After"), "1"));
if (rsp.statusCode() == 429) {
throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
}
return new ContentTypeProbeResult.Ok(ret);
}
catch (HttpTimeoutException ex) {
warcRecorder.flagAsTimeout(url);
return new ContentTypeProbeResult.Timeout(ex);
}
catch (RateLimitException ex) {
throw ex;
}
catch (InterruptedIOException ex) {
warcRecorder.flagAsTimeout(url);
return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) {
catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
warcRecorder.flagAsError(url, ex);

@@ -210,13 +214,15 @@ public class HttpFetcherImpl implements HttpFetcher {
ProbeType probeType)
throws Exception
{
var getBuilder = new Request.Builder().get();
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept-Language", "en,*;q=0.5")
.addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
.addHeader("User-agent", userAgentString);
var getBuilder = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("User-agent", userAgentString)
.header("Accept-Encoding", "gzip")
.header("Accept-Language", "en,*;q=0.5")
.header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
.timeout(requestTimeout)
;
contentTags.paint(getBuilder);
@@ -242,6 +248,122 @@ public class HttpFetcherImpl implements HttpFetcher {
return new SitemapRetriever();
}
@Override
public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
try {
List<EdgeUrl> ret = new ArrayList<>();
Set<String> seenUrls = new HashSet<>();
Set<String> seenSitemaps = new HashSet<>();
Deque<EdgeUrl> sitemapQueue = new LinkedList<>();
EdgeUrl rootSitemapUrl = new EdgeUrl(root);
sitemapQueue.add(rootSitemapUrl);
int fetchedSitemaps = 0;
while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
var head = sitemapQueue.removeFirst();
switch (fetchSitemap(head)) {
case SitemapResult.SitemapUrls(List<String> urls) -> {
for (var url : urls) {
if (seenUrls.add(url)) {
EdgeUrl.parse(url)
.filter(u -> u.domain.equals(rootSitemapUrl.domain))
.ifPresent(ret::add);
}
}
}
case SitemapResult.SitemapReferences(List<String> refs) -> {
for (var ref : refs) {
if (seenSitemaps.add(ref)) {
EdgeUrl.parse(ref)
.filter(url -> url.domain.equals(rootSitemapUrl.domain))
.ifPresent(sitemapQueue::addFirst);
}
}
}
case SitemapResult.SitemapError() -> {}
}
delayTimer.waitFetchDelay();
}
return ret;
}
catch (Exception ex) {
logger.error("Error while fetching sitemaps via " + root, ex);
return List.of();
}
}
private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
HttpRequest getRequest = HttpRequest.newBuilder()
.GET()
.uri(sitemapUrl.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.timeout(requestTimeout)
.build();
var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
if (response.statusCode() != 200) {
return new SitemapResult.SitemapError();
}
try (InputStream inputStream = response.body()) {
InputStream parserStream;
if (sitemapUrl.path.endsWith(".gz")) {
parserStream = new GZIPInputStream(inputStream);
}
else {
parserStream = inputStream;
}
Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
String rootTagName = parsedSitemap.child(0).tagName();
return switch (rootTagName.toLowerCase()) {
case "sitemapindex" -> {
List<String> references = new ArrayList<>();
for (var locTag : parsedSitemap.getElementsByTag("loc")) {
references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
}
yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
}
case "urlset" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("url > loc")) {
urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
case "rss", "atom" -> {
List<String> urls = new ArrayList<>();
for (var locTag : parsedSitemap.select("link, url")) {
urls.add(locTag.text().trim());
}
yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
}
default -> new SitemapResult.SitemapError();
};
}
}
private sealed interface SitemapResult {
record SitemapUrls(List<String> urls) implements SitemapResult {}
record SitemapReferences(List<String> sitemapRefs) implements SitemapResult {}
record SitemapError() implements SitemapResult {}
}
@Override
public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
var ret = fetchAndParseRobotsTxt(new EdgeUrl("https", domain, null, "/robots.txt", null), recorder);

@@ -257,14 +379,15 @@ public class HttpFetcherImpl implements HttpFetcher {
private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
try {
var getBuilder = new Request.Builder().get();
var getRequest = HttpRequest.newBuilder()
.GET()
.uri(url.asURI())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.header("User-agent", userAgentString)
.timeout(requestTimeout);
getBuilder.url(url.toString())
.addHeader("Accept-Encoding", "gzip")
.addHeader("Accept", "text/*, */*;q=0.9")
.addHeader("User-agent", userAgentString);
HttpFetchResult result = recorder.fetch(client, getBuilder.build());
HttpFetchResult result = recorder.fetch(client, getRequest.build());
return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
robotsParser.parseContent(url.toString(),
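The probe methods above now read results through the java.net.http response API: statusCode(), headers().firstValue(...) for individual headers, uri() for the redirect-resolved address, and HttpTimeoutException for timeouts. A compact, self-contained sketch of that pattern, assuming a pre-built shared HttpClient; the probe logic is deliberately simplified relative to HttpFetcherImpl.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.net.http.HttpTimeoutException;
import java.time.Duration;

class HeadProbeSketch {
    // Sends a HEAD request and reports status, content type, and the final URI after redirects.
    static void probe(HttpClient client, URI uri) throws Exception {
        HttpRequest head = HttpRequest.newBuilder()
                .method("HEAD", HttpRequest.BodyPublishers.noBody()) // JDK 18+ also offers .HEAD(), as the diff uses
                .uri(uri)
                .timeout(Duration.ofSeconds(10))
                .build();
        try {
            HttpResponse<Void> rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
            String contentType = rsp.headers().firstValue("Content-Type").orElse("unknown");
            // rsp.uri() is the URI of the final response, i.e. after any followed redirects.
            System.out.println(rsp.statusCode() + " " + contentType + " " + rsp.uri());
        } catch (HttpTimeoutException e) {
            System.out.println("timed out: " + uri);
        }
    }
}
```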
@@ -1,31 +0,0 @@
package nu.marginalia.crawl.fetcher.socket;
import okhttp3.Interceptor;
import okhttp3.Response;
import org.jetbrains.annotations.NotNull;
import java.io.IOException;
/** An interceptor that intercepts network requests and adds the remote IP address as
* a header in the response. This is used to pass the remote IP address to the Warc
* writer, as this information is not available in the response.
*/
public class IpInterceptingNetworkInterceptor implements Interceptor {
private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";
@NotNull
@Override
public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
String IP = chain.connection().socket().getInetAddress().getHostAddress();
return chain.proceed(chain.request())
.newBuilder()
.addHeader(pseudoHeaderName, IP)
.build();
}
public static String getIpFromResponse(Response response) {
return response.header(pseudoHeaderName);
}
}
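With the OkHttp network interceptor above deleted, the remote address is no longer smuggled through a pseudo-header; the WarcRecorder changes later in this diff resolve the response host directly via InetAddress. A sketch of that approach, noting that it re-resolves DNS rather than reporting the exact peer address of the socket that actually served the response.

```java
import java.net.InetAddress;
import java.net.URI;

class RemoteIpSketch {
    // Resolves an IP for the host of the final response URI (host value is whatever the response reports).
    static String remoteIp(URI responseUri) throws Exception {
        InetAddress address = InetAddress.getByName(responseUri.getHost());
        return address.getHostAddress();
    }
}
```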
@@ -27,7 +27,7 @@ public class NoSecuritySSL {
}
};
public static SSLSocketFactory buildSocketFactory() {
public static SSLContext buildSslContext() {
try {
// Install the all-trusting trust manager
final SSLContext sslContext = SSLContext.getInstance("TLS");

@@ -40,14 +40,11 @@ public class NoSecuritySSL {
clientSessionContext.setSessionCacheSize(2048);
// Create a ssl socket factory with our all-trusting manager
return sslContext.getSocketFactory();
return sslContext;
}
catch (Exception e) {
throw new RuntimeException(e);
}
}
public static HostnameVerifier buildHostnameVerifyer() {
return (hn, session) -> true;
}
}
@@ -1,14 +1,14 @@
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Headers;
import okhttp3.Response;
import org.apache.commons.io.input.BOMInputStream;
import org.netpreserve.jwarc.WarcTruncationReason;
import java.io.*;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.Map;
import java.util.zip.GZIPInputStream;
/** Input buffer for temporary storage of a HTTP response

@@ -17,8 +17,9 @@ import java.util.zip.GZIPInputStream;
* */
public abstract class WarcInputBuffer implements AutoCloseable {
protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
protected Headers headers;
WarcInputBuffer(Headers headers) {
protected HttpHeaders headers;
WarcInputBuffer(HttpHeaders headers) {
this.headers = headers;
}

@@ -30,7 +31,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
public final WarcTruncationReason truncationReason() { return truncationReason; }
public final Headers headers() { return headers; }
public final HttpHeaders headers() { return headers; }
/** Create a buffer for a response.
* If the response is small and not compressed, it will be stored in memory.

@@ -38,26 +39,27 @@ public abstract class WarcInputBuffer implements AutoCloseable {
* and suppressed from the headers.
* If an error occurs, a buffer will be created with no content and an error status.
*/
static WarcInputBuffer forResponse(Response rsp) {
static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
if (rsp == null)
return new ErrorBuffer();
try {
String contentLengthHeader = Objects.requireNonNullElse(rsp.header("Content-Length"), "-1");
int contentLength = Integer.parseInt(contentLengthHeader);
String contentEncoding = rsp.header("Content-Encoding");
var headers = rsp.headers();
try (var is = rsp.body()) {
int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
// If the content is small and not compressed, we can just read it into memory
return new MemoryBuffer(rsp, contentLength);
return new MemoryBuffer(headers, is, contentLength);
}
else {
// Otherwise, we unpack it into a file and read it from there
return new FileBuffer(rsp);
return new FileBuffer(headers, is);
}
}
catch (Exception ex) {
return new ErrorBuffer(rsp);
return new ErrorBuffer();
}
}
@@ -99,12 +101,8 @@ public abstract class WarcInputBuffer implements AutoCloseable {
/** Pseudo-buffer for when we have an error */
class ErrorBuffer extends WarcInputBuffer {
public ErrorBuffer() {
super(Headers.of());
truncationReason = WarcTruncationReason.UNSPECIFIED;
}
super(HttpHeaders.of(Map.of(), (k,v)->false));
public ErrorBuffer(Response rsp) {
super(rsp.headers());
truncationReason = WarcTruncationReason.UNSPECIFIED;
}

@@ -125,12 +123,12 @@ class ErrorBuffer extends WarcInputBuffer {
/** Buffer for when we have the response in memory */
class MemoryBuffer extends WarcInputBuffer {
byte[] data;
public MemoryBuffer(Response response, int size) {
super(response.headers());
public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
super(headers);
var outputStream = new ByteArrayOutputStream(size);
copy(response.body().byteStream(), outputStream);
copy(responseStream, outputStream);
data = outputStream.toByteArray();
}

@@ -154,19 +152,15 @@ class MemoryBuffer extends WarcInputBuffer {
class FileBuffer extends WarcInputBuffer {
private final Path tempFile;
public FileBuffer(Response response) throws IOException {
super(suppressContentEncoding(response.headers()));
public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
super(suppressContentEncoding(headers));
this.tempFile = Files.createTempFile("rsp", ".html");
if (response.body() == null) {
truncationReason = WarcTruncationReason.DISCONNECT;
return;
}
if ("gzip".equals(response.header("Content-Encoding"))) {
if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
try (var out = Files.newOutputStream(tempFile)) {
copy(new GZIPInputStream(response.body().byteStream()), out);
copy(new GZIPInputStream(responseStream), out);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;

@@ -174,7 +168,7 @@ class FileBuffer extends WarcInputBuffer {
}
else {
try (var out = Files.newOutputStream(tempFile)) {
copy(response.body().byteStream(), out);
copy(responseStream, out);
}
catch (Exception ex) {
truncationReason = WarcTruncationReason.UNSPECIFIED;

@@ -182,22 +176,13 @@ class FileBuffer extends WarcInputBuffer {
}
}
private static Headers suppressContentEncoding(Headers headers) {
var builder = new Headers.Builder();
headers.toMultimap().forEach((k, values) -> {
private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
return HttpHeaders.of(headers.map(), (k, v) -> {
if ("Content-Encoding".equalsIgnoreCase(k)) {
return;
}
if ("Transfer-Encoding".equalsIgnoreCase(k)) {
return;
}
for (var value : values) {
builder.add(k, value);
return false;
}
return !"Transfer-Encoding".equalsIgnoreCase(k);
});
return builder.build();
}
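suppressContentEncoding() above relies on HttpHeaders.of(map, filter), which copies a header map through a BiPredicate and keeps only the entries the predicate accepts. A small self-contained example of the same filtering idea; the header names and values here are illustrative.

```java
import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;

class HeaderFilterSketch {
    // Returns a copy of the headers with encoding-related entries removed.
    static HttpHeaders withoutEncodingHeaders(HttpHeaders headers) {
        return HttpHeaders.of(headers.map(), (name, value) ->
                !"Content-Encoding".equalsIgnoreCase(name)
             && !"Transfer-Encoding".equalsIgnoreCase(name));
    }

    public static void main(String[] args) {
        var original = HttpHeaders.of(Map.of(
                "Content-Type", List.of("text/html"),
                "Content-Encoding", List.of("gzip")), (k, v) -> true);
        // Only Content-Type survives the filter.
        System.out.println(withoutEncodingHeaders(original).map());
    }
}
```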
@@ -1,11 +1,12 @@
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Protocol;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.stream.Collectors;

@@ -75,13 +76,13 @@ public class WarcProtocolReconstructor {
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
static String getResponseHeader(Response response, long size) {
String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
static String getResponseHeader(HttpResponse<?> response, long size) {
String version = response.version() == HttpClient.Version.HTTP_1_1 ? "1.1" : "2.0";
String statusCode = String.valueOf(response.code());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
String statusCode = String.valueOf(response.statusCode());
String statusMessage = STATUS_CODE_MAP.getOrDefault(response.statusCode(), "Unknown");
String headerString = getHeadersAsString(response, size);
String headerString = getHeadersAsString(response.headers(), size);
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}

@@ -148,10 +149,10 @@ public class WarcProtocolReconstructor {
return joiner.toString();
}
static private String getHeadersAsString(Response response, long responseSize) {
static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
StringJoiner joiner = new StringJoiner("\r\n");
response.headers().toMultimap().forEach((k, values) -> {
headers.map().forEach((k, values) -> {
String headerCapitalized = capitalizeHeader(k);
// Omit pseudoheaders injected by the crawler itself

@@ -179,8 +180,8 @@ public class WarcProtocolReconstructor {
return joiner.toString();
}
// okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake...
// okhttp gave us flattened headers, so we need to reconstruct Camel-Kebab-Case style
// for the WARC parser's sake... (do we still need this, mr chesterton?)
static private String capitalizeHeader(String k) {
return Arrays.stream(StringUtils.split(k, '-'))
.map(StringUtils::capitalize)
@@ -1,13 +1,11 @@
package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;

@@ -18,6 +16,7 @@ import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;

@@ -28,7 +27,7 @@ import java.util.*;
/** Based on JWarc's fetch method, APL 2.0 license
* <p></p>
* This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
* This class wraps HttpClient and records the HTTP request and response in a WARC file,
* as best is possible given not all the data is available at the same time and needs to
* be reconstructed.
*/

@@ -48,20 +47,22 @@ public class WarcRecorder implements AutoCloseable {
// Affix a version string in case we need to change the format in the future
// in some way
private final String warcRecorderVersion = "1.0";
// We need to know if the site uses cookies so this can be reported among the search results
// -- flip this to true if we see any cookies. This information will also be painted on any
// revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
private final Cookies cookies;
/**
* Create a new WarcRecorder that will write to the given file
*
* @param warcFile The file to write to
*/
public WarcRecorder(Path warcFile) throws IOException {
public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = fetcher.getCookies();
}
public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
this.warcFile = warcFile;
this.writer = new WarcWriter(warcFile);
this.cookies = cookies;
}
/**

@@ -71,38 +72,41 @@ public class WarcRecorder implements AutoCloseable {
public WarcRecorder() throws IOException {
this.warcFile = Files.createTempFile("warc", ".warc.gz");
this.writer = new WarcWriter(this.warcFile);
this.cookies = new Cookies();
temporaryFile = true;
}
public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
IOException,
URISyntaxException,
InterruptedException
public HttpFetchResult fetch(HttpClient client,
java.net.http.HttpRequest request)
throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
{
URI requestUri = request.url().uri();
URI requestUri = request.uri();
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
String ip;
Instant date = Instant.now();
var call = client.newCall(request);
var response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
cookieInformation.update(client, request.url());
Map<String, List<String>> extraHeaders = new HashMap<>();
try (var response = call.execute();
WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
// Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
extraHeaders.putAll(request.headers().map());
try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
{
if (cookies.hasCookies()) {
extraHeaders.put("X-Has-Cookies", List.of("1"));
}
byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
InputStream inputStream = inputBuffer.read();
ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
responseDataBuffer.put(responseHeaders);
responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
@@ -125,17 +129,15 @@ public class WarcRecorder implements AutoCloseable {
// It looks like this might be the same as requestUri, but it's not;
// it's the URI after resolving redirects.
final URI responseUri = response.request().url().uri();
final URI responseUri = response.uri();
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
.blockDigest(responseDigestBuilder.build())
.date(date)
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(responseBuilder);
if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
responseBuilder.ipAddress(inetAddress);
responseBuilder.payloadDigest(payloadDigestBuilder.build());
responseBuilder.truncated(inputBuffer.truncationReason());

@@ -152,8 +154,8 @@ public class WarcRecorder implements AutoCloseable {
byte[] httpRequestString = WarcProtocolReconstructor
.getHttpRequestString(
response.request().method(),
response.request().headers().toMultimap(),
request.headers().toMultimap(),
response.request().headers().map(),
extraHeaders,
requestUri)
.getBytes();

@@ -171,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
&& inputBuffer.size() < 2048
&& !request.url().encodedPath().endsWith("robots.txt")) // don't bail on robots.txt
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
{
// Fast detection and mitigation of crawler traps that respond with slow
// small responses, with a high branching factor

@@ -189,9 +191,9 @@ public class WarcRecorder implements AutoCloseable {
}
return new HttpFetchResult.ResultOk(responseUri,
response.code(),
response.statusCode(),
inputBuffer.headers(),
ip,
inetAddress.getHostAddress(),
responseDataBuffer.data,
dataStart,
responseDataBuffer.length() - dataStart);

@@ -267,7 +269,9 @@ public class WarcRecorder implements AutoCloseable {
.date(Instant.now())
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
cookieInformation.paint(builder);
if (cookies.hasCookies()) {
builder.addHeader("X-Has-Cookies", "1");
}
var reference = builder.build();
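WarcRecorder.fetch() now obtains the body as a stream via BodyHandlers.ofInputStream() and consumes it incrementally while building the WARC record. A reduced sketch of that consumption pattern, with no hashing or WARC writing, and an illustrative URL.

```java
import java.io.InputStream;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class StreamingFetchSketch {
    // Streams a response body instead of buffering it all in memory.
    static long contentLength(HttpClient client) throws Exception {
        HttpRequest request = HttpRequest.newBuilder()
                .GET()
                .uri(URI.create("https://www.example.com/"))
                .build();
        HttpResponse<InputStream> rsp = client.send(request, HttpResponse.BodyHandlers.ofInputStream());
        long total = 0;
        try (InputStream is = rsp.body()) {            // always close the stream to release the connection
            byte[] buf = new byte[8192];
            for (int n; (n = is.read(buf)) != -1; ) {
                total += n;                            // a real recorder would hash and persist these bytes
            }
        }
        return total;
    }
}
```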
@@ -12,7 +12,6 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
import nu.marginalia.ip_blocklist.UrlBlocklist;
import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;

@@ -53,7 +52,6 @@ public class CrawlerRetreiver implements AutoCloseable {
private final WarcRecorder warcRecorder;
private final CrawlerRevisitor crawlerRevisitor;
private final SitemapFetcher sitemapFetcher;
int errorCount = 0;
public CrawlerRetreiver(HttpFetcher fetcher,

@@ -71,7 +69,6 @@ public class CrawlerRetreiver implements AutoCloseable {
crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls(), specs.crawlDepth());
crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());
// We must always crawl the index page first, this is assumed when fingerprinting the server
var fst = crawlFrontier.peek();

@@ -145,9 +142,11 @@ public class CrawlerRetreiver implements AutoCloseable {
// Add external links to the crawl frontier
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
// Add links from the sitemap to the crawl frontier
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
// Fetch sitemaps
for (var sitemap : robotsRules.getSitemaps()) {
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
}
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()

@@ -271,10 +270,7 @@ public class CrawlerRetreiver implements AutoCloseable {
}
// Download the sitemap if available
if (feedLink.isPresent()) {
sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
timer.waitFetchDelay(0);
}
feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
// Grab the favicon if it exists
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
@@ -1,72 +0,0 @@
package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
public class SitemapFetcher {
private final DomainCrawlFrontier crawlFrontier;
private final SitemapRetriever sitemapRetriever;
private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);
public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
this.crawlFrontier = crawlFrontier;
this.sitemapRetriever = sitemapRetriever;
}
public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
List<String> urls = robotsRules.getSitemaps();
if (urls.isEmpty()) {
urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
}
downloadSitemaps(urls);
}
public void downloadSitemaps(List<String> urls) {
Set<String> checkedSitemaps = new HashSet<>();
for (var rawUrl : urls) {
Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
if (parsedUrl.isEmpty()) {
continue;
}
EdgeUrl url = parsedUrl.get();
// Let's not download sitemaps from other domains for now
if (!crawlFrontier.isSameDomain(url)) {
continue;
}
if (checkedSitemaps.contains(url.path))
continue;
var sitemap = sitemapRetriever.fetchSitemap(url);
if (sitemap.isEmpty()) {
continue;
}
// ensure we don't try to download this sitemap again
// (don't move this up, as we may want to check the same
// path with different protocols until we find one that works)
checkedSitemaps.add(url.path);
crawlFrontier.addAllToQueue(sitemap);
}
logger.debug("Queue is now {}", crawlFrontier.queueSize());
}
}
@@ -36,7 +36,6 @@ dependencies {
implementation libs.gson
implementation libs.commons.io
implementation libs.commons.lang3
implementation libs.okhttp3
implementation libs.jsoup
implementation libs.snakeyaml
implementation libs.zstd
@@ -1,17 +1,17 @@
package nu.marginalia.model.body;
import nu.marginalia.contenttype.ContentType;
import okhttp3.Headers;
import org.jetbrains.annotations.Nullable;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.netpreserve.jwarc.MessageHeaders;
import org.netpreserve.jwarc.WarcResponse;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.InetAddress;
import java.net.URI;
import java.net.http.HttpHeaders;
import java.util.Optional;
/* FIXME: This interface has a very unfortunate name that is not very descriptive.

@@ -56,42 +56,26 @@ public sealed interface HttpFetchResult {
*/
record ResultOk(URI uri,
int statusCode,
Headers headers,
HttpHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength
) implements HttpFetchResult {
public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
}
public boolean isOk() {
return statusCode >= 200 && statusCode < 300;
}
public ResultOk(URI uri,
int statusCode,
MessageHeaders headers,
String ipAddress,
byte[] bytesRaw,
int bytesStart,
int bytesLength) {
this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
}
private static Headers convertHeaders(MessageHeaders headers) {
var ret = new Headers.Builder();
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
ret.add(header.getKey(), value);
}
}
return ret.build();
}
public InputStream getInputStream() {
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
}
public Optional<Document> parseDocument() throws IOException {
public Optional<Document> parseDocument() {
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
if (contentType.is("text/html")) {
return Optional.of(Jsoup.parse(body));

@@ -102,8 +86,9 @@ public sealed interface HttpFetchResult {
});
}
@Nullable
public String header(String name) {
return headers.get(name);
return headers.firstValue(name).orElse(null);
}
}
@@ -165,27 +165,28 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
contentType = "";
}
String headersStr = null;
StringJoiner headersStrBuilder = new StringJoiner("\n");
for (var header : headers) {
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
for (var header : headers.map().entrySet()) {
for (var value : header.getValue()) {
headersStrBuilder.add(header.getKey() + ": " + value);
}
headersStr = headersStrBuilder.toString();
}
String headersStr = headersStrBuilder.toString();
write(new CrawledDocumentParquetRecord(
domain,
response.target(),
fetchOk.ipAddress(),
WarcXCookieInformationHeader.hasCookies(response),
headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
fetchOk.statusCode(),
response.date(),
contentType,
bodyBytes,
headersStr,
headers.get("ETag"),
headers.get("Last-Modified"))
);
headers.firstValue("ETag").orElse(null),
headers.firstValue("Last-Modified").orElse(null)
));
}
@@ -1,35 +0,0 @@
package org.netpreserve.jwarc;
import okhttp3.HttpUrl;
import okhttp3.OkHttpClient;
/** Encapsulates out-of-band information about whether a website uses cookies,
* using a non-standard WARC header "X-Has-Cookies".
*/
public class WarcXCookieInformationHeader {
private boolean hasCookies = false;
private static final String headerName = "X-Has-Cookies";
public void update(OkHttpClient client, HttpUrl url) {
if (!hasCookies) {
hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
}
}
public boolean hasCookies() {
return hasCookies;
}
public void paint(WarcResponse.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public void paint(WarcXResponseReference.Builder builder) {
builder.addHeader(headerName, hasCookies ? "1" : "0");
}
public static boolean hasCookies(WarcRecord record) {
return record.headers().contains(headerName, "1");
}
}
@ -1,11 +1,9 @@
package nu.marginalia.crawl.retreival;

import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -15,6 +13,8 @@ import org.netpreserve.jwarc.WarcResponse;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
@ -27,11 +27,10 @@ import static org.junit.jupiter.api.Assertions.fail;
class CrawlerWarcResynchronizerTest {
    Path fileName;
    Path outputFile;
    OkHttpClient httpClient;
    HttpClient httpClient;
    @BeforeEach
    public void setUp() throws Exception {
        httpClient = new OkHttpClient.Builder()
                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
        httpClient = HttpClient.newBuilder()
                .build();

        fileName = Files.createTempFile("test", ".warc.gz");
@ -46,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

    @Test
    void run() throws IOException, URISyntaxException {
        try (var oldRecorder = new WarcRecorder(fileName)) {
        try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
            fetchUrl(oldRecorder, "https://www.marginalia.nu/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
            fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@ -56,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

        var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

        try (var newRecorder = new WarcRecorder(outputFile)) {
        try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
            new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
        }

@ -79,10 +78,11 @@ class CrawlerWarcResynchronizerTest {
    }

    void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
        var req = new Request.Builder().url(url)
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build();
        var req = HttpRequest.newBuilder()
                .uri(new java.net.URI(url))
                .header("User-agent", "test.marginalia.nu")
                .header("Accept-Encoding", "gzip")
                .GET().build();
        recorder.fetch(httpClient, req);
    }
}
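To make the migration pattern above concrete, a standalone sketch of building and sending the equivalent GET with the JDK client; the URL and header values simply echo the test fixtures.

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    class FetchSketch {
        public static void main(String[] args) throws Exception {
            HttpClient client = HttpClient.newBuilder().build();

            // Same shape as the old OkHttp Request.Builder chain: headers plus an explicit GET.
            HttpRequest request = HttpRequest.newBuilder()
                    .uri(URI.create("https://www.marginalia.nu/"))
                    .header("User-agent", "test.marginalia.nu")
                    .header("Accept-Encoding", "gzip")
                    .GET()
                    .build();

            // send() is synchronous; sendAsync() is the non-blocking counterpart.
            HttpResponse<byte[]> response =
                    client.send(request, HttpResponse.BodyHandlers.ofByteArray());
            System.out.println(response.statusCode() + " " + response.body().length + " bytes");
        }
    }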
@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;

import com.sun.net.httpserver.HttpServer;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -79,7 +80,7 @@ class ContentTypeProberTest {
        htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();

        fetcher = new HttpFetcherImpl("test");
        recorder = new WarcRecorder(warcFile);
        recorder = new WarcRecorder(warcFile, new Cookies());
    }

    @AfterEach

@ -2,13 +2,11 @@ package nu.marginalia.crawl.retreival.fetcher;

import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -19,6 +17,8 @@ import org.netpreserve.jwarc.WarcXResponseReference;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.NoSuchAlgorithmException;
@ -31,17 +31,16 @@ class WarcRecorderTest {
    Path fileNameWarc;
    Path fileNameParquet;
    WarcRecorder client;
    OkHttpClient httpClient;

    HttpClient httpClient;
    @BeforeEach
    public void setUp() throws Exception {
        httpClient = new OkHttpClient.Builder()
                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
                .build();
        httpClient = HttpClient.newBuilder().build();

        fileNameWarc = Files.createTempFile("test", ".warc");
        fileNameParquet = Files.createTempFile("test", ".parquet");

        client = new WarcRecorder(fileNameWarc);
        client = new WarcRecorder(fileNameWarc, new Cookies());
    }

    @AfterEach
@ -52,10 +51,13 @@ class WarcRecorderTest {

    @Test
    void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.fetch(httpClient,
                HttpRequest.newBuilder()
                        .uri(new java.net.URI("https://www.marginalia.nu/"))
                        .header("User-agent", "test.marginalia.nu")
                        .header("Accept-Encoding", "gzip")
                        .GET().build()
        );

        Map<String, String> sampleData = new HashMap<>();
        try (var warcReader = new WarcReader(fileNameWarc)) {
@ -76,7 +78,7 @@ class WarcRecorderTest {
    @Test
    public void flagAsSkipped() throws IOException, URISyntaxException {

        try (var recorder = new WarcRecorder(fileNameWarc)) {
        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                    "text/html",
                    200,
@ -100,7 +102,7 @@ class WarcRecorderTest {
    @Test
    public void flagAsSkippedNullBody() throws IOException, URISyntaxException {

        try (var recorder = new WarcRecorder(fileNameWarc)) {
        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                    "text/html",
                    200,
@ -112,7 +114,7 @@ class WarcRecorderTest {

    @Test
    public void testSaveImport() throws URISyntaxException, IOException {
        try (var recorder = new WarcRecorder(fileNameWarc)) {
        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
            recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                    "text/html",
                    200,
@ -136,19 +138,23 @@ class WarcRecorderTest {

    @Test
    public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
                .addHeader("User-agent", "test.marginalia.nu")
                .addHeader("Accept-Encoding", "gzip")
                .get().build());
        client.close();
        client.fetch(httpClient, HttpRequest.newBuilder()
                .uri(new java.net.URI("https://www.marginalia.nu/"))
                .header("User-agent", "test.marginalia.nu")
                .header("Accept-Encoding", "gzip")
                .GET().build());

        client.fetch(httpClient, HttpRequest.newBuilder()
                .uri(new java.net.URI("https://www.marginalia.nu/log/"))
                .header("User-agent", "test.marginalia.nu")
                .header("Accept-Encoding", "gzip")
                .GET().build());

        client.fetch(httpClient, HttpRequest.newBuilder()
                .uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
                .header("User-agent", "test.marginalia.nu")
                .header("Accept-Encoding", "gzip")
                .GET().build());

        CrawledDocumentParquetRecordFileWriter.convertWarc(
                "www.marginalia.nu",

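For orientation, a rough sketch of reading back the records a WarcRecorder produces, using jwarc's WarcReader as in the test above. Treat the iteration style as an assumption about the jwarc API rather than code taken from this changeset.

    import org.netpreserve.jwarc.WarcReader;
    import org.netpreserve.jwarc.WarcRecord;
    import org.netpreserve.jwarc.WarcResponse;

    import java.io.IOException;
    import java.nio.file.Path;

    class WarcInspectSketch {
        // Prints the target URL of every response record in a WARC file.
        static void listResponses(Path warcFile) throws IOException {
            try (WarcReader reader = new WarcReader(warcFile)) {
                for (WarcRecord record : reader) {
                    if (record instanceof WarcResponse response) {
                        System.out.println(response.target());
                    }
                }
            }
        }

        public static void main(String[] args) throws IOException {
            listResponses(Path.of(args[0]));    // pass a .warc or .warc.gz file
        }
    }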
@ -4,6 +4,7 @@ import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
@ -37,6 +38,12 @@ class HttpFetcherTest {
        }
    }

    @Test
    void testSitemapMarginalia() {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(1)).forEach(System.out::println);
    }

    @Test
    void fetchText() throws Exception {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

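As a rough illustration of what the new sitemap test exercises, a jsoup-based sketch that pulls the <loc> entries out of a sitemap document. This is not the project's SitemapRetriever or fetchSitemapUrls implementation, and the XML payload is invented.

    import org.jsoup.Jsoup;
    import org.jsoup.parser.Parser;

    import java.util.List;

    class SitemapSketch {
        // Parses a sitemap.xml payload and returns the URLs it lists.
        static List<String> extractUrls(String sitemapXml) {
            var doc = Jsoup.parse(sitemapXml, "", Parser.xmlParser());
            return doc.select("loc").eachText();
        }

        public static void main(String[] args) {
            String xml = """
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                      <url><loc>https://www.marginalia.nu/</loc></url>
                      <url><loc>https://www.marginalia.nu/log/</loc></url>
                    </urlset>
                    """;
            extractUrls(xml).forEach(System.out::println);
        }
    }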
@ -3,11 +3,9 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.fetcher.*;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.model.EdgeDomain;
@ -17,7 +15,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.test.CommonTestData;
import okhttp3.Headers;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -27,6 +24,7 @@ import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.URISyntaxException;
import java.net.http.HttpHeaders;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
@ -122,7 +120,7 @@ public class CrawlerMockFetcherTest {
        public void setAllowAllContentTypes(boolean allowAllContentTypes) {}

        @Override
        public List<String> getCookies() { return List.of();}
        public Cookies getCookies() { return new Cookies();}

        @Override
        public void clearCookies() {}
@ -149,7 +147,7 @@ public class CrawlerMockFetcherTest {
            return new HttpFetchResult.ResultOk(
                    url.asURI(),
                    200,
                    new Headers.Builder().build(),
                    HttpHeaders.of(Map.of(), (k,v)->true),
                    "127.0.0.1",
                    bodyBytes,
                    0,
@ -164,6 +162,11 @@ public class CrawlerMockFetcherTest {
            return new HttpFetchResult.ResultNone();
        }

        @Override
        public List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer) {
            return List.of();
        }

        @Override
        public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
            return new SimpleRobotRules();
@ -174,5 +177,9 @@ public class CrawlerMockFetcherTest {
            return Mockito.mock(SitemapRetriever.class);
        }

        @Override
        public void close() {

        }
    }
}

@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.CrawlerMain;
import nu.marginalia.crawl.DomainStateDb;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@ -180,7 +181,7 @@ class CrawlerRetreiverTest {
                new EdgeDomain("www.marginalia.nu"),
                List.of(), 100);
        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
                new WarcRecorder(tempFileWarc2)
                new WarcRecorder(tempFileWarc2, new Cookies())
        );

        // truncate the size of the file to simulate a crash
@ -458,7 +459,7 @@ class CrawlerRetreiverTest {
                List.of(), 100);

        var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
                new WarcRecorder(tempFileWarc3)
                new WarcRecorder(tempFileWarc3, new Cookies())
        );

        // truncate the size of the file to simulate a crash
@ -509,7 +510,7 @@ class CrawlerRetreiverTest {
    }

    private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
        try (var recorder = new WarcRecorder(tempFileWarc2);
        try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
             var db = new DomainStateDb(tempFileDb)
        ) {
            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
@ -522,7 +523,7 @@ class CrawlerRetreiverTest {

    @NotNull
    private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
        try (var recorder = new WarcRecorder(tempFileWarc1);
        try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
             var db = new DomainStateDb(tempFileDb)
        ) {
            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);

@ -56,7 +56,6 @@ dependencies {
    implementation libs.zstd
    implementation libs.jwarc
    implementation libs.crawlercommons
    implementation libs.okhttp3
    implementation libs.jsoup
    implementation libs.opencsv
    implementation libs.fastutil

@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
@ -120,7 +121,7 @@ public class IntegrationTest {
    public void run() throws Exception {

        /** CREATE WARC */
        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
            warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                    new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
