Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
Merge pull request #150 from MarginaliaSearch/httpclient-in-crawler
Reduce the use of 3rd party code in the crawler
Commit 2c67f50a43
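
The patch below swaps OkHttp out for the JDK's built-in java.net.http.HttpClient across the crawler. As an orientation sketch only (not taken from the patch; the class name and URL are made up), this is the general shape of that migration:

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;
    import java.time.Duration;

    class MigrationSketch {
        // Before (OkHttp, roughly):
        //   var rsp = okHttpClient.newCall(new Request.Builder().url(url).build()).execute();
        // After (JDK HttpClient), as the diff below does throughout:
        static String fetch(String url) throws Exception {
            HttpClient client = HttpClient.newBuilder()
                    .connectTimeout(Duration.ofSeconds(8))
                    .followRedirects(HttpClient.Redirect.NORMAL)
                    .build();
            HttpRequest request = HttpRequest.newBuilder()
                    .GET()
                    .uri(URI.create(url))
                    .timeout(Duration.ofSeconds(10))
                    .build();
            HttpResponse<String> rsp = client.send(request, HttpResponse.BodyHandlers.ofString());
            return rsp.body();
        }
    }
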
@@ -106,11 +106,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
                 return false;
 
             var url = new EdgeUrl(warcResponse.target());
-            if (!Objects.equals(url.getDomain(), domain)) {
-                return false;
-            }
-
-            return true;
+            return Objects.equals(url.getDomain(), domain);
         } catch (Exception e) {
             logger.warn("Failed to process response", e);
         }
@@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -200,23 +201,23 @@ public class CrawlingThenConvertingIntegrationTest {
 
     @Test
     public void crawlRobotsTxt() throws Exception {
-        var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
-                List.of("https://search.marginalia.nu/search?q=hello+world")
+        var specs = new CrawlerMain.CrawlSpecRecord("marginalia-search.com", 5,
+                List.of("https://marginalia-search.com/search?q=hello+world")
         );
 
         CrawledDomain domain = crawl(specs);
         assertFalse(domain.doc.isEmpty());
         assertEquals("OK", domain.crawlerStatus);
-        assertEquals("search.marginalia.nu", domain.domain);
+        assertEquals("marginalia-search.com", domain.domain);
 
         Set<String> allUrls = domain.doc.stream().map(doc -> doc.url).collect(Collectors.toSet());
-        assertTrue(allUrls.contains("https://search.marginalia.nu/search"), "We expect a record for entities that are forbidden");
+        assertTrue(allUrls.contains("https://marginalia-search.com/search"), "We expect a record for entities that are forbidden");
 
         var output = process();
 
         assertNotNull(output);
         assertFalse(output.documents.isEmpty());
-        assertEquals(new EdgeDomain("search.marginalia.nu"), output.domain);
+        assertEquals(new EdgeDomain("marginalia-search.com"), output.domain);
         assertEquals(DomainIndexingState.ACTIVE, output.state);
 
         for (var doc : output.documents) {
@@ -246,7 +247,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();
 
-        try (var recorder = new WarcRecorder(fileName);
+        try (var recorder = new WarcRecorder(fileName, new Cookies());
             var db = new DomainStateDb(dbTempFile))
         {
             new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
@@ -55,7 +55,6 @@ dependencies {
     implementation libs.zstd
     implementation libs.jwarc
     implementation libs.crawlercommons
-    implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.opencsv
     implementation libs.fastutil
@@ -33,8 +33,6 @@ import nu.marginalia.service.module.DatabaseModule;
 import nu.marginalia.storage.FileStorageService;
 import nu.marginalia.storage.model.FileStorageId;
 import nu.marginalia.util.SimpleBlockingThreadPool;
-import okhttp3.ConnectionPool;
-import okhttp3.Dispatcher;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -85,6 +83,7 @@ public class CrawlerMain extends ProcessMainClass {
 
     @Inject
     public CrawlerMain(UserAgent userAgent,
+                       HttpFetcherImpl httpFetcher,
                        ProcessHeartbeatImpl heartbeat,
                        MessageQueueFactory messageQueueFactory, DomainProber domainProber,
                        FileStorageService fileStorageService,
@@ -98,6 +97,7 @@ public class CrawlerMain extends ProcessMainClass {
         super(messageQueueFactory, processConfiguration, gson, CRAWLER_INBOX);
 
         this.userAgent = userAgent;
+        this.fetcher = httpFetcher;
         this.heartbeat = heartbeat;
         this.domainProber = domainProber;
         this.fileStorageService = fileStorageService;
@@ -111,10 +111,6 @@ public class CrawlerMain extends ProcessMainClass {
                 Integer.getInteger("crawler.poolSize", 256),
                 1);
 
-        fetcher = new HttpFetcherImpl(userAgent,
-                new Dispatcher(),
-                new ConnectionPool(5, 10, TimeUnit.SECONDS)
-        );
-
         // Wait for the blacklist to be loaded before starting the crawl
         blacklist.waitUntilLoaded();
@@ -132,6 +128,10 @@ public class CrawlerMain extends ProcessMainClass {
         System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
         System.setProperty("sun.net.client.defaultReadTimeout", "30000");
 
+        // Set the maximum number of connections to keep alive in the connection pool
+        System.setProperty("jdk.httpclient.idleTimeout", "15"); // 15 seconds
+        System.setProperty("jdk.httpclient.connectionPoolSize", "256");
+
         // We don't want to use too much memory caching sessions for https
         System.setProperty("javax.net.ssl.sessionCacheSize", "2048");
 
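
A note on the two jdk.httpclient properties added above: these are implementation-specific settings and, as far as I can tell (treat this as an assumption), they are read when the JDK HTTP client machinery is first initialized, which is why they are set in this early startup path before any HttpClient exists. A minimal sketch of that ordering:

    class HttpClientTuningSketch {
        public static void main(String[] args) {
            // Same property names as in the patch; exact semantics depend on the JDK build.
            System.setProperty("jdk.httpclient.idleTimeout", "15");
            System.setProperty("jdk.httpclient.connectionPoolSize", "256");

            // Only create the client after the properties are in place.
            var client = java.net.http.HttpClient.newHttpClient();
            System.out.println(client.version());
        }
    }
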
@@ -364,10 +364,10 @@ public class CrawlerMain extends ProcessMainClass {
                 Files.deleteIfExists(tempFile);
             }
 
-            try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
+            try (var warcRecorder = new WarcRecorder(newWarcFile, fetcher); // write to a temp file for now
                  var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
-                 CrawlDataReference reference = getReference();
+                 CrawlDataReference reference = getReference()
                 )
             {
                 // Resume the crawl if it was aborted
                 if (Files.exists(tempFile)) {
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.fetcher;
 
-import okhttp3.Request;
+import java.net.http.HttpRequest;
 
 /** Encapsulates request modifiers; the ETag and Last-Modified tags for a resource */
 public record ContentTags(String etag, String lastMod) {
@@ -17,14 +17,14 @@ public record ContentTags(String etag, String lastMod) {
     }
 
     /** Paints the tags onto the request builder. */
-    public void paint(Request.Builder getBuilder) {
+    public void paint(HttpRequest.Builder getBuilder) {
 
         if (etag != null) {
-            getBuilder.addHeader("If-None-Match", etag);
+            getBuilder.header("If-None-Match", etag);
         }
 
         if (lastMod != null) {
-            getBuilder.addHeader("If-Modified-Since", lastMod);
+            getBuilder.header("If-Modified-Since", lastMod);
         }
     }
 }
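
ContentTags now paints its ETag and Last-Modified values onto a java.net.http.HttpRequest.Builder instead of an OkHttp request builder. A hedged usage sketch (the tag values, URL and 304 handling are illustrative, not from the patch):

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;
    import java.time.Duration;

    class ConditionalGetSketch {
        static void revisit(HttpClient client, ContentTags tags) throws Exception {
            var getBuilder = HttpRequest.newBuilder()
                    .GET()
                    .uri(URI.create("https://marginalia-search.com/"))
                    .timeout(Duration.ofSeconds(10));

            tags.paint(getBuilder);   // adds If-None-Match / If-Modified-Since when present

            var rsp = client.send(getBuilder.build(), HttpResponse.BodyHandlers.ofByteArray());
            if (rsp.statusCode() == 304) {
                // Not modified: the previously stored copy can be reused.
            }
        }
    }
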
@@ -1,33 +1,14 @@
 package nu.marginalia.crawl.fetcher;
 
-import okhttp3.Cookie;
-import okhttp3.CookieJar;
-import okhttp3.HttpUrl;
+import java.io.IOException;
+import java.net.CookieHandler;
+import java.net.URI;
 
-import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 
-public class Cookies {
-    final ThreadLocal<ConcurrentHashMap<String, List<Cookie>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
-
-    public CookieJar getJar() {
-        return new CookieJar() {
-
-            @Override
-            public void saveFromResponse(HttpUrl url, List<Cookie> cookies) {
-
-                if (!cookies.isEmpty()) {
-                    cookieJar.get().put(url.host(), cookies);
-                }
-            }
-
-            @Override
-            public List<Cookie> loadForRequest(HttpUrl url) {
-                return cookieJar.get().getOrDefault(url.host(), Collections.emptyList());
-            }
-        };
-    }
-
+public class Cookies extends CookieHandler {
+    final ThreadLocal<ConcurrentHashMap<String, List<String>>> cookieJar = ThreadLocal.withInitial(ConcurrentHashMap::new);
     public void clear() {
         cookieJar.get().clear();
@@ -38,6 +19,16 @@ public class Cookies {
     }
 
     public List<String> getCookies() {
-        return cookieJar.get().values().stream().flatMap(List::stream).map(Cookie::toString).toList();
+        return cookieJar.get().values().stream().flatMap(List::stream).toList();
+    }
+
+    @Override
+    public Map<String, List<String>> get(URI uri, Map<String, List<String>> requestHeaders) throws IOException {
+        return cookieJar.get();
+    }
+
+    @Override
+    public void put(URI uri, Map<String, List<String>> responseHeaders) throws IOException {
+        cookieJar.get().putAll(responseHeaders);
     }
 }
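
Cookies is now a java.net.CookieHandler, which is the hook the JDK HttpClient consults before and after every exchange. A small sketch of the wiring (the stock CookieManager is used here purely as a stand-in for the custom handler above):

    import java.net.CookieManager;
    import java.net.http.HttpClient;

    class CookieWiringSketch {
        static HttpClient build() {
            return HttpClient.newBuilder()
                    .cookieHandler(new CookieManager())   // the crawler passes its Cookies instance instead
                    .build();
        }
    }
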
@@ -3,6 +3,7 @@ package nu.marginalia.crawl.fetcher;
 import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -11,10 +12,10 @@ import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import java.util.List;
 
 @ImplementedBy(HttpFetcherImpl.class)
-public interface HttpFetcher {
+public interface HttpFetcher extends AutoCloseable {
     void setAllowAllContentTypes(boolean allowAllContentTypes);
 
-    List<String> getCookies();
+    Cookies getCookies();
     void clearCookies();
 
     DomainProbeResult probeDomain(EdgeUrl url);
@@ -27,7 +28,9 @@ public interface HttpFetcher {
     HttpFetchResult fetchContent(EdgeUrl url,
                                  WarcRecorder recorder,
                                  ContentTags tags,
-                                 ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;
+                                 ProbeType probeType) throws Exception;
 
+    List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer);
+
     SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
 
@@ -1,35 +1,41 @@
 package nu.marginalia.crawl.fetcher;
 
 import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawlerDomainStatus;
-import okhttp3.ConnectionPool;
-import okhttp3.Dispatcher;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.Parser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.net.ssl.X509TrustManager;
-import java.io.InterruptedIOException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URISyntaxException;
+import java.net.URLDecoder;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
+import java.net.http.HttpResponse;
+import java.net.http.HttpTimeoutException;
+import java.nio.charset.StandardCharsets;
 import java.time.Duration;
-import java.util.List;
-import java.util.Objects;
-import java.util.Optional;
-import java.util.concurrent.TimeUnit;
+import java.util.*;
+import java.util.concurrent.Executors;
+import java.util.zip.GZIPInputStream;
 
 
+@Singleton
 public class HttpFetcherImpl implements HttpFetcher {
 
     private final Logger logger = LoggerFactory.getLogger(getClass());
@@ -40,39 +46,28 @@ public class HttpFetcherImpl implements HttpFetcher {
     private static final SimpleRobotRulesParser robotsParser = new SimpleRobotRulesParser();
     private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic();
 
+    private final Duration requestTimeout = Duration.ofSeconds(10);
+
     @Override
     public void setAllowAllContentTypes(boolean allowAllContentTypes) {
         contentTypeLogic.setAllowAllContentTypes(allowAllContentTypes);
     }
 
-    private final OkHttpClient client;
+    private final HttpClient client;
 
-    private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
-
-    private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
-        var builder = new OkHttpClient.Builder();
-        if (dispatcher != null) {
-            builder.dispatcher(dispatcher);
-        }
-
-        return builder.sslSocketFactory(NoSecuritySSL.buildSocketFactory(), (X509TrustManager) NoSecuritySSL.trustAllCerts[0])
-                .socketFactory(ftSocketFactory)
-                .hostnameVerifier(NoSecuritySSL.buildHostnameVerifyer())
-                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
-                .connectionPool(pool)
-                .cookieJar(cookies.getJar())
-                .followRedirects(true)
-                .followSslRedirects(true)
-                .connectTimeout(8, TimeUnit.SECONDS)
-                .readTimeout(10, TimeUnit.SECONDS)
-                .writeTimeout(10, TimeUnit.SECONDS)
-                .build();
-
+    private HttpClient createClient() {
+        return HttpClient.newBuilder()
+                .sslContext(NoSecuritySSL.buildSslContext())
+                .cookieHandler(cookies)
+                .followRedirects(HttpClient.Redirect.NORMAL)
+                .connectTimeout(Duration.ofSeconds(8))
+                .executor(Executors.newCachedThreadPool())
+                .build();
     }
 
     @Override
-    public List<String> getCookies() {
-        return cookies.getCookies();
+    public Cookies getCookies() {
+        return cookies;
     }
 
     @Override
@@ -81,26 +76,24 @@ public class HttpFetcherImpl implements HttpFetcher {
     }
 
     @Inject
-    public HttpFetcherImpl(UserAgent userAgent,
-                           Dispatcher dispatcher,
-                           ConnectionPool connectionPool)
+    public HttpFetcherImpl(UserAgent userAgent)
     {
-        this.client = createClient(dispatcher, connectionPool);
+        this.client = createClient();
         this.userAgentString = userAgent.uaString();
         this.userAgentIdentifier = userAgent.uaIdentifier();
     }
 
     public HttpFetcherImpl(String userAgent) {
-        this.client = createClient(null, new ConnectionPool());
+        this.client = createClient();
         this.userAgentString = userAgent;
         this.userAgentIdentifier = userAgent;
     }
 
     // Not necessary in prod, but useful in test
     public void close() {
-        client.dispatcher().executorService().shutdown();
-        client.connectionPool().evictAll();
+        client.close();
     }
 
     /**
      * Probe the domain to see if it is reachable, attempting to identify which schema to use,
      * and if there are any redirects. This is done by one or more HEAD requests.
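
The close() method above relies on HttpClient.close(), which exists from Java 21 onward, where HttpClient implements AutoCloseable (an assumption about the target JDK; the patch itself does not state a version). A sketch of the lifecycle this enables:

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;

    class ClientLifecycleSketch {
        public static void main(String[] args) throws Exception {
            // Java 21+: HttpClient implements AutoCloseable
            try (HttpClient client = HttpClient.newHttpClient()) {
                var rsp = client.send(
                        HttpRequest.newBuilder(URI.create("https://marginalia-search.com/")).build(),
                        HttpResponse.BodyHandlers.discarding());
                System.out.println(rsp.statusCode());
            }
        }
    }
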
@@ -110,19 +103,26 @@ public class HttpFetcherImpl implements HttpFetcher {
      */
     @Override
     public DomainProbeResult probeDomain(EdgeUrl url) {
-        var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
-                .url(url.toString())
-                .build();
+        HttpRequest head;
+        try {
+            head = HttpRequest.newBuilder()
+                    .HEAD()
+                    .uri(url.asURI())
+                    .header("User-agent", userAgentString)
+                    .timeout(requestTimeout)
+                    .build();
+        } catch (URISyntaxException e) {
+            return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, "Invalid URL");
+        }
 
-        var call = client.newCall(head);
+        try {
+            var rsp = client.send(head, HttpResponse.BodyHandlers.discarding());
+            EdgeUrl rspUri = new EdgeUrl(rsp.uri());
 
-        try (var rsp = call.execute()) {
-            EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
-
-            if (!Objects.equals(requestUrl.domain, url.domain)) {
-                return new DomainProbeResult.Redirect(requestUrl.domain);
+            if (!Objects.equals(rspUri.domain, url.domain)) {
+                return new DomainProbeResult.Redirect(rspUri.domain);
             }
-            return new DomainProbeResult.Ok(requestUrl);
+            return new DomainProbeResult.Ok(rspUri);
         }
         catch (Exception ex) {
             return new DomainProbeResult.Error(CrawlerDomainStatus.ERROR, ex.getMessage());
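
probeDomain() above boils down to: send a HEAD request, discard the body, and compare the final URI (after redirects) with the one that was asked for. A standalone sketch of that idea (hypothetical host; HttpRequest.Builder.HEAD() needs Java 18 or newer):

    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;
    import java.time.Duration;

    class HeadProbeSketch {
        public static void main(String[] args) throws Exception {
            URI target = URI.create("https://marginalia-search.com/");
            HttpClient client = HttpClient.newBuilder()
                    .followRedirects(HttpClient.Redirect.NORMAL)
                    .build();

            HttpResponse<Void> rsp = client.send(
                    HttpRequest.newBuilder(target).HEAD().timeout(Duration.ofSeconds(10)).build(),
                    HttpResponse.BodyHandlers.discarding());

            // rsp.uri() reflects any redirects that were followed
            if (!target.getHost().equals(rsp.uri().getHost())) {
                System.out.println("domain redirects to " + rsp.uri().getHost());
            }
        }
    }
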
@@ -140,21 +140,25 @@ public class HttpFetcherImpl implements HttpFetcher {
                                                  WarcRecorder warcRecorder,
                                                  ContentTags tags) throws RateLimitException {
         if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) {
-            var headBuilder = new Request.Builder().head()
-                    .addHeader("User-agent", userAgentString)
-                    .addHeader("Accept-Encoding", "gzip")
-                    .url(url.toString());
-
-            var head = headBuilder.build();
-            var call = client.newCall(head);
+            try {
+                var headBuilder = HttpRequest.newBuilder()
+                        .HEAD()
+                        .uri(url.asURI())
+                        .header("User-agent", userAgentString)
+                        .header("Accept-Encoding", "gzip")
+                        .timeout(requestTimeout)
+                        ;
 
-            try (var rsp = call.execute()) {
-                var contentTypeHeader = rsp.header("Content-type");
+                var rsp = client.send(headBuilder.build(), HttpResponse.BodyHandlers.discarding());
+                var headers = rsp.headers();
+
+                var contentTypeHeader = headers.firstValue("Content-Type").orElse(null);
 
                 if (contentTypeHeader != null && !contentTypeLogic.isAllowableContentType(contentTypeHeader)) {
-                    warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.code());
-                    return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.code());
+                    warcRecorder.flagAsFailedContentTypeProbe(url, contentTypeHeader, rsp.statusCode());
+                    return new ContentTypeProbeResult.BadContentType(contentTypeHeader, rsp.statusCode());
                 }
 
                 // Update the URL to the final URL of the HEAD request, otherwise we might end up doing
@@ -168,27 +172,27 @@ public class HttpFetcherImpl implements HttpFetcher {
                 // too many eyebrows when looking at the logs on the target server. Overall it's probably desirable
                 // that it looks like the traffic makes sense, as opposed to looking like a broken bot.
 
-                var redirectUrl = new EdgeUrl(rsp.request().url().toString());
+                var redirectUrl = new EdgeUrl(rsp.uri());
                 EdgeUrl ret;
 
                 if (Objects.equals(redirectUrl.domain, url.domain)) ret = redirectUrl;
                 else ret = url;
 
                 // Intercept rate limiting
-                if (rsp.code() == 429) {
-                    throw new HttpFetcherImpl.RateLimitException(Objects.requireNonNullElse(rsp.header("Retry-After"), "1"));
+                if (rsp.statusCode() == 429) {
+                    throw new HttpFetcherImpl.RateLimitException(headers.firstValue("Retry-After").orElse("1"));
                 }
 
                 return new ContentTypeProbeResult.Ok(ret);
             }
+            catch (HttpTimeoutException ex) {
+                warcRecorder.flagAsTimeout(url);
+                return new ContentTypeProbeResult.Timeout(ex);
+            }
             catch (RateLimitException ex) {
                 throw ex;
             }
-            catch (InterruptedIOException ex) {
-                warcRecorder.flagAsTimeout(url);
-
-                return new ContentTypeProbeResult.Timeout(ex);
-            } catch (Exception ex) {
+            catch (Exception ex) {
                 logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
 
                 warcRecorder.flagAsError(url, ex);
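
The rate-limit branch above reads the optional Retry-After header through HttpHeaders.firstValue(). A fragment-style sketch of that pattern (note that Retry-After may also carry an HTTP-date instead of a number of seconds, which this sketch ignores):

    static void backOffIfRateLimited(java.net.http.HttpResponse<?> rsp) throws InterruptedException {
        if (rsp.statusCode() == 429) {
            long waitSeconds = Long.parseLong(rsp.headers().firstValue("Retry-After").orElse("1"));
            Thread.sleep(waitSeconds * 1000);
        }
    }
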
@@ -210,13 +214,15 @@ public class HttpFetcherImpl implements HttpFetcher {
                                       ProbeType probeType)
             throws Exception
     {
-        var getBuilder = new Request.Builder().get();
-        getBuilder.url(url.toString())
-                .addHeader("Accept-Encoding", "gzip")
-                .addHeader("Accept-Language", "en,*;q=0.5")
-                .addHeader("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
-                .addHeader("User-agent", userAgentString);
+        var getBuilder = HttpRequest.newBuilder()
+                .GET()
+                .uri(url.asURI())
+                .header("User-agent", userAgentString)
+                .header("Accept-Encoding", "gzip")
+                .header("Accept-Language", "en,*;q=0.5")
+                .header("Accept", "text/html, application/xhtml+xml, text/*;q=0.8")
+                .timeout(requestTimeout)
+                ;
 
         contentTags.paint(getBuilder);
 
@@ -242,6 +248,122 @@ public class HttpFetcherImpl implements HttpFetcher {
         return new SitemapRetriever();
     }
 
+    @Override
+    public List<EdgeUrl> fetchSitemapUrls(String root, CrawlDelayTimer delayTimer) {
+        try {
+            List<EdgeUrl> ret = new ArrayList<>();
+
+            Set<String> seenUrls = new HashSet<>();
+            Set<String> seenSitemaps = new HashSet<>();
+
+            Deque<EdgeUrl> sitemapQueue = new LinkedList<>();
+
+            EdgeUrl rootSitemapUrl = new EdgeUrl(root);
+
+            sitemapQueue.add(rootSitemapUrl);
+
+            int fetchedSitemaps = 0;
+
+            while (!sitemapQueue.isEmpty() && ret.size() < 20_000 && ++fetchedSitemaps < 10) {
+                var head = sitemapQueue.removeFirst();
+
+                switch (fetchSitemap(head)) {
+                    case SitemapResult.SitemapUrls(List<String> urls) -> {
+
+                        for (var url : urls) {
+                            if (seenUrls.add(url)) {
+                                EdgeUrl.parse(url)
+                                        .filter(u -> u.domain.equals(rootSitemapUrl.domain))
+                                        .ifPresent(ret::add);
+                            }
+                        }
+
+                    }
+                    case SitemapResult.SitemapReferences(List<String> refs) -> {
+                        for (var ref : refs) {
+                            if (seenSitemaps.add(ref)) {
+                                EdgeUrl.parse(ref)
+                                        .filter(url -> url.domain.equals(rootSitemapUrl.domain))
+                                        .ifPresent(sitemapQueue::addFirst);
+                            }
+                        }
+                    }
+                    case SitemapResult.SitemapError() -> {}
+                }
+
+                delayTimer.waitFetchDelay();
+            }
+
+            return ret;
+        }
+        catch (Exception ex) {
+            logger.error("Error while fetching sitemaps via " + root, ex);
+            return List.of();
+        }
+    }
+
+
+    private SitemapResult fetchSitemap(EdgeUrl sitemapUrl) throws URISyntaxException, IOException, InterruptedException {
+        HttpRequest getRequest = HttpRequest.newBuilder()
+                .GET()
+                .uri(sitemapUrl.asURI())
+                .header("Accept-Encoding", "gzip")
+                .header("Accept", "text/*, */*;q=0.9")
+                .header("User-agent", userAgentString)
+                .timeout(requestTimeout)
+                .build();
+
+        var response = client.send(getRequest, HttpResponse.BodyHandlers.ofInputStream());
+        if (response.statusCode() != 200) {
+            return new SitemapResult.SitemapError();
+        }
+
+        try (InputStream inputStream = response.body()) {
+
+            InputStream parserStream;
+            if (sitemapUrl.path.endsWith(".gz")) {
+                parserStream = new GZIPInputStream(inputStream);
+            }
+            else {
+                parserStream = inputStream;
+            }
+
+            Document parsedSitemap = Jsoup.parse(parserStream, "UTF-8", sitemapUrl.toString(), Parser.xmlParser());
+            String rootTagName = parsedSitemap.child(0).tagName();
+
+            return switch (rootTagName.toLowerCase()) {
+                case "sitemapindex" -> {
+                    List<String> references = new ArrayList<>();
+                    for (var locTag : parsedSitemap.getElementsByTag("loc")) {
+                        references.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
+                    }
+                    yield new SitemapResult.SitemapReferences(Collections.unmodifiableList(references));
+                }
+                case "urlset" -> {
+                    List<String> urls = new ArrayList<>();
+                    for (var locTag : parsedSitemap.select("url > loc")) {
+                        urls.add(URLDecoder.decode(locTag.text().trim(), StandardCharsets.UTF_8));
+                    }
+                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                }
+                case "rss", "atom" -> {
+                    List<String> urls = new ArrayList<>();
+                    for (var locTag : parsedSitemap.select("link, url")) {
+                        urls.add(locTag.text().trim());
+                    }
+                    yield new SitemapResult.SitemapUrls(Collections.unmodifiableList(urls));
+                }
+                default -> new SitemapResult.SitemapError();
+            };
+        }
+    }
+
+    private sealed interface SitemapResult {
+        record SitemapUrls(List<String> urls) implements SitemapResult {}
+        record SitemapReferences(List<String> sitemapRefs) implements SitemapResult {}
+        record SitemapError() implements SitemapResult {}
+    }
+
     @Override
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
         var ret = fetchAndParseRobotsTxt(new EdgeUrl("https", domain, null, "/robots.txt", null), recorder);
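
The new fetchSitemap() parses the downloaded XML with jsoup's XML parser and branches on the root element (urlset, sitemapindex, rss/atom). A self-contained sketch of that dispatch with a made-up urlset document:

    import org.jsoup.Jsoup;
    import org.jsoup.parser.Parser;

    class SitemapParseSketch {
        public static void main(String[] args) {
            String xml = """
                    <?xml version="1.0" encoding="UTF-8"?>
                    <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                      <url><loc>https://marginalia-search.com/about</loc></url>
                      <url><loc>https://marginalia-search.com/search</loc></url>
                    </urlset>
                    """;

            var doc = Jsoup.parse(xml, "", Parser.xmlParser());
            String rootTag = doc.child(0).tagName();   // "urlset" here; an index file would give "sitemapindex"

            if ("urlset".equalsIgnoreCase(rootTag)) {
                for (var loc : doc.select("url > loc")) {
                    System.out.println(loc.text().trim());
                }
            }
        }
    }
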
@@ -257,14 +379,15 @@ public class HttpFetcherImpl implements HttpFetcher {
 
     private Optional<SimpleRobotRules> fetchAndParseRobotsTxt(EdgeUrl url, WarcRecorder recorder) {
         try {
-            var getBuilder = new Request.Builder().get();
+            var getRequest = HttpRequest.newBuilder()
+                    .GET()
+                    .uri(url.asURI())
+                    .header("Accept-Encoding", "gzip")
+                    .header("Accept", "text/*, */*;q=0.9")
+                    .header("User-agent", userAgentString)
+                    .timeout(requestTimeout);
 
-            getBuilder.url(url.toString())
-                    .addHeader("Accept-Encoding", "gzip")
-                    .addHeader("Accept", "text/*, */*;q=0.9")
-                    .addHeader("User-agent", userAgentString);
-
-            HttpFetchResult result = recorder.fetch(client, getBuilder.build());
+            HttpFetchResult result = recorder.fetch(client, getRequest.build());
 
             return DocumentBodyExtractor.asBytes(result).mapOpt((contentType, body) ->
                     robotsParser.parseContent(url.toString(),
@@ -1,31 +0,0 @@
-package nu.marginalia.crawl.fetcher.socket;
-
-import okhttp3.Interceptor;
-import okhttp3.Response;
-import org.jetbrains.annotations.NotNull;
-
-import java.io.IOException;
-
-
-/** An interceptor that intercepts network requests and adds the remote IP address as
- * a header in the response. This is used to pass the remote IP address to the Warc
- * writer, as this information is not available in the response.
- */
-public class IpInterceptingNetworkInterceptor implements Interceptor {
-    private static final String pseudoHeaderName = "X-Marginalia-Remote-IP";
-
-    @NotNull
-    @Override
-    public Response intercept(@NotNull Interceptor.Chain chain) throws IOException {
-        String IP = chain.connection().socket().getInetAddress().getHostAddress();
-
-        return chain.proceed(chain.request())
-                .newBuilder()
-                .addHeader(pseudoHeaderName, IP)
-                .build();
-    }
-
-    public static String getIpFromResponse(Response response) {
-        return response.header(pseudoHeaderName);
-    }
-}
@@ -27,7 +27,7 @@ public class NoSecuritySSL {
         }
     };
 
-    public static SSLSocketFactory buildSocketFactory() {
+    public static SSLContext buildSslContext() {
         try {
             // Install the all-trusting trust manager
             final SSLContext sslContext = SSLContext.getInstance("TLS");
@@ -40,14 +40,11 @@ public class NoSecuritySSL {
             clientSessionContext.setSessionCacheSize(2048);
 
             // Create a ssl socket factory with our all-trusting manager
-            return sslContext.getSocketFactory();
+            return sslContext;
         }
         catch (Exception e) {
             throw new RuntimeException(e);
         }
     }
 
-    public static HostnameVerifier buildHostnameVerifyer() {
-        return (hn, session) -> true;
-    }
 }
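
buildSslContext() above returns an SSLContext wired to the class's all-trusting trust manager so the crawler can still reach sites with broken certificates. As a hedged sketch of what such a context generally looks like (not the project's exact code): it deliberately disables certificate validation and should not be reused anywhere authenticity matters.

    import javax.net.ssl.SSLContext;
    import javax.net.ssl.TrustManager;
    import javax.net.ssl.X509TrustManager;
    import java.security.SecureRandom;
    import java.security.cert.X509Certificate;

    class TrustAllSslSketch {
        static SSLContext build() throws Exception {
            TrustManager[] trustAll = { new X509TrustManager() {
                public void checkClientTrusted(X509Certificate[] chain, String authType) {}
                public void checkServerTrusted(X509Certificate[] chain, String authType) {}
                public X509Certificate[] getAcceptedIssuers() { return new X509Certificate[0]; }
            }};

            SSLContext ctx = SSLContext.getInstance("TLS");
            ctx.init(null, trustAll, new SecureRandom());
            return ctx;
        }
    }
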
@@ -1,14 +1,14 @@
 package nu.marginalia.crawl.fetcher.warc;
 
-import okhttp3.Headers;
-import okhttp3.Response;
 import org.apache.commons.io.input.BOMInputStream;
 import org.netpreserve.jwarc.WarcTruncationReason;
 
 import java.io.*;
+import java.net.http.HttpHeaders;
+import java.net.http.HttpResponse;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Objects;
+import java.util.Map;
 import java.util.zip.GZIPInputStream;
 
 /** Input buffer for temporary storage of a HTTP response
@@ -17,8 +17,9 @@ import java.util.zip.GZIPInputStream;
  * */
 public abstract class WarcInputBuffer implements AutoCloseable {
     protected WarcTruncationReason truncationReason = WarcTruncationReason.NOT_TRUNCATED;
-    protected Headers headers;
+    protected HttpHeaders headers;
 
-    WarcInputBuffer(Headers headers) {
+    WarcInputBuffer(HttpHeaders headers) {
         this.headers = headers;
     }
 
@@ -30,7 +31,7 @@ public abstract class WarcInputBuffer implements AutoCloseable {
 
     public final WarcTruncationReason truncationReason() { return truncationReason; }
 
-    public final Headers headers() { return headers; }
+    public final HttpHeaders headers() { return headers; }
 
     /** Create a buffer for a response.
      * If the response is small and not compressed, it will be stored in memory.
@@ -38,26 +39,27 @@ public abstract class WarcInputBuffer implements AutoCloseable {
      * and suppressed from the headers.
      * If an error occurs, a buffer will be created with no content and an error status.
      */
-    static WarcInputBuffer forResponse(Response rsp) {
+    static WarcInputBuffer forResponse(HttpResponse<InputStream> rsp) {
         if (rsp == null)
             return new ErrorBuffer();
 
-        try {
-            String contentLengthHeader = Objects.requireNonNullElse(rsp.header("Content-Length"), "-1");
-            int contentLength = Integer.parseInt(contentLengthHeader);
-            String contentEncoding = rsp.header("Content-Encoding");
+        var headers = rsp.headers();
+
+        try (var is = rsp.body()) {
+            int contentLength = (int) headers.firstValueAsLong("Content-Length").orElse(-1L);
+            String contentEncoding = headers.firstValue("Content-Encoding").orElse(null);
 
             if (contentEncoding == null && contentLength > 0 && contentLength < 8192) {
                 // If the content is small and not compressed, we can just read it into memory
-                return new MemoryBuffer(rsp, contentLength);
+                return new MemoryBuffer(headers, is, contentLength);
             }
             else {
                 // Otherwise, we unpack it into a file and read it from there
-                return new FileBuffer(rsp);
+                return new FileBuffer(headers, is);
             }
         }
         catch (Exception ex) {
-            return new ErrorBuffer(rsp);
+            return new ErrorBuffer();
         }
 
     }
@@ -99,12 +101,8 @@ public abstract class WarcInputBuffer implements AutoCloseable {
 /** Pseudo-buffer for when we have an error */
 class ErrorBuffer extends WarcInputBuffer {
     public ErrorBuffer() {
-        super(Headers.of());
-        truncationReason = WarcTruncationReason.UNSPECIFIED;
-    }
-
-    public ErrorBuffer(Response rsp) {
-        super(rsp.headers());
+        super(HttpHeaders.of(Map.of(), (k,v)->false));
         truncationReason = WarcTruncationReason.UNSPECIFIED;
     }
 
@@ -125,12 +123,12 @@ class ErrorBuffer extends WarcInputBuffer {
 /** Buffer for when we have the response in memory */
 class MemoryBuffer extends WarcInputBuffer {
     byte[] data;
-    public MemoryBuffer(Response response, int size) {
-        super(response.headers());
+    public MemoryBuffer(HttpHeaders headers, InputStream responseStream, int size) {
+        super(headers);
 
         var outputStream = new ByteArrayOutputStream(size);
 
-        copy(response.body().byteStream(), outputStream);
+        copy(responseStream, outputStream);
 
         data = outputStream.toByteArray();
     }
@@ -154,19 +152,15 @@ class MemoryBuffer extends WarcInputBuffer {
 class FileBuffer extends WarcInputBuffer {
     private final Path tempFile;
 
-    public FileBuffer(Response response) throws IOException {
-        super(suppressContentEncoding(response.headers()));
+    public FileBuffer(HttpHeaders headers, InputStream responseStream) throws IOException {
+        super(suppressContentEncoding(headers));
 
         this.tempFile = Files.createTempFile("rsp", ".html");
 
-        if (response.body() == null) {
-            truncationReason = WarcTruncationReason.DISCONNECT;
-            return;
-        }
-
-        if ("gzip".equals(response.header("Content-Encoding"))) {
+        if ("gzip".equalsIgnoreCase(headers.firstValue("Content-Encoding").orElse(""))) {
             try (var out = Files.newOutputStream(tempFile)) {
-                copy(new GZIPInputStream(response.body().byteStream()), out);
+                copy(new GZIPInputStream(responseStream), out);
             }
             catch (Exception ex) {
                 truncationReason = WarcTruncationReason.UNSPECIFIED;
@@ -174,7 +168,7 @@ class FileBuffer extends WarcInputBuffer {
             }
         }
         else {
             try (var out = Files.newOutputStream(tempFile)) {
-                copy(response.body().byteStream(), out);
+                copy(responseStream, out);
             }
             catch (Exception ex) {
                 truncationReason = WarcTruncationReason.UNSPECIFIED;
@@ -182,22 +176,13 @@ class FileBuffer extends WarcInputBuffer {
         }
     }
 
-    private static Headers suppressContentEncoding(Headers headers) {
-        var builder = new Headers.Builder();
-
-        headers.toMultimap().forEach((k, values) -> {
+    private static HttpHeaders suppressContentEncoding(HttpHeaders headers) {
+        return HttpHeaders.of(headers.map(), (k, v) -> {
             if ("Content-Encoding".equalsIgnoreCase(k)) {
-                return;
+                return false;
             }
-            if ("Transfer-Encoding".equalsIgnoreCase(k)) {
-                return;
-            }
-            for (var value : values) {
-                builder.add(k, value);
-            }
+            return !"Transfer-Encoding".equalsIgnoreCase(k);
         });
-
-        return builder.build();
     }
 
 
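
suppressContentEncoding() now leans on HttpHeaders.of(map, filter), which copies a header map while dropping every entry the BiPredicate rejects. A small demonstration of that idiom:

    import java.net.http.HttpHeaders;
    import java.util.List;
    import java.util.Map;

    class HeaderFilterSketch {
        public static void main(String[] args) {
            HttpHeaders original = HttpHeaders.of(Map.of(
                    "Content-Type", List.of("text/html"),
                    "Content-Encoding", List.of("gzip")), (k, v) -> true);

            HttpHeaders filtered = HttpHeaders.of(original.map(),
                    (k, v) -> !"Content-Encoding".equalsIgnoreCase(k));

            System.out.println(filtered.firstValue("Content-Encoding").isEmpty()); // true
            System.out.println(filtered.firstValue("Content-Type").orElse(""));    // text/html
        }
    }
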
|
@ -1,11 +1,12 @@
|
|||||||
package nu.marginalia.crawl.fetcher.warc;
|
package nu.marginalia.crawl.fetcher.warc;
|
||||||
|
|
||||||
import okhttp3.Protocol;
|
|
||||||
import okhttp3.Response;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
|
import java.net.http.HttpClient;
|
||||||
|
import java.net.http.HttpHeaders;
|
||||||
|
import java.net.http.HttpResponse;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
@@ -75,13 +76,13 @@ public class WarcProtocolReconstructor {
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }
 
-    static String getResponseHeader(Response response, long size) {
-        String version = response.protocol() == Protocol.HTTP_1_1 ? "1.1" : "2.0";
+    static String getResponseHeader(HttpResponse<?> response, long size) {
+        String version = response.version() == HttpClient.Version.HTTP_1_1 ? "1.1" : "2.0";
 
-        String statusCode = String.valueOf(response.code());
-        String statusMessage = STATUS_CODE_MAP.getOrDefault(response.code(), "Unknown");
+        String statusCode = String.valueOf(response.statusCode());
+        String statusMessage = STATUS_CODE_MAP.getOrDefault(response.statusCode(), "Unknown");
 
-        String headerString = getHeadersAsString(response, size);
+        String headerString = getHeadersAsString(response.headers(), size);
 
         return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }
@@ -148,10 +149,10 @@ public class WarcProtocolReconstructor {
         return joiner.toString();
     }
 
-    static private String getHeadersAsString(Response response, long responseSize) {
+    static private String getHeadersAsString(HttpHeaders headers, long responseSize) {
         StringJoiner joiner = new StringJoiner("\r\n");
 
-        response.headers().toMultimap().forEach((k, values) -> {
+        headers.map().forEach((k, values) -> {
             String headerCapitalized = capitalizeHeader(k);
 
             // Omit pseudoheaders injected by the crawler itself
@@ -179,8 +180,8 @@ public class WarcProtocolReconstructor {
         return joiner.toString();
     }
 
-    // okhttp gives us flattened headers, so we need to reconstruct Camel-Kebab-Case style
-    // for the WARC parser's sake...
+    // okhttp gave us flattened headers, so we need to reconstruct Camel-Kebab-Case style
+    // for the WARC parser's sake... (do we still need this, mr chesterton?)
     static private String capitalizeHeader(String k) {
         return Arrays.stream(StringUtils.split(k, '-'))
                 .map(StringUtils::capitalize)
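
The capitalizeHeader() helper kept above re-cases the lower-cased header names reported by the JDK client into the Camel-Kebab-Case that the WARC tooling expects. The joining step in this sketch is an assumption, since the hunk is cut off before the end of the method:

    import org.apache.commons.lang3.StringUtils;
    import java.util.Arrays;
    import java.util.stream.Collectors;

    class CapitalizeHeaderSketch {
        static String capitalizeHeader(String k) {
            return Arrays.stream(StringUtils.split(k, '-'))
                    .map(StringUtils::capitalize)
                    .collect(Collectors.joining("-"));   // assumed completion of the truncated method
        }

        public static void main(String[] args) {
            System.out.println(capitalizeHeader("content-type"));    // Content-Type
            System.out.println(capitalizeHeader("x-has-cookies"));   // X-Has-Cookies
        }
    }
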
@@ -1,13 +1,11 @@
 package nu.marginalia.crawl.fetcher.warc;
 
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
 import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
@@ -18,6 +16,7 @@ import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
 import java.net.URISyntaxException;
+import java.net.http.HttpClient;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -28,7 +27,7 @@ import java.util.*;
 
 /** Based on JWarc's fetch method, APL 2.0 license
  * <p></p>
- * This class wraps OkHttp's OkHttpClient and records the HTTP request and response in a WARC file,
+ * This class wraps HttpClient and records the HTTP request and response in a WARC file,
  * as best is possible given not all the data is available at the same time and needs to
  * be reconstructed.
  */
@@ -48,20 +47,22 @@ public class WarcRecorder implements AutoCloseable {
     // Affix a version string in case we need to change the format in the future
     // in some way
     private final String warcRecorderVersion = "1.0";
+    private final Cookies cookies;
-    // We need to know if the site uses cookies so this can be reported among the search results
-    // -- flip this to true if we see any cookies. This information will also be painted on any
-    // revisited pages. It's not 100% perfect and a bit order dependent, but it's good enough.
-    private final WarcXCookieInformationHeader cookieInformation = new WarcXCookieInformationHeader();
 
     /**
      * Create a new WarcRecorder that will write to the given file
     *
      * @param warcFile The file to write to
      */
-    public WarcRecorder(Path warcFile) throws IOException {
+    public WarcRecorder(Path warcFile, HttpFetcherImpl fetcher) throws IOException {
         this.warcFile = warcFile;
         this.writer = new WarcWriter(warcFile);
+        this.cookies = fetcher.getCookies();
+    }
+
+    public WarcRecorder(Path warcFile, Cookies cookies) throws IOException {
+        this.warcFile = warcFile;
+        this.writer = new WarcWriter(warcFile);
+        this.cookies = cookies;
     }
 
     /**
@@ -71,38 +72,41 @@ public class WarcRecorder implements AutoCloseable {
     public WarcRecorder() throws IOException {
         this.warcFile = Files.createTempFile("warc", ".warc.gz");
         this.writer = new WarcWriter(this.warcFile);
+        this.cookies = new Cookies();
 
         temporaryFile = true;
     }
 
-    public HttpFetchResult fetch(OkHttpClient client, Request request) throws NoSuchAlgorithmException,
-            IOException,
-            URISyntaxException,
-            InterruptedException
+    public HttpFetchResult fetch(HttpClient client,
+                                 java.net.http.HttpRequest request)
+            throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException
     {
-        URI requestUri = request.url().uri();
+        URI requestUri = request.uri();
 
         WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
         WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
 
-        String ip;
         Instant date = Instant.now();
 
-        var call = client.newCall(request);
+        var response = client.send(request, java.net.http.HttpResponse.BodyHandlers.ofInputStream());
 
-        cookieInformation.update(client, request.url());
+        Map<String, List<String>> extraHeaders = new HashMap<>();
 
-        try (var response = call.execute();
-             WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
+        // Not entirely sure why we need to do this, but keeping it due to Chesterton's Fence
+        extraHeaders.putAll(request.headers().map());
+
+        try (WarcInputBuffer inputBuffer = WarcInputBuffer.forResponse(response))
         {
+            if (cookies.hasCookies()) {
+                extraHeaders.put("X-Has-Cookies", List.of("1"));
+            }
+
             byte[] responseHeaders = WarcProtocolReconstructor.getResponseHeader(response, inputBuffer.size()).getBytes(StandardCharsets.UTF_8);
 
             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(inputBuffer.size() + responseHeaders.length);
             InputStream inputStream = inputBuffer.read();
 
-            ip = IpInterceptingNetworkInterceptor.getIpFromResponse(response);
-
             responseDataBuffer.put(responseHeaders);
             responseDataBuffer.updateDigest(responseDigestBuilder, 0, responseHeaders.length);
 
@ -125,17 +129,15 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
|
|
||||||
// It looks like this might be the same as requestUri, but it's not;
|
// It looks like this might be the same as requestUri, but it's not;
|
||||||
// it's the URI after resolving redirects.
|
// it's the URI after resolving redirects.
|
||||||
final URI responseUri = response.request().url().uri();
|
final URI responseUri = response.uri();
|
||||||
|
|
||||||
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
WarcResponse.Builder responseBuilder = new WarcResponse.Builder(responseUri)
|
||||||
.blockDigest(responseDigestBuilder.build())
|
.blockDigest(responseDigestBuilder.build())
|
||||||
.date(date)
|
.date(date)
|
||||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||||
|
|
||||||
cookieInformation.paint(responseBuilder);
|
InetAddress inetAddress = InetAddress.getByName(responseUri.getHost());
|
||||||
|
responseBuilder.ipAddress(inetAddress);
|
||||||
if (ip != null) responseBuilder.ipAddress(InetAddress.getByName(ip));
|
|
||||||
|
|
||||||
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
responseBuilder.payloadDigest(payloadDigestBuilder.build());
|
||||||
responseBuilder.truncated(inputBuffer.truncationReason());
|
responseBuilder.truncated(inputBuffer.truncationReason());
|
||||||
|
|
||||||
@ -152,8 +154,8 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
byte[] httpRequestString = WarcProtocolReconstructor
|
byte[] httpRequestString = WarcProtocolReconstructor
|
||||||
.getHttpRequestString(
|
.getHttpRequestString(
|
||||||
response.request().method(),
|
response.request().method(),
|
||||||
response.request().headers().toMultimap(),
|
response.request().headers().map(),
|
||||||
request.headers().toMultimap(),
|
extraHeaders,
|
||||||
requestUri)
|
requestUri)
|
||||||
.getBytes();
|
.getBytes();
|
||||||
|
|
||||||
@ -171,7 +173,7 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
|
|
||||||
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
if (Duration.between(date, Instant.now()).compareTo(Duration.ofSeconds(9)) > 0
|
||||||
&& inputBuffer.size() < 2048
|
&& inputBuffer.size() < 2048
|
||||||
&& !request.url().encodedPath().endsWith("robots.txt")) // don't bail on robots.txt
|
&& !request.uri().getPath().endsWith("robots.txt")) // don't bail on robots.txt
|
||||||
{
|
{
|
||||||
// Fast detection and mitigation of crawler traps that respond with slow
|
// Fast detection and mitigation of crawler traps that respond with slow
|
||||||
// small responses, with a high branching factor
|
// small responses, with a high branching factor
|
||||||
@ -189,9 +191,9 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return new HttpFetchResult.ResultOk(responseUri,
|
return new HttpFetchResult.ResultOk(responseUri,
|
||||||
response.code(),
|
response.statusCode(),
|
||||||
inputBuffer.headers(),
|
inputBuffer.headers(),
|
||||||
ip,
|
inetAddress.getHostAddress(),
|
||||||
responseDataBuffer.data,
|
responseDataBuffer.data,
|
||||||
dataStart,
|
dataStart,
|
||||||
responseDataBuffer.length() - dataStart);
|
responseDataBuffer.length() - dataStart);
|
||||||
@ -267,7 +269,9 @@ public class WarcRecorder implements AutoCloseable {
|
|||||||
.date(Instant.now())
|
.date(Instant.now())
|
||||||
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
.body(MediaType.HTTP_RESPONSE, responseDataBuffer.copyBytes());
|
||||||
|
|
||||||
cookieInformation.paint(builder);
|
if (cookies.hasCookies()) {
|
||||||
|
builder.addHeader("X-Has-Cookies", "1");
|
||||||
|
}
|
||||||
|
|
||||||
var reference = builder.build();
|
var reference = builder.build();
|
||||||
|
|
||||||
|
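For orientation, here is a small usage sketch of the reworked recording path, assembled from the signatures visible in this diff. The recorder is now constructed with an explicit Cookies object, and fetch() takes the JDK HttpClient and java.net.http.HttpRequest instead of the OkHttp types. The temp-file path and user-agent string below are placeholders, not values from the patch.

import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.nio.file.Path;

class WarcRecorderUsageSketch {
    public static void main(String[] args) throws Exception {
        // The JDK client replaces the OkHttpClient that the old fetch() signature required.
        HttpClient client = HttpClient.newBuilder().build();

        // Hypothetical output path; the Cookies instance is what the recorder now consults
        // when deciding whether to emit the X-Has-Cookies marker header.
        try (var recorder = new WarcRecorder(Path.of("/tmp/example.warc.gz"), new Cookies())) {
            var request = HttpRequest.newBuilder()
                    .uri(new URI("https://www.marginalia.nu/"))
                    .header("User-agent", "test.marginalia.nu")
                    .header("Accept-Encoding", "gzip")
                    .GET().build();

            // Sends the request, writes the request and response records to the WARC,
            // and returns an HttpFetchResult describing the outcome.
            var result = recorder.fetch(client, request);
            System.out.println(result);
        }
    }
}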
@@ -12,7 +12,6 @@ import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
-import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -53,7 +52,6 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;

-    private final SitemapFetcher sitemapFetcher;
     int errorCount = 0;

     public CrawlerRetreiver(HttpFetcher fetcher,
@@ -71,7 +69,6 @@ public class CrawlerRetreiver implements AutoCloseable {

         crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls(), specs.crawlDepth());
         crawlerRevisitor = new CrawlerRevisitor(crawlFrontier, this, warcRecorder);
-        sitemapFetcher = new SitemapFetcher(crawlFrontier, fetcher.createSitemapRetriever());

         // We must always crawl the index page first, this is assumed when fingerprinting the server
         var fst = crawlFrontier.peek();
@@ -145,9 +142,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         // Add external links to the crawl frontier
         crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));

-        // Add links from the sitemap to the crawl frontier
-        sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
+        // Fetch sitemaps
+        for (var sitemap : robotsRules.getSitemaps()) {
+            crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
+        }

         while (!crawlFrontier.isEmpty()
                 && !crawlFrontier.isCrawlDepthReached()
@@ -271,10 +270,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         }

         // Download the sitemap if available
-        if (feedLink.isPresent()) {
-            sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
-            timer.waitFetchDelay(0);
-        }
+        feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));

         // Grab the favicon if it exists
         fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
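The net effect of the CrawlerRetreiver change is that sitemap discovery no longer goes through the SitemapFetcher helper deleted below; the retriever asks the HttpFetcher directly and feeds the results straight into the crawl frontier. A hedged sketch of the equivalent logic, using only the names that appear in the diff (robotsRules, fetcher, crawlFrontier and delayTimer are fields or parameters of the surrounding class):

// Sketch of the new inline sitemap handling; it mirrors the added lines above
// rather than introducing any new API.
void queueSitemaps(SimpleRobotRules robotsRules,
                   HttpFetcher fetcher,
                   DomainCrawlFrontier crawlFrontier,
                   CrawlDelayTimer delayTimer) {
    // robots.txt may advertise several sitemap URLs; each is fetched by the
    // HttpFetcher itself (with the crawl delay applied) and the discovered
    // URLs are appended to the frontier queue.
    for (var sitemap : robotsRules.getSitemaps()) {
        crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
    }
}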
@@ -1,72 +0,0 @@
-package nu.marginalia.crawl.retreival.sitemap;
-
-import crawlercommons.robots.SimpleRobotRules;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
-import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
-import nu.marginalia.model.EdgeUrl;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import java.util.HashSet;
-import java.util.List;
-import java.util.Optional;
-import java.util.Set;
-
-public class SitemapFetcher {
-
-    private final DomainCrawlFrontier crawlFrontier;
-    private final SitemapRetriever sitemapRetriever;
-    private static final Logger logger = LoggerFactory.getLogger(SitemapFetcher.class);
-
-    public SitemapFetcher(DomainCrawlFrontier crawlFrontier, SitemapRetriever sitemapRetriever) {
-        this.crawlFrontier = crawlFrontier;
-        this.sitemapRetriever = sitemapRetriever;
-    }
-
-    public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
-        List<String> urls = robotsRules.getSitemaps();
-
-        if (urls.isEmpty()) {
-            urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
-        }
-
-        downloadSitemaps(urls);
-    }
-
-    public void downloadSitemaps(List<String> urls) {
-
-        Set<String> checkedSitemaps = new HashSet<>();
-
-        for (var rawUrl : urls) {
-            Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
-            if (parsedUrl.isEmpty()) {
-                continue;
-            }
-
-            EdgeUrl url = parsedUrl.get();
-
-            // Let's not download sitemaps from other domains for now
-            if (!crawlFrontier.isSameDomain(url)) {
-                continue;
-            }
-
-            if (checkedSitemaps.contains(url.path))
-                continue;
-
-            var sitemap = sitemapRetriever.fetchSitemap(url);
-            if (sitemap.isEmpty()) {
-                continue;
-            }
-
-            // ensure we don't try to download this sitemap again
-            // (don't move this up, as we may want to check the same
-            // path with different protocols until we find one that works)
-
-            checkedSitemaps.add(url.path);
-
-            crawlFrontier.addAllToQueue(sitemap);
-        }
-
-        logger.debug("Queue is now {}", crawlFrontier.queueSize());
-    }
-}
@@ -36,7 +36,6 @@ dependencies {
     implementation libs.gson
     implementation libs.commons.io
     implementation libs.commons.lang3
-    implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.snakeyaml
     implementation libs.zstd
@@ -1,17 +1,17 @@
 package nu.marginalia.model.body;

 import nu.marginalia.contenttype.ContentType;
-import okhttp3.Headers;
+import org.jetbrains.annotations.Nullable;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.netpreserve.jwarc.MessageHeaders;
 import org.netpreserve.jwarc.WarcResponse;

 import java.io.ByteArrayInputStream;
-import java.io.IOException;
 import java.io.InputStream;
 import java.net.InetAddress;
 import java.net.URI;
+import java.net.http.HttpHeaders;
 import java.util.Optional;

 /* FIXME: This interface has a very unfortunate name that is not very descriptive.
@@ -56,42 +56,26 @@ public sealed interface HttpFetchResult {
     */
    record ResultOk(URI uri,
                    int statusCode,
-                   Headers headers,
+                   HttpHeaders headers,
                    String ipAddress,
                    byte[] bytesRaw,
                    int bytesStart,
                    int bytesLength
    ) implements HttpFetchResult {

+        public ResultOk(URI uri, int status, MessageHeaders headers, String ipAddress, byte[] bytes, int bytesStart, int length) {
+            this(uri, status, HttpHeaders.of(headers.map(), (k,v) -> true), ipAddress, bytes, bytesStart, length);
+        }
+
        public boolean isOk() {
            return statusCode >= 200 && statusCode < 300;
        }

-        public ResultOk(URI uri,
-                        int statusCode,
-                        MessageHeaders headers,
-                        String ipAddress,
-                        byte[] bytesRaw,
-                        int bytesStart,
-                        int bytesLength) {
-            this(uri, statusCode, convertHeaders(headers), ipAddress, bytesRaw, bytesStart, bytesLength);
-        }
-
-        private static Headers convertHeaders(MessageHeaders headers) {
-            var ret = new Headers.Builder();
-            for (var header : headers.map().entrySet()) {
-                for (var value : header.getValue()) {
-                    ret.add(header.getKey(), value);
-                }
-            }
-            return ret.build();
-        }
-
        public InputStream getInputStream() {
            return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
        }

-        public Optional<Document> parseDocument() throws IOException {
+        public Optional<Document> parseDocument() {
            return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
                if (contentType.is("text/html")) {
                    return Optional.of(Jsoup.parse(body));
@@ -102,8 +86,9 @@ public sealed interface HttpFetchResult {
            });
        }

+        @Nullable
        public String header(String name) {
-            return headers.get(name);
+            return headers.firstValue(name).orElse(null);
        }

    }
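The header plumbing in ResultOk now standardizes on java.net.http.HttpHeaders: the jwarc MessageHeaders map is wrapped via HttpHeaders.of, and lookups use firstValue instead of OkHttp's get. A small stand-alone sketch of the same pattern; the sample header values are made up for illustration:

import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;

class HeaderConversionSketch {
    public static void main(String[] args) {
        // A multimap shaped like the one MessageHeaders.map() exposes; values are invented.
        Map<String, List<String>> raw = Map.of(
                "Content-Type", List.of("text/html; charset=UTF-8"),
                "ETag", List.of("\"abc123\"")
        );

        // The (k, v) -> true filter accepts every header, mirroring the new ResultOk constructor.
        HttpHeaders headers = HttpHeaders.of(raw, (k, v) -> true);

        // firstValue() returns Optional<String>, hence the orElse(null) in header(String name).
        String etag = headers.firstValue("ETag").orElse(null);
        String lastModified = headers.firstValue("Last-Modified").orElse(null);

        System.out.println(etag);          // "abc123"
        System.out.println(lastModified);  // null
    }
}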
@@ -165,27 +165,28 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             contentType = "";
         }

-        String headersStr = null;
         StringJoiner headersStrBuilder = new StringJoiner("\n");
-        for (var header : headers) {
-            headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
+        for (var header : headers.map().entrySet()) {
+            for (var value : header.getValue()) {
+                headersStrBuilder.add(header.getKey() + ": " + value);
+            }
         }
-        headersStr = headersStrBuilder.toString();
+        String headersStr = headersStrBuilder.toString();


         write(new CrawledDocumentParquetRecord(
                 domain,
                 response.target(),
                 fetchOk.ipAddress(),
-                WarcXCookieInformationHeader.hasCookies(response),
+                headers.firstValue("X-Has-Cookies").orElse("0").equals("1"),
                 fetchOk.statusCode(),
                 response.date(),
                 contentType,
                 bodyBytes,
                 headersStr,
-                headers.get("ETag"),
-                headers.get("Last-Modified"))
-        );
+                headers.firstValue("ETag").orElse(null),
+                headers.firstValue("Last-Modified").orElse(null)
+        ));
    }
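For clarity, the rewritten serialization loop flattens the HttpHeaders multimap into one "Name: value" line per value before storing it in the parquet record. An isolated sketch of that loop; the header content is illustrative only:

import java.net.http.HttpHeaders;
import java.util.List;
import java.util.Map;
import java.util.StringJoiner;

class HeaderFlatteningSketch {
    public static void main(String[] args) {
        HttpHeaders headers = HttpHeaders.of(Map.of(
                "Set-Cookie", List.of("a=1", "b=2"),   // a repeated header yields two lines
                "Content-Type", List.of("text/html")
        ), (k, v) -> true);

        // Same shape as the loop in CrawledDocumentParquetRecordFileWriter:
        // one "Name: value" line per value, joined with newlines.
        StringJoiner headersStrBuilder = new StringJoiner("\n");
        for (var header : headers.map().entrySet()) {
            for (var value : header.getValue()) {
                headersStrBuilder.add(header.getKey() + ": " + value);
            }
        }
        String headersStr = headersStrBuilder.toString();

        System.out.println(headersStr);
    }
}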
@@ -1,35 +0,0 @@
-package org.netpreserve.jwarc;
-
-import okhttp3.HttpUrl;
-import okhttp3.OkHttpClient;
-
-/** Encapsulates out-of-band information about whether a website uses cookies,
- * using a non-standard WARC header "X-Has-Cookies".
- */
-public class WarcXCookieInformationHeader {
-    private boolean hasCookies = false;
-    private static final String headerName = "X-Has-Cookies";
-
-    public void update(OkHttpClient client, HttpUrl url) {
-        if (!hasCookies) {
-            hasCookies = !client.cookieJar().loadForRequest(url).isEmpty();
-        }
-    }
-
-    public boolean hasCookies() {
-        return hasCookies;
-    }
-
-    public void paint(WarcResponse.Builder builder) {
-        builder.addHeader(headerName, hasCookies ? "1" : "0");
-    }
-    public void paint(WarcXResponseReference.Builder builder) {
-        builder.addHeader(headerName, hasCookies ? "1" : "0");
-    }
-
-    public static boolean hasCookies(WarcRecord record) {
-        return record.headers().contains(headerName, "1");
-    }
-
-
-}
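With WarcXCookieInformationHeader gone, the cookie signal travels as a plain "X-Has-Cookies" response header: WarcRecorder adds it when its Cookies object is non-empty, and the parquet writer reads it back with firstValue. A hedged sketch of both halves of that contract, using only calls that appear elsewhere in this diff (the builder and headers arguments stand in for the real objects):

import java.net.http.HttpHeaders;
import org.netpreserve.jwarc.WarcResponse;

class CookieHeaderSketch {
    // Write side: mirrors the WarcRecorder change. The recorder only paints
    // the marker header when at least one cookie has been observed.
    static void markCookies(WarcResponse.Builder builder, boolean hasCookies) {
        if (hasCookies) {
            builder.addHeader("X-Has-Cookies", "1");
        }
    }

    // Read side: mirrors the parquet writer change; a missing header is
    // treated the same as an explicit "0".
    static boolean hasCookies(HttpHeaders headers) {
        return headers.firstValue("X-Has-Cookies").orElse("0").equals("1");
    }
}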
@@ -1,11 +1,9 @@
 package nu.marginalia.crawl.retreival;

-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -15,6 +13,8 @@ import org.netpreserve.jwarc.WarcResponse;

 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -27,11 +27,10 @@ import static org.junit.jupiter.api.Assertions.fail;
 class CrawlerWarcResynchronizerTest {
     Path fileName;
     Path outputFile;
-    OkHttpClient httpClient;
+    HttpClient httpClient;
     @BeforeEach
     public void setUp() throws Exception {
-        httpClient = new OkHttpClient.Builder()
-                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
+        httpClient = HttpClient.newBuilder()
                 .build();

         fileName = Files.createTempFile("test", ".warc.gz");
@@ -46,7 +45,7 @@ class CrawlerWarcResynchronizerTest {

     @Test
     void run() throws IOException, URISyntaxException {
-        try (var oldRecorder = new WarcRecorder(fileName)) {
+        try (var oldRecorder = new WarcRecorder(fileName, new Cookies())) {
             fetchUrl(oldRecorder, "https://www.marginalia.nu/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/log/");
             fetchUrl(oldRecorder, "https://www.marginalia.nu/feed/");
@@ -56,7 +55,7 @@ class CrawlerWarcResynchronizerTest {

         var crawlFrontier = new DomainCrawlFrontier(new EdgeDomain("www.marginalia.nu"), List.of(), 100);

-        try (var newRecorder = new WarcRecorder(outputFile)) {
+        try (var newRecorder = new WarcRecorder(outputFile, new Cookies())) {
             new CrawlerWarcResynchronizer(crawlFrontier, newRecorder).run(fileName);
         }

@@ -79,10 +78,11 @@ class CrawlerWarcResynchronizerTest {
     }

     void fetchUrl(WarcRecorder recorder, String url) throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        var req = new Request.Builder().url(url)
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build();
+        var req = HttpRequest.newBuilder()
+                .uri(new java.net.URI(url))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build();
         recorder.fetch(httpClient, req);
     }
 }
@@ -2,6 +2,7 @@ package nu.marginalia.crawl.retreival.fetcher;

 import com.sun.net.httpserver.HttpServer;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -79,7 +80,7 @@ class ContentTypeProberTest {
         htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get();

         fetcher = new HttpFetcherImpl("test");
-        recorder = new WarcRecorder(warcFile);
+        recorder = new WarcRecorder(warcFile, new Cookies());
     }

     @AfterEach
@@ -2,13 +2,11 @@ package nu.marginalia.crawl.retreival.fetcher;

 import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
-import okhttp3.OkHttpClient;
-import okhttp3.Request;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -19,6 +17,8 @@ import org.netpreserve.jwarc.WarcXResponseReference;

 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.net.http.HttpClient;
+import java.net.http.HttpRequest;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
@@ -31,17 +31,16 @@ class WarcRecorderTest {
     Path fileNameWarc;
     Path fileNameParquet;
     WarcRecorder client;
-    OkHttpClient httpClient;
+    HttpClient httpClient;
     @BeforeEach
     public void setUp() throws Exception {
-        httpClient = new OkHttpClient.Builder()
-                .addNetworkInterceptor(new IpInterceptingNetworkInterceptor())
-                .build();
+        httpClient = HttpClient.newBuilder().build();

         fileNameWarc = Files.createTempFile("test", ".warc");
         fileNameParquet = Files.createTempFile("test", ".parquet");

-        client = new WarcRecorder(fileNameWarc);
+        client = new WarcRecorder(fileNameWarc, new Cookies());
     }

     @AfterEach
@@ -52,10 +51,13 @@ class WarcRecorderTest {

     @Test
     void fetch() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
+        client.fetch(httpClient,
+                HttpRequest.newBuilder()
+                        .uri(new java.net.URI("https://www.marginalia.nu/"))
+                        .header("User-agent", "test.marginalia.nu")
+                        .header("Accept-Encoding", "gzip")
+                        .GET().build()
+        );

         Map<String, String> sampleData = new HashMap<>();
         try (var warcReader = new WarcReader(fileNameWarc)) {
@@ -76,7 +78,7 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkipped() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc)) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -100,7 +102,7 @@ class WarcRecorderTest {
     @Test
     public void flagAsSkippedNullBody() throws IOException, URISyntaxException {

-        try (var recorder = new WarcRecorder(fileNameWarc)) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -112,7 +114,7 @@ class WarcRecorderTest {

     @Test
     public void testSaveImport() throws URISyntaxException, IOException {
-        try (var recorder = new WarcRecorder(fileNameWarc)) {
+        try (var recorder = new WarcRecorder(fileNameWarc, new Cookies())) {
             recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
                     "text/html",
                     200,
@@ -136,19 +138,23 @@ class WarcRecorderTest {

     @Test
     public void testConvertToParquet() throws NoSuchAlgorithmException, IOException, URISyntaxException, InterruptedException {
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/log/")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
-        client.fetch(httpClient, new Request.Builder().url("https://www.marginalia.nu/sanic.png")
-                .addHeader("User-agent", "test.marginalia.nu")
-                .addHeader("Accept-Encoding", "gzip")
-                .get().build());
-        client.close();
+        client.fetch(httpClient, HttpRequest.newBuilder()
+                .uri(new java.net.URI("https://www.marginalia.nu/"))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build());
+
+        client.fetch(httpClient, HttpRequest.newBuilder()
+                .uri(new java.net.URI("https://www.marginalia.nu/log/"))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build());
+
+        client.fetch(httpClient, HttpRequest.newBuilder()
+                .uri(new java.net.URI("https://www.marginalia.nu/sanic.png"))
+                .header("User-agent", "test.marginalia.nu")
+                .header("Accept-Encoding", "gzip")
+                .GET().build());

         CrawledDocumentParquetRecordFileWriter.convertWarc(
                 "www.marginalia.nu",
@@ -4,6 +4,7 @@ import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
@@ -37,6 +38,12 @@ class HttpFetcherTest {
         }
     }

+    @Test
+    void testSitemapMarginalia() {
+        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
+        fetcher.fetchSitemapUrls("https://www.marginalia.nu/sitemap.xml", new CrawlDelayTimer(1)).forEach(System.out::println);
+    }
+
     @Test
     void fetchText() throws Exception {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
@@ -3,11 +3,9 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
 import nu.marginalia.crawl.DomainStateDb;
-import nu.marginalia.crawl.fetcher.ContentTags;
-import nu.marginalia.crawl.fetcher.HttpFetcher;
-import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.*;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.model.EdgeDomain;
@@ -17,7 +15,6 @@ import nu.marginalia.model.crawldata.CrawledDocument;
 import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.test.CommonTestData;
-import okhttp3.Headers;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
@@ -27,6 +24,7 @@ import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.net.http.HttpHeaders;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
@@ -122,7 +120,7 @@ public class CrawlerMockFetcherTest {
         public void setAllowAllContentTypes(boolean allowAllContentTypes) {}

         @Override
-        public List<String> getCookies() { return List.of();}
+        public Cookies getCookies() { return new Cookies();}

         @Override
         public void clearCookies() {}
@@ -149,7 +147,7 @@ public class CrawlerMockFetcherTest {
             return new HttpFetchResult.ResultOk(
                     url.asURI(),
                     200,
-                    new Headers.Builder().build(),
+                    HttpHeaders.of(Map.of(), (k,v)->true),
                     "127.0.0.1",
                     bodyBytes,
                     0,
@@ -164,6 +162,11 @@ public class CrawlerMockFetcherTest {
             return new HttpFetchResult.ResultNone();
         }

+        @Override
+        public List<EdgeUrl> fetchSitemapUrls(String rootSitemapUrl, CrawlDelayTimer delayTimer) {
+            return List.of();
+        }
+
         @Override
         public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
             return new SimpleRobotRules();
@@ -174,5 +177,9 @@ public class CrawlerMockFetcherTest {
             return Mockito.mock(SitemapRetriever.class);
         }

+        @Override
+        public void close() {
+
+        }
     }
 }
|
|||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
import nu.marginalia.crawl.CrawlerMain;
|
import nu.marginalia.crawl.CrawlerMain;
|
||||||
import nu.marginalia.crawl.DomainStateDb;
|
import nu.marginalia.crawl.DomainStateDb;
|
||||||
|
import nu.marginalia.crawl.fetcher.Cookies;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||||
@ -180,7 +181,7 @@ class CrawlerRetreiverTest {
|
|||||||
new EdgeDomain("www.marginalia.nu"),
|
new EdgeDomain("www.marginalia.nu"),
|
||||||
List.of(), 100);
|
List.of(), 100);
|
||||||
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
||||||
new WarcRecorder(tempFileWarc2)
|
new WarcRecorder(tempFileWarc2, new Cookies())
|
||||||
);
|
);
|
||||||
|
|
||||||
// truncate the size of the file to simulate a crash
|
// truncate the size of the file to simulate a crash
|
||||||
@ -458,7 +459,7 @@ class CrawlerRetreiverTest {
|
|||||||
List.of(), 100);
|
List.of(), 100);
|
||||||
|
|
||||||
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
var resync = new CrawlerWarcResynchronizer(revisitCrawlFrontier,
|
||||||
new WarcRecorder(tempFileWarc3)
|
new WarcRecorder(tempFileWarc3, new Cookies())
|
||||||
);
|
);
|
||||||
|
|
||||||
// truncate the size of the file to simulate a crash
|
// truncate the size of the file to simulate a crash
|
||||||
@ -509,7 +510,7 @@ class CrawlerRetreiverTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
|
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
|
||||||
try (var recorder = new WarcRecorder(tempFileWarc2);
|
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
|
||||||
var db = new DomainStateDb(tempFileDb)
|
var db = new DomainStateDb(tempFileDb)
|
||||||
) {
|
) {
|
||||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
|
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
|
||||||
@ -522,7 +523,7 @@ class CrawlerRetreiverTest {
|
|||||||
|
|
||||||
@NotNull
|
@NotNull
|
||||||
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
|
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
|
||||||
try (var recorder = new WarcRecorder(tempFileWarc1);
|
try (var recorder = new WarcRecorder(tempFileWarc1, new Cookies());
|
||||||
var db = new DomainStateDb(tempFileDb)
|
var db = new DomainStateDb(tempFileDb)
|
||||||
) {
|
) {
|
||||||
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
|
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
|
||||||
|
@@ -56,7 +56,6 @@ dependencies {
     implementation libs.zstd
     implementation libs.jwarc
    implementation libs.crawlercommons
-    implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.opencsv
     implementation libs.fastutil
@@ -10,6 +10,7 @@ import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
 import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.Cookies;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
@@ -120,7 +121,7 @@ public class IntegrationTest {
     public void run() throws Exception {

         /** CREATE WARC */
-        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
             warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                     new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));