Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)
(crawler) Refactor
* Restructure the code to make a bit more sense
* Store full headers in crawl data
* Fix a bug in Retry-After handling that assumed the timeout was in milliseconds and then clamped it to a lower bound of 500 ms, meaning the header was almost always handled wrong
parent 9c292a4f62
commit e9854f194c
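The Retry-After fix is the behavioral change in this commit: the header value is now parsed as whole seconds (as the HTTP spec defines it) into a Duration, and the wait is clamped to a one-to-five-second window, where the old code read the value as milliseconds and clamped it to a 500 ms floor. A minimal sketch of the corrected handling, condensed from the RateLimitException and CrawlDelayTimer changes in the diff below (the class and method names here are illustrative, not the exact production signatures):

    import java.time.Duration;

    class RetryAfterSketch {
        // Parse a Retry-After value given in seconds; fall back to one second when the
        // value is not a plain integer (HTTP-date forms are not handled, matching the patch).
        static Duration parseRetryAfter(String headerValue) {
            try {
                return Duration.ofSeconds(Integer.parseInt(headerValue));
            }
            catch (NumberFormatException ex) {
                return Duration.ofSeconds(1);
            }
        }

        // Sleep out the requested delay, clamped to between 1 and 5 seconds,
        // as the new CrawlDelayTimer does.
        static void waitRetryDelay(String headerValue) throws InterruptedException {
            Duration delay = parseRetryAfter(headerValue);

            if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
                delay = Duration.ofSeconds(1);
            }
            else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
                delay = Duration.ofSeconds(5);
            }

            Thread.sleep(delay.toMillis());
        }
    }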
@@ -7,11 +7,11 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -266,7 +266,7 @@ public class CrawlingThenConvertingIntegrationTest {
         List<SerializableCrawlData> data = new ArrayList<>();

         try (var recorder = new WarcRecorder(fileName)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
+            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
         }

         CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
@@ -10,12 +10,12 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.source.AnchorTagsSource;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.logic.DomainLocks;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
-import nu.marginalia.crawl.retreival.DomainLocks;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.spec.CrawlSpecProvider;
 import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
 import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
@@ -294,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
             Files.delete(tempFile);
         }

-        int size = retriever.fetch(domainLinks, reference);
+        int size = retriever.crawlDomain(domainLinks, reference);

         // Delete the reference crawl data if it's not the same as the new one
         // (mostly a case when migrating from legacy->warc)
@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher;
+package nu.marginalia.crawl.fetcher;

 import okhttp3.Request;

@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival;
+package nu.marginalia.crawl.fetcher;

 import okhttp3.Cookie;
 import okhttp3.CookieJar;
@@ -1,9 +1,8 @@
-package nu.marginalia.crawl.retreival.fetcher;
+package nu.marginalia.crawl.fetcher;

 import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
-import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -17,12 +16,12 @@ public interface HttpFetcher {
     List<String> getCookies();
     void clearCookies();

-    FetchResult probeDomain(EdgeUrl url);
+    HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url);

     HttpFetchResult fetchContent(EdgeUrl url,
                                  WarcRecorder recorder,
                                  ContentTags tags,
-                                 ProbeType probeType) throws RateLimitException;
+                                 ProbeType probeType) throws HttpFetcherImpl.RateLimitException;

     SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);

@@ -1,22 +1,23 @@
-package nu.marginalia.crawl.retreival.fetcher;
+package nu.marginalia.crawl.fetcher;

 import com.google.inject.Inject;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.retreival.Cookies;
-import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
-import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
-import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.logic.ContentTypeProber;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult;
+import nu.marginalia.crawl.logic.SoftIfModifiedSinceProber;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
 import okhttp3.OkHttpClient;
@@ -25,6 +26,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import javax.net.ssl.X509TrustManager;
+import java.time.Duration;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
@@ -114,7 +116,7 @@ public class HttpFetcherImpl implements HttpFetcher {
      */
     @Override
     @SneakyThrows
-    public FetchResult probeDomain(EdgeUrl url) {
+    public ProbeResult probeDomain(EdgeUrl url) {
         var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
                 .url(url.toString())
                 .build();
@@ -125,9 +127,9 @@ public class HttpFetcherImpl implements HttpFetcher {
             EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());

             if (!Objects.equals(requestUrl.domain, url.domain)) {
-                return new FetchResult(FetchResultState.REDIRECT, requestUrl);
+                return new ProbeResultRedirect(url.domain);
             }
-            return new FetchResult(FetchResultState.OK, requestUrl);
+            return new ProbeResultOk(requestUrl);
         }

         catch (Exception ex) {
@@ -136,7 +138,7 @@ public class HttpFetcherImpl implements HttpFetcher {
             }

             logger.info("Error during fetching {}", ex.getMessage());
-            return new FetchResult(FetchResultState.ERROR, url);
+            return new ProbeResultError(CrawlerDomainStatus.ERROR, ex.getMessage());
         }
     }

@@ -196,8 +198,7 @@ public class HttpFetcherImpl implements HttpFetcher {

         if (result instanceof HttpFetchResult.ResultOk ok) {
             if (ok.statusCode() == 429) {
-                String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000");
-                throw new RateLimitException(retryAfter);
+                throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
             }
             if (ok.statusCode() == 304) {
                 return new HttpFetchResult.Result304Raw();
@@ -249,5 +250,44 @@ public class HttpFetcherImpl implements HttpFetcher {
     }


+    public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
+
+    /** The probing failed for one reason or another
+     * @param status Machine readable status
+     * @param desc   Human-readable description of the error
+     */
+    public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
+
+    /** This domain redirects to another domain */
+    public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
+
+    /** If the retrieval of the probed url was successful, return the url as it was fetched
+     * (which may be different from the url we probed, if we attempted another URL schema).
+     *
+     * @param probedUrl The url we successfully probed
+     */
+    public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
+
+
+    /** Exception thrown when the server signals the rate limit is exceeded */
+    public static class RateLimitException extends Exception {
+        private final String retryAfter;
+
+        public RateLimitException(String retryAfterHeader) {
+            this.retryAfter = retryAfterHeader;
+        }
+
+        @Override
+        public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
+
+        public Duration retryAfter() {
+            try {
+                return Duration.ofSeconds(Integer.parseInt(retryAfter));
+            }
+            catch (NumberFormatException ex) {
+                return Duration.ofSeconds(1);
+            }
+        }
+    }
 }

@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher;
+package nu.marginalia.crawl.fetcher;

 import crawlercommons.sitemaps.*;
 import nu.marginalia.model.EdgeUrl;
@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher.socket;
+package nu.marginalia.crawl.fetcher.socket;

 import javax.net.SocketFactory;
 import java.io.IOException;
@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher.socket;
+package nu.marginalia.crawl.fetcher.socket;

 import okhttp3.Interceptor;
 import okhttp3.Response;
@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher.socket;
+package nu.marginalia.crawl.fetcher.socket;

 import lombok.SneakyThrows;

@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher.warc;
+package nu.marginalia.crawl.fetcher.warc;

 import org.netpreserve.jwarc.WarcDigest;

@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher.warc;
+package nu.marginalia.crawl.fetcher.warc;

 import okhttp3.Headers;
 import okhttp3.Response;
@@ -1,7 +1,6 @@
-package nu.marginalia.crawl.retreival.fetcher.warc;
+package nu.marginalia.crawl.fetcher.warc;

 import okhttp3.Protocol;
-import okhttp3.Request;
 import okhttp3.Response;
 import org.apache.commons.lang3.StringUtils;

@@ -73,7 +72,7 @@ public class WarcProtocolReconstructor {

         String headerString = getHeadersAsString(headersAsString);

-        return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
+        return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }

     static String getResponseHeader(Response response, long size) {
@@ -84,7 +83,7 @@ public class WarcProtocolReconstructor {

         String headerString = getHeadersAsString(response, size);

-        return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
+        return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
     }

     private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
@@ -1,13 +1,14 @@
-package nu.marginalia.crawl.retreival.fetcher.warc;
+package nu.marginalia.crawl.fetcher.warc;

-import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
+import org.jetbrains.annotations.Nullable;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -183,7 +184,7 @@ public class WarcRecorder implements AutoCloseable {
             writer.write(item);
         }

-    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) {
+    private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) {
         try {
             WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
             WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@@ -192,24 +193,42 @@ public class WarcRecorder implements AutoCloseable {

             if (documentBody == null) {
                 bytes = new byte[0];
-            }
-            else {
+            } else {
                 bytes = documentBody.getBytes();
             }

-            StringJoiner fakeHeadersBuilder = new StringJoiner("\n");
+            // Create a synthesis of custom headers and the original headers
+            // to create a new set of headers that will be written to the WARC file.

-            fakeHeadersBuilder.add(STR."Content-Type: \{contentType}");
-            fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}");
+            StringJoiner syntheticHeadersBuilder = new StringJoiner("\n");
+
+            syntheticHeadersBuilder.add("Content-Type: " + contentType);
+            syntheticHeadersBuilder.add("Content-Length: " + bytes.length);
             if (contentTags.etag() != null) {
-                fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}");
+                syntheticHeadersBuilder.add("ETag: " + contentTags.etag());
             }
             if (contentTags.lastMod() != null) {
-                fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
+                syntheticHeadersBuilder.add("Last-Modified: " + contentTags.lastMod());
+            }
+
+            // Grab the headers from the original response and add them to the fake headers if they are not
+            // Content-Type, Content-Length, ETag, or Last-Modified
+            for (String headerLine : Objects.requireNonNullElse(headers, "").split("\n")) {
+                if (headerLine.isBlank()) continue;
+
+                var lowerCase = headerLine.toLowerCase();
+
+                if (lowerCase.startsWith("content-type:")) continue;
+                if (lowerCase.startsWith("content-length:")) continue;
+
+                if (contentTags.etag() != null && lowerCase.startsWith("etag:")) continue;
+                if (contentTags.lastMod() != null && lowerCase.startsWith("last-modified:")) continue;
+
+                syntheticHeadersBuilder.add(headerLine);
             }

             byte[] header = WarcProtocolReconstructor
-                    .getResponseHeader(fakeHeadersBuilder.toString(), statusCode)
+                    .getResponseHeader(syntheticHeadersBuilder.toString(), statusCode)
                     .getBytes(StandardCharsets.UTF_8);
             ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
             responseDataBuffer.put(header);
@@ -244,25 +263,25 @@ public class WarcRecorder implements AutoCloseable {
     * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
     * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
     */
-    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) {
-        saveOldResponse(url, contentType, statusCode, documentBody, ctags);
+    public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) {
+        saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
     }

-    public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {
+    public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.ProbeResult result) throws IOException {

         Map<String, List<String>> fields = new HashMap<>();
         fields.put("ip", List.of(ip));
-        fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}"));
+        fields.put("software", List.of("search.marginalia.nu/" + warcRecorderVersion));
         fields.put("domain", List.of(domain.toString()));

         switch (result) {
-            case DomainProber.ProbeResultRedirect redirectDomain:
-                fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}"));
+            case HttpFetcherImpl.ProbeResultRedirect redirectDomain:
+                fields.put("X-WARC-Probe-Status", List.of("REDIRECT;" + redirectDomain.domain()));
                 break;
-            case DomainProber.ProbeResultError error:
-                fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}"));
+            case HttpFetcherImpl.ProbeResultError error:
+                fields.put("X-WARC-Probe-Status", List.of(error.status().toString() + ";" + error.desc()));
                 break;
-            case DomainProber.ProbeResultOk ok:
+            case HttpFetcherImpl.ProbeResultOk ok:
                 fields.put("X-WARC-Probe-Status", List.of("OK"));
                 break;
         }
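To make the "store full headers" part of the change concrete: when WarcRecorder writes a reference copy of a revalidated document, it now synthesizes a header block from Content-Type, Content-Length, the ETag/Last-Modified content tags, and the headers stored with the previous crawl, skipping any stored Content-Type and Content-Length lines (and ETag/Last-Modified when the content tags already supply them). With hypothetical values, a recorded block would look roughly like:

    Content-Type: text/html
    Content-Length: 5123
    ETag: "68d1a42b"
    Last-Modified: Tue, 09 Jan 2024 10:00:00 GMT
    Server: nginx

where the Server line stands in for whatever stored headers survive the filter.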
@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival.fetcher;
+package nu.marginalia.crawl.logic;

 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
@@ -7,7 +7,7 @@ import okhttp3.Request;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.net.SocketTimeoutException;
+import java.io.InterruptedIOException;
 import java.util.Objects;

 public class ContentTypeProber {
@@ -68,7 +68,7 @@ public class ContentTypeProber {

             return new ContentTypeProbeResult.Ok(ret);

-        } catch (SocketTimeoutException ex) {
+        } catch (InterruptedIOException ex) {
             return new ContentTypeProbeResult.Timeout(ex);
         } catch (Exception ex) {
             logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival;
+package nu.marginalia.crawl.logic;

 import nu.marginalia.model.EdgeDomain;

@@ -1,4 +1,4 @@
-package nu.marginalia.crawl.retreival;
+package nu.marginalia.crawl.logic;

 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Document;
@@ -26,6 +26,20 @@ public class LinkFilterSelector {
         if (isDiscourse(head)) {
             return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
         }
+        if (isMediawiki(head)) {
+            return url -> {
+                if (url.path.endsWith(".php")) {
+                    return false;
+                }
+                if (url.path.contains("Special:")) {
+                    return false;
+                }
+                if (url.path.contains("Talk:")) {
+                    return false;
+                }
+                return true;
+            };
+        }

         return LinkFilterSelector::defaultFilter;
     }
@@ -1,6 +1,7 @@
-package nu.marginalia.crawl.retreival.fetcher;
+package nu.marginalia.crawl.logic;

 import com.google.common.base.Strings;
+import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
@@ -1,6 +1,9 @@
 package nu.marginalia.crawl.retreival;

 import lombok.SneakyThrows;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+
+import java.time.Duration;

 import static java.lang.Math.max;
 import static java.lang.Math.min;
@@ -22,12 +25,21 @@ public class CrawlDelayTimer {

     /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
      * set a flag that slows down the main crawl delay as well. */
-    public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
+    public void waitRetryDelay(HttpFetcherImpl.RateLimitException ex) throws InterruptedException {
         slowDown = true;

-        int delay = ex.retryAfter();
+        Duration delay = ex.retryAfter();

-        Thread.sleep(Math.clamp(delay, 100, 5000));
+        if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
+            // If the server wants us to retry in less than a second, we'll just wait a bit
+            delay = Duration.ofSeconds(1);
+        }
+        else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
+            // If the server wants us to retry in more than a minute, we'll wait a bit
+            delay = Duration.ofSeconds(5);
+        }
+
+        Thread.sleep(delay.toMillis());
     }

     @SneakyThrows
@@ -1,91 +0,0 @@
-package nu.marginalia.crawl.retreival;
-
-import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.model.body.HttpFetchResult;
-import nu.marginalia.model.crawldata.CrawledDocument;
-import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
-
-import java.time.LocalDateTime;
-import java.util.Objects;
-
-public class CrawledDocumentFactory {
-
-    public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
-        return CrawledDocument.builder()
-                .crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
-                .crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
-                .timestamp(LocalDateTime.now().toString())
-                .url(url.toString())
-                .build();
-    }
-
-    public static CrawledDocument createUnknownHostError(EdgeUrl url) {
-        return CrawledDocument.builder()
-                .crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
-                .crawlerStatusDesc("Unknown Host")
-                .timestamp(LocalDateTime.now().toString())
-                .url(url.toString())
-                .build();
-    }
-
-    public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
-        return CrawledDocument.builder()
-                .crawlerStatus("Timeout")
-                .timestamp(LocalDateTime.now().toString())
-                .url(url.toString())
-                .build();
-    }
-
-    public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
-        return CrawledDocument.builder()
-                .crawlerStatus(status.toString())
-                .crawlerStatusDesc(why)
-                .headers(rsp.headers().toString())
-                .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
-                .timestamp(LocalDateTime.now().toString())
-                .httpStatus(rsp.statusCode())
-                .url(url.toString())
-                .build();
-    }
-    public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
-        return CrawledDocument.builder()
-                .crawlerStatus(status.toString())
-                .crawlerStatusDesc(why)
-                .headers("")
-                .contentType(contentType)
-                .timestamp(LocalDateTime.now().toString())
-                .httpStatus(statusCode)
-                .url(url.toString())
-                .build();
-    }
-
-    public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
-
-        return CrawledDocument.builder()
-                .crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
-                .redirectUrl(responseUrl.toString())
-                .headers(rsp.headers().toString())
-                .contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
-                .timestamp(LocalDateTime.now().toString())
-                .httpStatus(rsp.statusCode())
-                .url(url.toString())
-                .build();
-    }
-
-    public static CrawledDocument createRobotsError(EdgeUrl url) {
-        return CrawledDocument.builder()
-                .url(url.toString())
-                .timestamp(LocalDateTime.now().toString())
-                .httpStatus(-1)
-                .crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
-                .build();
-    }
-    public static CrawledDocument createRetryError(EdgeUrl url) {
-        return CrawledDocument.builder()
-                .url(url.toString())
-                .timestamp(LocalDateTime.now().toString())
-                .httpStatus(429)
-                .crawlerStatus(CrawlerDocumentStatus.ERROR.name())
-                .build();
-    }
-}
@@ -3,9 +3,11 @@ package nu.marginalia.crawl.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.logic.LinkFilterSelector;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
@@ -14,8 +16,6 @@ import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
-import nu.marginalia.model.crawldata.CrawledDomain;
-import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
@@ -25,7 +25,6 @@ import java.io.IOException;
 import java.net.InetAddress;
 import java.net.UnknownHostException;
 import java.nio.file.Path;
-import java.util.ArrayList;
 import java.util.List;
 import java.util.Objects;
 import java.util.Optional;
@@ -84,11 +83,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         return crawlFrontier;
     }

-    public int fetch() {
-        return fetch(new DomainLinks(), new CrawlDataReference());
+    public int crawlDomain() {
+        return crawlDomain(new DomainLinks(), new CrawlDataReference());
     }

-    public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
+    public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
         try {
             return crawlDomain(oldCrawlData, domainLinks);
         }
@@ -98,28 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }

-    public void syncAbortedRun(Path warcFile) {
-        var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
-
-        resync.run(warcFile);
-    }
-
-    private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
-        // Construct an URL to the root of the domain, we don't know the schema yet so we'll
-        // start with http and then try https if that fails
-        var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
-        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
-
-        warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
-
-        return probeResult;
-    }
-
     private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
         String ip = findIp(domain);
         EdgeUrl rootUrl;

-        if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
+        if (probeRootUrl(ip) instanceof HttpFetcherImpl.ProbeResultOk ok) rootUrl = ok.probedUrl();
         else return 1;

         // Sleep after the initial probe, we don't have access to the robots.txt yet
@@ -130,12 +112,13 @@ public class CrawlerRetreiver implements AutoCloseable {
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());

         delayTimer.waitFetchDelay(0); // initial delay after robots.txt

         sniffRootDocument(rootUrl, delayTimer);

         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
-        int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
+        int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);

-        if (recrawled > 0) {
+        if (fetchedCount > 0) {
             // If we have reference data, we will always grow the crawl depth a bit
             crawlFrontier.increaseDepth(1.5, 2500);
         }
@@ -146,15 +129,6 @@ public class CrawlerRetreiver implements AutoCloseable {
         // Add links from the sitemap to the crawl frontier
         sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);

-        CrawledDomain ret = new CrawledDomain(domain,
-                null,
-                CrawlerDomainStatus.OK.name(),
-                null,
-                ip,
-                new ArrayList<>(),
-                null);
-
-        int fetchedCount = recrawled;
-
         while (!crawlFrontier.isEmpty()
             && !crawlFrontier.isCrawlDepthReached()
@@ -186,7 +160,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             if (!crawlFrontier.addVisited(top))
                 continue;

-
             try {
                 if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
                     fetchedCount++;
@@ -198,15 +171,28 @@ public class CrawlerRetreiver implements AutoCloseable {
             }
         }

-        ret.cookies = fetcher.getCookies();
-
         return fetchedCount;
     }

+    public void syncAbortedRun(Path warcFile) {
+        var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
+
+        resync.run(warcFile);
+    }
+
+    private HttpFetcherImpl.ProbeResult probeRootUrl(String ip) throws IOException {
+        // Construct an URL to the root of the domain, we don't know the schema yet so we'll
+        // start with http and then try https if that fails
+        var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
+        final HttpFetcherImpl.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
+
+        warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
+
+        return probeResult;
+    }
+
     private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
         try {
-            logger.debug("Configuring link filter");

             var url = rootUrl.withPathAndParam("/", null);

             HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
@@ -291,7 +277,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
             var doc = reference.doc();

-            warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags);
+            warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags);

             fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
                     new ContentType(doc.contentType, "UTF-8"),
@@ -326,7 +312,7 @@ public class CrawlerRetreiver implements AutoCloseable {
         try {
             return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
         }
-        catch (RateLimitException ex) {
+        catch (HttpFetcherImpl.RateLimitException ex) {
             timer.waitRetryDelay(ex);
         }
         catch (Exception ex) {
@@ -1,6 +1,6 @@
 package nu.marginalia.crawl.retreival;

-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -38,7 +38,7 @@ public class CrawlerWarcResynchronizer {
                 accept(item);
             }
         } catch (Exception e) {
-            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
+            logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
         }

         // Second pass, copy records to the new warc file
@@ -47,7 +47,7 @@ public class CrawlerWarcResynchronizer {
                 recorder.resync(item);
             }
         } catch (Exception e) {
-            logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
+            logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
         }
     }

@@ -63,7 +63,7 @@ public class CrawlerWarcResynchronizer {

         }
         catch (Exception ex) {
-            logger.info(STR."Failed to process warc record \{item}", ex);
+            logger.info("Failed to process warc record " + item, ex);
         }
     }

@@ -78,7 +78,8 @@ public class CrawlerWarcResynchronizer {
     }

     private void request(WarcRequest request) {
-        EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
+        var url = new EdgeUrl(request.targetURI());
+        crawlFrontier.addVisited(url);
     }

     private void response(WarcResponse rsp) {
@@ -97,7 +98,7 @@ public class CrawlerWarcResynchronizer {
             });
         }
         catch (Exception e) {
-            logger.info(STR."Failed to parse response body for \{url}", e);
+            logger.info("Failed to parse response body for " + url, e);
         }
     }

@@ -9,9 +9,14 @@ import nu.marginalia.model.EdgeUrl;
 import org.jsoup.nodes.Document;

 import java.net.URISyntaxException;
-import java.util.*;
+import java.util.ArrayDeque;
+import java.util.Collection;
+import java.util.Objects;
 import java.util.function.Predicate;

+/** Encapsulates the crawl frontier for a single domain,
+ * that is information about known and visited URLs
+ */
 public class DomainCrawlFrontier {

     private static final LinkParser linkParser = new LinkParser();
@@ -2,8 +2,8 @@ package nu.marginalia.crawl.retreival;

 import com.google.inject.Inject;
 import com.google.inject.Singleton;
-import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.ip_blocklist.IpBlockList;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
@@ -34,43 +34,18 @@ public class DomainProber {
     * doesn't immediately redirect to another domain (which should be crawled separately, not under the name
     * of this domain).
     */
-    public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
+    public HttpFetcherImpl.ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {

         if (firstUrlInQueue == null) {
             logger.warn("No valid URLs for domain {}", domain);

-            return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
+            return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
         }

         if (!domainBlacklist.test(firstUrlInQueue.domain))
-            return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
+            return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");

-        var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
-
-        if (fetchResult.ok())
-            return new ProbeResultOk(fetchResult.url);
-
-        if (fetchResult.state == FetchResultState.REDIRECT)
-            return new ProbeResultRedirect(fetchResult.domain);
-
-        return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
+        return fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
     }
-
-    public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
-
-    /** The probing failed for one reason or another
-     * @param status Machine readable status
-     * @param desc   Human-readable description of the error
-     */
-    public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
-
-    /** This domain redirects to another domain */
-    public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
-
-    /** If the retrieval of the probed url was successful, return the url as it was fetched
-     * (which may be different from the url we probed, if we attempted another URL schema).
-     *
-     * @param probedUrl The url we successfully probed
-     */
-    public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
 }
@@ -1,21 +0,0 @@
-package nu.marginalia.crawl.retreival;
-
-public class RateLimitException extends Exception {
-    private final String retryAfter;
-
-    public RateLimitException(String retryAfter) {
-        this.retryAfter = retryAfter;
-    }
-
-    @Override
-    public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
-
-    public int retryAfter() {
-        try {
-            return Integer.parseInt(retryAfter);
-        }
-        catch (NumberFormatException ex) {
-            return 1000;
-        }
-    }
-}
@@ -1,24 +0,0 @@
-package nu.marginalia.crawl.retreival.fetcher;
-
-import lombok.AllArgsConstructor;
-import lombok.ToString;
-import nu.marginalia.model.EdgeDomain;
-import nu.marginalia.model.EdgeUrl;
-
-@AllArgsConstructor
-@ToString
-public class FetchResult {
-    public final FetchResultState state;
-    public final EdgeUrl url;
-    public final EdgeDomain domain;
-
-    public FetchResult(FetchResultState state, EdgeUrl url) {
-        this.state = state;
-        this.url = url;
-        this.domain = url.domain;
-    }
-
-    public boolean ok() {
-        return state == FetchResultState.OK;
-    }
-}
@@ -1,7 +0,0 @@
-package nu.marginalia.crawl.retreival.fetcher;
-
-public enum FetchResultState {
-    OK,
-    REDIRECT,
-    ERROR
-}
@@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit;

 import com.google.common.base.Strings;
 import crawlercommons.robots.SimpleRobotRules;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
 import nu.marginalia.model.crawldata.CrawledDocument;
@@ -125,6 +125,7 @@ public class CrawlerRevisitor {
                     doc.contentType,
                     doc.httpStatus,
                     doc.documentBody,
+                    doc.headers,
                     new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
             );

@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.revisit;

+import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.DocumentBodyResult;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -1,8 +1,8 @@
 package nu.marginalia.crawl.retreival.sitemap;

 import crawlercommons.robots.SimpleRobotRules;
+import nu.marginalia.crawl.fetcher.SitemapRetriever;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
-import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
 import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
|
@ -144,7 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
nextRecord.httpStatus,
|
nextRecord.httpStatus,
|
||||||
status.toString(),
|
status.toString(),
|
||||||
"",
|
"",
|
||||||
"",
|
nextRecord.headers,
|
||||||
bodyString,
|
bodyString,
|
||||||
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
|
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
|
||||||
nextRecord.url,
|
nextRecord.url,
|
||||||
|
@@ -23,7 +23,6 @@ public class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatusDesc;

     @Nullable
-    @Deprecated // use getETag() or getLastModified() instead
     public String headers;

     public String documentBody;
@@ -29,7 +29,11 @@ public class CrawledDocumentParquetRecord {
     public String contentType;
     public byte[] body;

+    public String headers;
+
+    @Deprecated // will be replaced with the full headers field in the future
     public String etagHeader;
+    @Deprecated // will be replaced with the full headers field in the future
     public String lastModifiedHeader;

     public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
@@ -51,7 +55,8 @@ public class CrawledDocumentParquetRecord {
                 Types.required(BINARY).as(stringType()).named("contentType"),
                 Types.required(BINARY).named("body"),
                 Types.optional(BINARY).as(stringType()).named("etagHeader"),
-                Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
+                Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"),
+                Types.optional(BINARY).as(stringType()).named("headers")
         );


@@ -67,6 +72,7 @@ public class CrawledDocumentParquetRecord {
             case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
             case "etagHeader" -> etagHeader = (String) value;
             case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
+            case "headers" -> headers = (String) value;

             default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
         }
@@ -82,6 +88,9 @@ public class CrawledDocumentParquetRecord {
         valueWriter.write("cookies", cookies);
         valueWriter.write("contentType", contentType);
         valueWriter.write("body", body);
+        if (headers != null) {
+            valueWriter.write("headers", headers);
+        }
         if (etagHeader != null) {
             valueWriter.write("etagHeader", etagHeader);
         }
@@ -16,6 +16,7 @@ import java.nio.file.Path;
 import java.time.Instant;
 import java.util.List;
 import java.util.Objects;
+import java.util.StringJoiner;
 
 public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
 private final ParquetWriter<CrawledDocumentParquetRecord> writer;
@@ -150,6 +151,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
 contentType = "";
 }
 
+String headersStr = null;
+StringJoiner headersStrBuilder = new StringJoiner("\n");
+for (var header : headers) {
+headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
+}
+headersStr = headersStrBuilder.toString();
+
+
 write(new CrawledDocumentParquetRecord(
 domain,
 response.target(),
@@ -159,6 +168,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
 response.date(),
 contentType,
 bodyBytes,
+headersStr,
 headers.get("ETag"),
 headers.get("Last-Modified"))
 );
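
The hunk above flattens each header pair into a "Name: value" line and joins the lines with "\n" before the record is written. As a minimal sketch of the inverse lookup a consumer might use (the helper below is hypothetical and not part of this commit; it only assumes the line format built above):

    // Sketch only: read one header back out of the flattened block stored in the
    // new "headers" column; the deprecated etagHeader/lastModifiedHeader columns
    // could eventually be derived the same way.
    static String headerValue(String headersStr, String name) {
        if (headersStr == null)
            return null;
        for (String line : headersStr.split("\n")) {
            int colon = line.indexOf(':');
            if (colon > 0 && line.substring(0, colon).trim().equalsIgnoreCase(name)) {
                return line.substring(colon + 1).trim();
            }
        }
        return null;
    }
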
@@ -179,6 +189,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
 "x-marginalia/advisory;state=redirect",
 new byte[0],
 null,
+null,
 null
 );
 }
@@ -192,6 +203,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
 "x-marginalia/advisory;state=error",
 errorStatus.getBytes(),
 null,
+null,
 null
 );
 }
@@ -206,6 +218,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
 errorStatus,
 new byte[0],
 null,
+null,
 null
 );
 }
@@ -0,0 +1,124 @@
+package nu.marginalia.crawl.logic;
+
+import com.sun.net.httpserver.HttpServer;
+import nu.marginalia.model.EdgeUrl;
+import okhttp3.OkHttpClient;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.net.InetSocketAddress;
+import java.util.Random;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+
+class ContentTypeProberTest {
+
+    private static int port;
+    private static HttpServer server;
+    private static OkHttpClient client;
+
+    static EdgeUrl htmlEndpoint;
+    static EdgeUrl htmlRedirEndpoint;
+    static EdgeUrl binaryEndpoint;
+    static EdgeUrl timeoutEndpoint;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        Random r = new Random();
+        port = r.nextInt(10000) + 8000;
+        server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
+
+        server.createContext("/html", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "text/html");
+            exchange.sendResponseHeaders(200, -1);
+            exchange.close();
+        });
+        server.createContext("/redir", exchange -> {
+            exchange.getResponseHeaders().add("Location", "/html");
+            exchange.sendResponseHeaders(301, -1);
+            exchange.close();
+        });
+
+        server.createContext("/bin", exchange -> {
+            exchange.getResponseHeaders().add("Content-Type", "application/binary");
+            exchange.sendResponseHeaders(200, -1);
+            exchange.close();
+        });
+
+        server.createContext("/timeout", exchange -> {
+            try {
+                Thread.sleep(2000);
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+
+            exchange.getResponseHeaders().add("Content-Type", "application/binary");
+            exchange.sendResponseHeaders(200, -1);
+            exchange.close();
+        });
+
+        server.start();
+
+        htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
+        binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
+        timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
+        htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
+
+        client = new OkHttpClient.Builder()
+                .readTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .connectTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .callTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .writeTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
+                .build();
+    }
+
+    @AfterEach
+    void tearDown() {
+        server.stop(0);
+        client.dispatcher().executorService().shutdown();
+        client.connectionPool().evictAll();
+    }
+
+    @Test
+    void probeContentTypeOk() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(htmlEndpoint);
+
+        System.out.println(result);
+
+        assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
+    }
+
+    @Test
+    void probeContentTypeRedir() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(htmlRedirEndpoint);
+
+        System.out.println(result);
+
+        assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
+    }
+
+    @Test
+    void probeContentTypeBad() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(binaryEndpoint);
+
+        System.out.println(result);
+
+        assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.BadContentType.class, result);
+    }
+
+    @Test
+    void probeContentTypeTimeout() {
+        ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
+                .probeContentType(timeoutEndpoint);
+
+        System.out.println(result);
+
+        assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.Timeout.class, result);
+    }
+}
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.OkHttpClient;
@@ -21,7 +21,8 @@ import java.security.NoSuchAlgorithmException;
 import java.util.List;
 import java.util.zip.GZIPInputStream;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
 
 class CrawlerWarcResynchronizerTest {
 Path fileName;
@@ -1,7 +1,8 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
+import nu.marginalia.crawl.logic.ContentTypeProber;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.BadContentType;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.Ok;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -13,7 +14,7 @@ import java.net.URISyntaxException;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 
 class ContentTypeProberTest {
 
@@ -1,8 +1,9 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
@@ -80,6 +81,7 @@ class WarcRecorderTest {
 "text/html",
 200,
 "<?doctype html><html><body>test</body></html>",
+null,
 ContentTags.empty());
 }
 
@@ -103,7 +105,7 @@ class WarcRecorderTest {
 "text/html",
 200,
 null,
-ContentTags.empty());
+null, ContentTags.empty());
 }
 
 }
@@ -115,7 +117,7 @@ class WarcRecorderTest {
 "text/html",
 200,
 "<?doctype html><html><body>test</body></html>",
-ContentTags.empty());
+null, ContentTags.empty());
 }
 
 try (var reader = new WarcReader(fileNameWarc)) {
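
The three hunks above pass null for the argument that now sits in front of ContentTags; together with the writer changes earlier in this commit and the IntegrationTest hunk further down (which passes ""), that slot appears to carry the flattened header string. A hedged sketch of a call with real headers, placed inside the same try-with-resources the tests use (the exact writeReferenceCopy signature is inferred here, not verified):

    // Sketch only: supplying a header blob instead of null; the format mirrors the
    // "Name: value" lines built by CrawledDocumentParquetRecordFileWriter.
    recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"),
            "text/html",
            200,
            "<?doctype html><html><body>test</body></html>",
            "Content-Type: text/html\nETag: \"abc123\"",
            ContentTags.empty());
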
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.revisit;
 
+import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import org.junit.jupiter.api.Test;
 
@@ -1,11 +1,10 @@
 package nu.marginalia.crawling;
 
 import lombok.SneakyThrows;
-import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
@@ -33,7 +32,7 @@ class HttpFetcherTest {
 }
 
 @Test
-void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
+void fetchUTF8() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
 var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 try (var recorder = new WarcRecorder()) {
 var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
@@ -44,7 +43,7 @@ class HttpFetcherTest {
 }
 
 @Test
-void fetchText() throws URISyntaxException, RateLimitException, IOException {
+void fetchText() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
 var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 
 try (var recorder = new WarcRecorder()) {
@@ -2,10 +2,13 @@ package nu.marginalia.crawling.retreival;
 
 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.*;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -68,7 +71,7 @@ public class CrawlerMockFetcherTest {
 void crawl(CrawlSpecRecord spec) throws IOException {
 try (var recorder = new WarcRecorder()) {
 new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
-.fetch();
+.crawlDomain();
 }
 }
 
@@ -115,9 +118,9 @@ public class CrawlerMockFetcherTest {
 public void clearCookies() {}
 
 @Override
-public FetchResult probeDomain(EdgeUrl url) {
+public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
 logger.info("Probing {}", url);
-return new FetchResult(FetchResultState.OK, url);
+return new HttpFetcherImpl.ProbeResultOk(url);
 }
 
 @SneakyThrows
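
The mock above now returns HttpFetcherImpl.ProbeResultOk where it previously built a FetchResult. A minimal sketch of how calling code might branch on the new result type; only ProbeResultOk appears in this diff, so treating every other variant as not crawlable is an assumption:

    // Sketch only: gate a crawl on the probe outcome used above.
    static boolean isCrawlable(HttpFetcher fetcher, EdgeUrl url) {
        return fetcher.probeDomain(url) instanceof HttpFetcherImpl.ProbeResultOk;
    }
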
@@ -4,10 +4,10 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.*;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.io.crawldata.CrawledDomainReader;
 import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
@@ -468,7 +468,7 @@ class CrawlerRetreiverTest {
 
 private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
 try (var recorder = new WarcRecorder(tempFileWarc2)) {
-new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
 new CrawlDataReference(stream));
 }
 catch (IOException ex) {
@@ -480,7 +480,7 @@ class CrawlerRetreiverTest {
 private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
 try (var recorder = new WarcRecorder(tempFileWarc1)) {
 var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
-crawler.fetch();
+crawler.crawlDomain();
 return crawler.getCrawlFrontier();
 } catch (IOException ex) {
 Assertions.fail(ex);
@@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.RpcQueryLimits;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
-import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
 import nu.marginalia.index.IndexGrpcService;
 import nu.marginalia.index.ReverseIndexFullFileNames;
@@ -120,7 +120,7 @@ public class IntegrationTest {
 /** CREATE WARC */
 try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
 warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
-new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
+new HttpFetcherImpl.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
 
 warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
 "text/html", 200,
@@ -134,6 +134,7 @@ public class IntegrationTest {
 </body>
 </html>
 """,
+"",
 ContentTags.empty()
 );
 }
@@ -204,7 +205,7 @@ public class IntegrationTest {
 .setFetchSize(1000)
 .build())
 .setQueryStrategy("AUTO")
-.setHumanQuery("\"This is how thinking works\"")
+.setHumanQuery("\"is that there is\"")
 .build();
 
 var params = QueryProtobufCodec.convertRequest(request);
@@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
 private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
 try {
 Map<String, Object> requestData = Map.of(
-"url", domain.toRootUrl().toString(),
+"url", domain.toRootUrlHttp().toString(),
 "options",
 Map.of("fullPage", false,
 "type", "png"),