(crawler) Refactor

* Restructure the code to make more sense: the fetcher and crawl-logic classes move out of the retreival package into crawl.fetcher and crawl.logic
* Store the full response headers in the crawl data
* Fix a bug in Retry-After handling that assumed the timeout was in milliseconds and then clamped it to a lower bound of 500ms, so the header was almost always handled wrong (a sketch of the corrected behaviour follows below)
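
A minimal sketch of the corrected Retry-After behaviour, mirroring the RateLimitException and CrawlDelayTimer changes in the diff below: the header value is parsed as seconds into a Duration, and the resulting delay is clamped to a one-to-five-second window before sleeping. The class and method names here are illustrative only, not part of the commit.

import java.time.Duration;

// Illustrative sketch (hypothetical names) of the corrected Retry-After handling.
class RetryAfterSketch {
    /** Retry-After carries seconds, not milliseconds. */
    static Duration parseRetryAfter(String headerValue) {
        try {
            return Duration.ofSeconds(Integer.parseInt(headerValue));
        } catch (NumberFormatException ex) {
            return Duration.ofSeconds(1); // fall back to a short delay
        }
    }

    /** Clamp the server-requested delay to a sane window before sleeping. */
    static void waitRetryDelay(String retryAfterHeader) throws InterruptedException {
        Duration delay = parseRetryAfter(retryAfterHeader);
        if (delay.compareTo(Duration.ofSeconds(1)) < 0) delay = Duration.ofSeconds(1);
        if (delay.compareTo(Duration.ofSeconds(5)) > 0) delay = Duration.ofSeconds(5);
        Thread.sleep(delay.toMillis());
    }

    public static void main(String[] args) throws InterruptedException {
        // A 429 response with "Retry-After: 2" now waits two seconds,
        // where the old code interpreted "2" as milliseconds and clamped it upward.
        waitRetryDelay("2");
    }
}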
Viktor Lofgren 2024-09-23 17:51:07 +02:00
parent 9c292a4f62
commit e9854f194c
44 changed files with 398 additions and 337 deletions

View File

@ -7,11 +7,11 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
@ -266,7 +266,7 @@ public class CrawlingThenConvertingIntegrationTest {
List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
}
CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,

View File

@ -10,12 +10,12 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.DomainLocks;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.spec.CrawlSpecProvider;
import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
@ -294,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
Files.delete(tempFile);
}
int size = retriever.fetch(domainLinks, reference);
int size = retriever.crawlDomain(domainLinks, reference);
// Delete the reference crawl data if it's not the same as the new one
// (mostly a case when migrating from legacy->warc)

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import okhttp3.Request;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival;
package nu.marginalia.crawl.fetcher;
import okhttp3.Cookie;
import okhttp3.CookieJar;

View File

@ -1,9 +1,8 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -17,12 +16,12 @@ public interface HttpFetcher {
List<String> getCookies();
void clearCookies();
FetchResult probeDomain(EdgeUrl url);
HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url);
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws RateLimitException;
ProbeType probeType) throws HttpFetcherImpl.RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);

View File

@ -1,22 +1,23 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.Cookies;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.ContentTypeProber;
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.crawl.logic.SoftIfModifiedSinceProber;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
import okhttp3.OkHttpClient;
@ -25,6 +26,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager;
import java.time.Duration;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@ -114,7 +116,7 @@ public class HttpFetcherImpl implements HttpFetcher {
*/
@Override
@SneakyThrows
public FetchResult probeDomain(EdgeUrl url) {
public ProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
.build();
@ -125,9 +127,9 @@ public class HttpFetcherImpl implements HttpFetcher {
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(requestUrl.domain, url.domain)) {
return new FetchResult(FetchResultState.REDIRECT, requestUrl);
return new ProbeResultRedirect(url.domain);
}
return new FetchResult(FetchResultState.OK, requestUrl);
return new ProbeResultOk(requestUrl);
}
catch (Exception ex) {
@ -136,7 +138,7 @@ public class HttpFetcherImpl implements HttpFetcher {
}
logger.info("Error during fetching {}", ex.getMessage());
return new FetchResult(FetchResultState.ERROR, url);
return new ProbeResultError(CrawlerDomainStatus.ERROR, ex.getMessage());
}
}
@ -196,8 +198,7 @@ public class HttpFetcherImpl implements HttpFetcher {
if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 429) {
String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000");
throw new RateLimitException(retryAfter);
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
}
if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw();
@ -249,5 +250,44 @@ public class HttpFetcherImpl implements HttpFetcher {
}
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
/** Exception thrown when the server signals the rate limit is exceeded */
public static class RateLimitException extends Exception {
private final String retryAfter;
public RateLimitException(String retryAfterHeader) {
this.retryAfter = retryAfterHeader;
}
@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
public Duration retryAfter() {
try {
return Duration.ofSeconds(Integer.parseInt(retryAfter));
}
catch (NumberFormatException ex) {
return Duration.ofSeconds(1);
}
}
}
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.fetcher;
import crawlercommons.sitemaps.*;
import nu.marginalia.model.EdgeUrl;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
package nu.marginalia.crawl.fetcher.socket;
import javax.net.SocketFactory;
import java.io.IOException;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
package nu.marginalia.crawl.fetcher.socket;
import okhttp3.Interceptor;
import okhttp3.Response;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket;
package nu.marginalia.crawl.fetcher.socket;
import lombok.SneakyThrows;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import org.netpreserve.jwarc.WarcDigest;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Headers;
import okhttp3.Response;

View File

@ -1,7 +1,6 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Protocol;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.commons.lang3.StringUtils;
@ -73,7 +72,7 @@ public class WarcProtocolReconstructor {
String headerString = getHeadersAsString(headersAsString);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
static String getResponseHeader(Response response, long size) {
@ -84,7 +83,7 @@ public class WarcProtocolReconstructor {
String headerString = getHeadersAsString(response, size);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
}
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(

View File

@ -1,13 +1,14 @@
package nu.marginalia.crawl.retreival.fetcher.warc;
package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -183,7 +184,7 @@ public class WarcRecorder implements AutoCloseable {
writer.write(item);
}
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) {
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) {
try {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@ -192,24 +193,42 @@ public class WarcRecorder implements AutoCloseable {
if (documentBody == null) {
bytes = new byte[0];
}
else {
} else {
bytes = documentBody.getBytes();
}
StringJoiner fakeHeadersBuilder = new StringJoiner("\n");
// Create a synthesis of custom headers and the original headers
// to create a new set of headers that will be written to the WARC file.
fakeHeadersBuilder.add(STR."Content-Type: \{contentType}");
fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}");
StringJoiner syntheticHeadersBuilder = new StringJoiner("\n");
syntheticHeadersBuilder.add("Content-Type: " + contentType);
syntheticHeadersBuilder.add("Content-Length: " + bytes.length);
if (contentTags.etag() != null) {
fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}");
syntheticHeadersBuilder.add("ETag: " + contentTags.etag());
}
if (contentTags.lastMod() != null) {
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
syntheticHeadersBuilder.add("Last-Modified: " + contentTags.lastMod());
}
// Grab the headers from the original response and add them to the fake headers if they are not
// Content-Type, Content-Length, ETag, or Last-Modified
for (String headerLine : Objects.requireNonNullElse(headers, "").split("\n")) {
if (headerLine.isBlank()) continue;
var lowerCase = headerLine.toLowerCase();
if (lowerCase.startsWith("content-type:")) continue;
if (lowerCase.startsWith("content-length:")) continue;
if (contentTags.etag() != null && lowerCase.startsWith("etag:")) continue;
if (contentTags.lastMod() != null && lowerCase.startsWith("last-modified:")) continue;
syntheticHeadersBuilder.add(headerLine);
}
byte[] header = WarcProtocolReconstructor
.getResponseHeader(fakeHeadersBuilder.toString(), statusCode)
.getResponseHeader(syntheticHeadersBuilder.toString(), statusCode)
.getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
responseDataBuffer.put(header);
@ -244,25 +263,25 @@ public class WarcRecorder implements AutoCloseable {
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
*/
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, ctags);
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
}
public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.ProbeResult result) throws IOException {
Map<String, List<String>> fields = new HashMap<>();
fields.put("ip", List.of(ip));
fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}"));
fields.put("software", List.of("search.marginalia.nu/" + warcRecorderVersion));
fields.put("domain", List.of(domain.toString()));
switch (result) {
case DomainProber.ProbeResultRedirect redirectDomain:
fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}"));
case HttpFetcherImpl.ProbeResultRedirect redirectDomain:
fields.put("X-WARC-Probe-Status", List.of("REDIRECT;" + redirectDomain.domain()));
break;
case DomainProber.ProbeResultError error:
fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}"));
case HttpFetcherImpl.ProbeResultError error:
fields.put("X-WARC-Probe-Status", List.of(error.status().toString() + ";" + error.desc()));
break;
case DomainProber.ProbeResultOk ok:
case HttpFetcherImpl.ProbeResultOk ok:
fields.put("X-WARC-Probe-Status", List.of("OK"));
break;
}

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
@ -7,7 +7,7 @@ import okhttp3.Request;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.SocketTimeoutException;
import java.io.InterruptedIOException;
import java.util.Objects;
public class ContentTypeProber {
@ -68,7 +68,7 @@ public class ContentTypeProber {
return new ContentTypeProbeResult.Ok(ret);
} catch (SocketTimeoutException ex) {
} catch (InterruptedIOException ex) {
return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival;
package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeDomain;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival;
package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
@ -26,6 +26,20 @@ public class LinkFilterSelector {
if (isDiscourse(head)) {
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
}
if (isMediawiki(head)) {
return url -> {
if (url.path.endsWith(".php")) {
return false;
}
if (url.path.contains("Special:")) {
return false;
}
if (url.path.contains("Talk:")) {
return false;
}
return true;
};
}
return LinkFilterSelector::defaultFilter;
}

View File

@ -1,6 +1,7 @@
package nu.marginalia.crawl.retreival.fetcher;
package nu.marginalia.crawl.logic;
import com.google.common.base.Strings;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import okhttp3.Request;

View File

@ -1,6 +1,9 @@
package nu.marginalia.crawl.retreival;
import lombok.SneakyThrows;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import java.time.Duration;
import static java.lang.Math.max;
import static java.lang.Math.min;
@ -22,12 +25,21 @@ public class CrawlDelayTimer {
/** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
* set a flag that slows down the main crawl delay as well. */
public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
public void waitRetryDelay(HttpFetcherImpl.RateLimitException ex) throws InterruptedException {
slowDown = true;
int delay = ex.retryAfter();
Duration delay = ex.retryAfter();
Thread.sleep(Math.clamp(delay, 100, 5000));
if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
// If the server wants us to retry in less than a second, we'll just wait a bit
delay = Duration.ofSeconds(1);
}
else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
// If the server wants us to retry in more than five seconds, cap the delay at five seconds
delay = Duration.ofSeconds(5);
}
Thread.sleep(delay.toMillis());
}
@SneakyThrows

View File

@ -1,91 +0,0 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
import java.time.LocalDateTime;
import java.util.Objects;
public class CrawledDocumentFactory {
public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createUnknownHostError(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc("Unknown Host")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus("Timeout")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers("")
.contentType(contentType)
.timestamp(LocalDateTime.now().toString())
.httpStatus(statusCode)
.url(url.toString())
.build();
}
public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
.redirectUrl(responseUrl.toString())
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createRobotsError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(-1)
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.build();
}
public static CrawledDocument createRetryError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(429)
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build();
}
}

View File

@ -3,9 +3,11 @@ package nu.marginalia.crawl.retreival;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
@ -14,8 +16,6 @@ import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.jsoup.Jsoup;
import org.slf4j.Logger;
@ -25,7 +25,6 @@ import java.io.IOException;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
@ -84,11 +83,11 @@ public class CrawlerRetreiver implements AutoCloseable {
return crawlFrontier;
}
public int fetch() {
return fetch(new DomainLinks(), new CrawlDataReference());
public int crawlDomain() {
return crawlDomain(new DomainLinks(), new CrawlDataReference());
}
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
try {
return crawlDomain(oldCrawlData, domainLinks);
}
@ -98,28 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
return probeResult;
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain);
EdgeUrl rootUrl;
if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
if (probeRootUrl(ip) instanceof HttpFetcherImpl.ProbeResultOk ok) rootUrl = ok.probedUrl();
else return 1;
// Sleep after the initial probe, we don't have access to the robots.txt yet
@ -130,12 +112,13 @@ public class CrawlerRetreiver implements AutoCloseable {
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer);
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
if (recrawled > 0) {
if (fetchedCount > 0) {
// If we have reference data, we will always grow the crawl depth a bit
crawlFrontier.increaseDepth(1.5, 2500);
}
@ -146,15 +129,6 @@ public class CrawlerRetreiver implements AutoCloseable {
// Add links from the sitemap to the crawl frontier
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
CrawledDomain ret = new CrawledDomain(domain,
null,
CrawlerDomainStatus.OK.name(),
null,
ip,
new ArrayList<>(),
null);
int fetchedCount = recrawled;
while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached()
@ -186,7 +160,6 @@ public class CrawlerRetreiver implements AutoCloseable {
if (!crawlFrontier.addVisited(top))
continue;
try {
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++;
@ -198,15 +171,28 @@ public class CrawlerRetreiver implements AutoCloseable {
}
}
ret.cookies = fetcher.getCookies();
return fetchedCount;
}
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private HttpFetcherImpl.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final HttpFetcherImpl.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
return probeResult;
}
private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
try {
logger.debug("Configuring link filter");
var url = rootUrl.withPathAndParam("/", null);
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
@ -291,7 +277,7 @@ public class CrawlerRetreiver implements AutoCloseable {
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
var doc = reference.doc();
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags);
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags);
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
new ContentType(doc.contentType, "UTF-8"),
@ -326,7 +312,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try {
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
}
catch (RateLimitException ex) {
catch (HttpFetcherImpl.RateLimitException ex) {
timer.waitRetryDelay(ex);
}
catch (Exception ex) {

View File

@ -1,6 +1,6 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult;
@ -38,7 +38,7 @@ public class CrawlerWarcResynchronizer {
accept(item);
}
} catch (Exception e) {
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
}
// Second pass, copy records to the new warc file
@ -47,7 +47,7 @@ public class CrawlerWarcResynchronizer {
recorder.resync(item);
}
} catch (Exception e) {
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
}
}
@ -63,7 +63,7 @@ public class CrawlerWarcResynchronizer {
}
catch (Exception ex) {
logger.info(STR."Failed to process warc record \{item}", ex);
logger.info("Failed to process warc record " + item, ex);
}
}
@ -78,7 +78,8 @@ public class CrawlerWarcResynchronizer {
}
private void request(WarcRequest request) {
EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
var url = new EdgeUrl(request.targetURI());
crawlFrontier.addVisited(url);
}
private void response(WarcResponse rsp) {
@ -97,7 +98,7 @@ public class CrawlerWarcResynchronizer {
});
}
catch (Exception e) {
logger.info(STR."Failed to parse response body for \{url}", e);
logger.info("Failed to parse response body for " + url, e);
}
}

View File

@ -9,9 +9,14 @@ import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document;
import java.net.URISyntaxException;
import java.util.*;
import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Objects;
import java.util.function.Predicate;
/** Encapsulates the crawl frontier for a single domain,
* that is information about known and visited URLs
*/
public class DomainCrawlFrontier {
private static final LinkParser linkParser = new LinkParser();

View File

@ -2,8 +2,8 @@ package nu.marginalia.crawl.retreival;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.ip_blocklist.IpBlockList;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@ -34,43 +34,18 @@ public class DomainProber {
* doesn't immediately redirect to another domain (which should be crawled separately, not under the name
* of this domain).
*/
public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
public HttpFetcherImpl.ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
if (firstUrlInQueue == null) {
logger.warn("No valid URLs for domain {}", domain);
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
}
if (!domainBlacklist.test(firstUrlInQueue.domain))
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
if (fetchResult.ok())
return new ProbeResultOk(fetchResult.url);
if (fetchResult.state == FetchResultState.REDIRECT)
return new ProbeResultRedirect(fetchResult.domain);
return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
return fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
}
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
}

View File

@ -1,21 +0,0 @@
package nu.marginalia.crawl.retreival;
public class RateLimitException extends Exception {
private final String retryAfter;
public RateLimitException(String retryAfter) {
this.retryAfter = retryAfter;
}
@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
public int retryAfter() {
try {
return Integer.parseInt(retryAfter);
}
catch (NumberFormatException ex) {
return 1000;
}
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.crawl.retreival.fetcher;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@AllArgsConstructor
@ToString
public class FetchResult {
public final FetchResultState state;
public final EdgeUrl url;
public final EdgeDomain domain;
public FetchResult(FetchResultState state, EdgeUrl url) {
this.state = state;
this.url = url;
this.domain = url.domain;
}
public boolean ok() {
return state == FetchResultState.OK;
}
}

View File

@ -1,7 +0,0 @@
package nu.marginalia.crawl.retreival.fetcher;
public enum FetchResultState {
OK,
REDIRECT,
ERROR
}

View File

@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit;
import com.google.common.base.Strings;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDocument;
@ -125,6 +125,7 @@ public class CrawlerRevisitor {
doc.contentType,
doc.httpStatus,
doc.documentBody,
doc.headers,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
);

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult;
import nu.marginalia.model.body.HttpFetchResult;

View File

@ -1,8 +1,8 @@
package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -144,7 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.httpStatus,
status.toString(),
"",
"",
nextRecord.headers,
bodyString,
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
nextRecord.url,

View File

@ -23,7 +23,6 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatusDesc;
@Nullable
@Deprecated // use getETag() or getLastModified() instead
public String headers;
public String documentBody;

View File

@ -29,7 +29,11 @@ public class CrawledDocumentParquetRecord {
public String contentType;
public byte[] body;
public String headers;
@Deprecated // will be replaced with the full headers field in the future
public String etagHeader;
@Deprecated // will be replaced with the full headers field in the future
public String lastModifiedHeader;
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
@ -51,7 +55,8 @@ public class CrawledDocumentParquetRecord {
Types.required(BINARY).as(stringType()).named("contentType"),
Types.required(BINARY).named("body"),
Types.optional(BINARY).as(stringType()).named("etagHeader"),
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"),
Types.optional(BINARY).as(stringType()).named("headers")
);
@ -67,6 +72,7 @@ public class CrawledDocumentParquetRecord {
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
case "etagHeader" -> etagHeader = (String) value;
case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
case "headers" -> headers = (String) value;
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
}
@ -82,6 +88,9 @@ public class CrawledDocumentParquetRecord {
valueWriter.write("cookies", cookies);
valueWriter.write("contentType", contentType);
valueWriter.write("body", body);
if (headers != null) {
valueWriter.write("headers", headers);
}
if (etagHeader != null) {
valueWriter.write("etagHeader", etagHeader);
}

View File

@ -16,6 +16,7 @@ import java.nio.file.Path;
import java.time.Instant;
import java.util.List;
import java.util.Objects;
import java.util.StringJoiner;
public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
private final ParquetWriter<CrawledDocumentParquetRecord> writer;
@ -150,6 +151,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
contentType = "";
}
String headersStr = null;
StringJoiner headersStrBuilder = new StringJoiner("\n");
for (var header : headers) {
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
}
headersStr = headersStrBuilder.toString();
write(new CrawledDocumentParquetRecord(
domain,
response.target(),
@ -159,6 +168,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
response.date(),
contentType,
bodyBytes,
headersStr,
headers.get("ETag"),
headers.get("Last-Modified"))
);
@ -179,6 +189,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
"x-marginalia/advisory;state=redirect",
new byte[0],
null,
null,
null
);
}
@ -192,6 +203,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
"x-marginalia/advisory;state=error",
errorStatus.getBytes(),
null,
null,
null
);
}
@ -206,6 +218,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
errorStatus,
new byte[0],
null,
null,
null
);
}

View File

@ -0,0 +1,124 @@
package nu.marginalia.crawl.logic;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.Random;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
class ContentTypeProberTest {
private static int port;
private static HttpServer server;
private static OkHttpClient client;
static EdgeUrl htmlEndpoint;
static EdgeUrl htmlRedirEndpoint;
static EdgeUrl binaryEndpoint;
static EdgeUrl timeoutEndpoint;
@BeforeEach
void setUp() throws IOException {
Random r = new Random();
port = r.nextInt(10000) + 8000;
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
server.createContext("/html", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.createContext("/redir", exchange -> {
exchange.getResponseHeaders().add("Location", "/html");
exchange.sendResponseHeaders(301, -1);
exchange.close();
});
server.createContext("/bin", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "application/binary");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.createContext("/timeout", exchange -> {
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
exchange.getResponseHeaders().add("Content-Type", "application/binary");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.start();
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
client = new OkHttpClient.Builder()
.readTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.connectTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.callTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.writeTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.build();
}
@AfterEach
void tearDown() {
server.stop(0);
client.dispatcher().executorService().shutdown();
client.connectionPool().evictAll();
}
@Test
void probeContentTypeOk() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(htmlEndpoint);
System.out.println(result);
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
}
@Test
void probeContentTypeRedir() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(htmlRedirEndpoint);
System.out.println(result);
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
}
@Test
void probeContentTypeBad() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(binaryEndpoint);
System.out.println(result);
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.BadContentType.class, result);
}
@Test
void probeContentTypeTimeout() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(timeoutEndpoint);
System.out.println(result);
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.Timeout.class, result);
}
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
@ -21,7 +21,8 @@ import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.zip.GZIPInputStream;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
class CrawlerWarcResynchronizerTest {
Path fileName;

View File

@ -1,7 +1,8 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
import nu.marginalia.crawl.logic.ContentTypeProber;
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.BadContentType;
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.Ok;
import nu.marginalia.model.EdgeUrl;
import okhttp3.ConnectionPool;
import okhttp3.Dispatcher;
@ -13,7 +14,7 @@ import java.net.URISyntaxException;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
class ContentTypeProberTest {

View File

@ -1,8 +1,9 @@
package nu.marginalia.crawl.retreival.fetcher;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
@ -80,6 +81,7 @@ class WarcRecorderTest {
"text/html",
200,
"<?doctype html><html><body>test</body></html>",
null,
ContentTags.empty());
}
@ -103,7 +105,7 @@ class WarcRecorderTest {
"text/html",
200,
null,
ContentTags.empty());
null, ContentTags.empty());
}
}
@ -115,7 +117,7 @@ class WarcRecorderTest {
"text/html",
200,
"<?doctype html><html><body>test</body></html>",
ContentTags.empty());
null, ContentTags.empty());
}
try (var reader = new WarcReader(fileNameWarc)) {

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.model.crawldata.CrawledDocument;
import org.junit.jupiter.api.Test;

View File

@ -1,11 +1,10 @@
package nu.marginalia.crawling;
import lombok.SneakyThrows;
import nu.marginalia.crawl.retreival.RateLimitException;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor;
@ -33,7 +32,7 @@ class HttpFetcherTest {
}
@Test
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
void fetchUTF8() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
@ -44,7 +43,7 @@ class HttpFetcherTest {
}
@Test
void fetchText() throws URISyntaxException, RateLimitException, IOException {
void fetchText() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
try (var recorder = new WarcRecorder()) {

View File

@ -2,10 +2,13 @@ package nu.marginalia.crawling.retreival;
import crawlercommons.robots.SimpleRobotRules;
import lombok.SneakyThrows;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.*;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
@ -68,7 +71,7 @@ public class CrawlerMockFetcherTest {
void crawl(CrawlSpecRecord spec) throws IOException {
try (var recorder = new WarcRecorder()) {
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
.fetch();
.crawlDomain();
}
}
@ -115,9 +118,9 @@ public class CrawlerMockFetcherTest {
public void clearCookies() {}
@Override
public FetchResult probeDomain(EdgeUrl url) {
public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
logger.info("Probing {}", url);
return new FetchResult(FetchResultState.OK, url);
return new HttpFetcherImpl.ProbeResultOk(url);
}
@SneakyThrows

View File

@ -4,10 +4,10 @@ import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.*;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
@ -468,7 +468,7 @@ class CrawlerRetreiverTest {
private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
try (var recorder = new WarcRecorder(tempFileWarc2)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
new CrawlDataReference(stream));
}
catch (IOException ex) {
@ -480,7 +480,7 @@ class CrawlerRetreiverTest {
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
try (var recorder = new WarcRecorder(tempFileWarc1)) {
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
crawler.fetch();
crawler.crawlDomain();
return crawler.getCrawlFrontier();
} catch (IOException ex) {
Assertions.fail(ex);

View File

@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames;
@ -120,7 +120,7 @@ public class IntegrationTest {
/** CREATE WARC */
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
new HttpFetcherImpl.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
"text/html", 200,
@ -134,6 +134,7 @@ public class IntegrationTest {
</body>
</html>
""",
"",
ContentTags.empty()
);
}
@ -204,7 +205,7 @@ public class IntegrationTest {
.setFetchSize(1000)
.build())
.setQueryStrategy("AUTO")
.setHumanQuery("\"This is how thinking works\"")
.setHumanQuery("\"is that there is\"")
.build();
var params = QueryProtobufCodec.convertRequest(request);

View File

@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
try {
Map<String, Object> requestData = Map.of(
"url", domain.toRootUrl().toString(),
"url", domain.toRootUrlHttp().toString(),
"options",
Map.of("fullPage", false,
"type", "png"),