Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(crawler) Refactor

* Restructure the code to make a bit more sense
* Store full headers in crawl data
* Fix a bug in the Retry-After handling, which assumed the timeout was in milliseconds and then clamped it to a lower bound of 500 ms, so the header was almost always handled incorrectly
This commit is contained in:
parent 9c292a4f62
commit e9854f194c
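Illustrative sketch of the corrected Retry-After handling described above, condensed from the HttpFetcherImpl and CrawlDelayTimer hunks in this commit. The standalone wrapper class is hypothetical; the logic mirrors the new code: the header value is read as whole seconds (not milliseconds), and the resulting wait is clamped to between one and five seconds.

    import java.time.Duration;

    // Hypothetical condensation of the new behaviour; the real code lives in
    // HttpFetcherImpl.RateLimitException and CrawlDelayTimer.waitRetryDelay()
    class RetryAfterSketch {

        // Retry-After carries whole seconds; the old code treated it as milliseconds
        static Duration parseRetryAfter(String headerValue) {
            try {
                return Duration.ofSeconds(Integer.parseInt(headerValue));
            }
            catch (NumberFormatException ex) {
                return Duration.ofSeconds(1); // fall back to a short default
            }
        }

        // Clamp the requested delay to [1 s, 5 s] before sleeping
        static void waitRetryDelay(String retryAfterHeader) throws InterruptedException {
            Duration delay = parseRetryAfter(retryAfterHeader);

            if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
                delay = Duration.ofSeconds(1);
            }
            else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
                delay = Duration.ofSeconds(5);
            }

            Thread.sleep(delay.toMillis());
        }
    }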
@ -7,11 +7,11 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
@ -266,7 +266,7 @@ public class CrawlingThenConvertingIntegrationTest {
List<SerializableCrawlData> data = new ArrayList<>();

try (var recorder = new WarcRecorder(fileName)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
}

CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
@ -10,12 +10,12 @@ import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.DomainLocks;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainLocks;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.spec.CrawlSpecProvider;
|
||||
import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
|
||||
import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
|
||||
@ -294,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
|
||||
Files.delete(tempFile);
|
||||
}
|
||||
|
||||
int size = retriever.fetch(domainLinks, reference);
|
||||
int size = retriever.crawlDomain(domainLinks, reference);
|
||||
|
||||
// Delete the reference crawl data if it's not the same as the new one
|
||||
// (mostly a case when migrating from legacy->warc)
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import okhttp3.Request;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import okhttp3.Cookie;
|
||||
import okhttp3.CookieJar;
|
@ -1,9 +1,8 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.google.inject.ImplementedBy;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
@ -17,12 +16,12 @@ public interface HttpFetcher {
|
||||
List<String> getCookies();
|
||||
void clearCookies();
|
||||
|
||||
FetchResult probeDomain(EdgeUrl url);
|
||||
HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url);
|
||||
|
||||
HttpFetchResult fetchContent(EdgeUrl url,
|
||||
WarcRecorder recorder,
|
||||
ContentTags tags,
|
||||
ProbeType probeType) throws RateLimitException;
|
||||
ProbeType probeType) throws HttpFetcherImpl.RateLimitException;
|
||||
|
||||
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
|
||||
|
@ -1,22 +1,23 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import crawlercommons.robots.SimpleRobotRulesParser;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.retreival.Cookies;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
|
||||
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.ContentTypeProber;
|
||||
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult;
|
||||
import nu.marginalia.crawl.logic.SoftIfModifiedSinceProber;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.ContentTypeLogic;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import okhttp3.ConnectionPool;
|
||||
import okhttp3.Dispatcher;
|
||||
import okhttp3.OkHttpClient;
|
||||
@ -25,6 +26,7 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.net.ssl.X509TrustManager;
|
||||
import java.time.Duration;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
@ -114,7 +116,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
*/
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
public ProbeResult probeDomain(EdgeUrl url) {
|
||||
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
|
||||
.url(url.toString())
|
||||
.build();
|
||||
@ -125,9 +127,9 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
|
||||
|
||||
if (!Objects.equals(requestUrl.domain, url.domain)) {
|
||||
return new FetchResult(FetchResultState.REDIRECT, requestUrl);
|
||||
return new ProbeResultRedirect(url.domain);
|
||||
}
|
||||
return new FetchResult(FetchResultState.OK, requestUrl);
|
||||
return new ProbeResultOk(requestUrl);
|
||||
}
|
||||
|
||||
catch (Exception ex) {
|
||||
@ -136,7 +138,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
||||
}
|
||||
|
||||
logger.info("Error during fetching {}", ex.getMessage());
|
||||
return new FetchResult(FetchResultState.ERROR, url);
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@ -196,8 +198,7 @@ public class HttpFetcherImpl implements HttpFetcher {

if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 429) {
String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000");
throw new RateLimitException(retryAfter);
throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
}
if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw();
@ -249,5 +250,44 @@
}


public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}

/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}

/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}

/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}


/** Exception thrown when the server signals the rate limit is exceeded */
public static class RateLimitException extends Exception {
private final String retryAfter;

public RateLimitException(String retryAfterHeader) {
this.retryAfter = retryAfterHeader;
}

@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }

public Duration retryAfter() {
try {
return Duration.ofSeconds(Integer.parseInt(retryAfter));
}
catch (NumberFormatException ex) {
return Duration.ofSeconds(1);
}
}
}
}
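For orientation, a minimal sketch of how a caller can branch over the relocated sealed ProbeResult hierarchy defined above; the wrapper class and log strings are illustrative, while the real call sites (DomainProber, CrawlerRetreiver, WarcRecorder) appear in later hunks.

    import nu.marginalia.crawl.fetcher.HttpFetcherImpl;

    // Hypothetical consumer; the switch is exhaustive because ProbeResult is
    // sealed over exactly these three records
    class ProbeResultSketch {
        static void handle(HttpFetcherImpl.ProbeResult result) {
            switch (result) {
                case HttpFetcherImpl.ProbeResultOk ok ->
                        System.out.println("probe ok: " + ok.probedUrl());
                case HttpFetcherImpl.ProbeResultRedirect redirect ->
                        System.out.println("redirects to another domain: " + redirect.domain());
                case HttpFetcherImpl.ProbeResultError error ->
                        System.out.println("probe failed: " + error.status() + "; " + error.desc());
            }
        }
    }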
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
package nu.marginalia.crawl.fetcher;
|
||||
|
||||
import crawlercommons.sitemaps.*;
|
||||
import nu.marginalia.model.EdgeUrl;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.socket;
|
||||
package nu.marginalia.crawl.fetcher.socket;
|
||||
|
||||
import javax.net.SocketFactory;
|
||||
import java.io.IOException;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.socket;
|
||||
package nu.marginalia.crawl.fetcher.socket;
|
||||
|
||||
import okhttp3.Interceptor;
|
||||
import okhttp3.Response;
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.socket;
|
||||
package nu.marginalia.crawl.fetcher.socket;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import org.netpreserve.jwarc.WarcDigest;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import okhttp3.Headers;
|
||||
import okhttp3.Response;
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import okhttp3.Protocol;
|
||||
import okhttp3.Request;
|
||||
import okhttp3.Response;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
@ -73,7 +72,7 @@ public class WarcProtocolReconstructor {
|
||||
|
||||
String headerString = getHeadersAsString(headersAsString);
|
||||
|
||||
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
|
||||
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
static String getResponseHeader(Response response, long size) {
|
||||
@ -84,7 +83,7 @@ public class WarcProtocolReconstructor {
|
||||
|
||||
String headerString = getHeadersAsString(response, size);
|
||||
|
||||
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n";
|
||||
return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
|
||||
}
|
||||
|
||||
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(
|
@ -1,13 +1,14 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher.warc;
|
||||
package nu.marginalia.crawl.fetcher.warc;
|
||||
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.netpreserve.jwarc.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -183,7 +184,7 @@ public class WarcRecorder implements AutoCloseable {
|
||||
writer.write(item);
|
||||
}
|
||||
|
||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) {
|
||||
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) {
|
||||
try {
|
||||
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
|
||||
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
|
||||
@ -192,24 +193,42 @@

if (documentBody == null) {
bytes = new byte[0];
}
else {
} else {
bytes = documentBody.getBytes();
}

StringJoiner fakeHeadersBuilder = new StringJoiner("\n");
// Create a synthesis of custom headers and the original headers
// to create a new set of headers that will be written to the WARC file.

fakeHeadersBuilder.add(STR."Content-Type: \{contentType}");
fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}");
StringJoiner syntheticHeadersBuilder = new StringJoiner("\n");

syntheticHeadersBuilder.add("Content-Type: " + contentType);
syntheticHeadersBuilder.add("Content-Length: " + bytes.length);
if (contentTags.etag() != null) {
fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}");
syntheticHeadersBuilder.add("ETag: " + contentTags.etag());
}
if (contentTags.lastMod() != null) {
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}");
syntheticHeadersBuilder.add("Last-Modified: " + contentTags.lastMod());
}

// Grab the headers from the original response and add them to the fake headers if they are not
// Content-Type, Content-Length, ETag, or Last-Modified
for (String headerLine : Objects.requireNonNullElse(headers, "").split("\n")) {
if (headerLine.isBlank()) continue;

var lowerCase = headerLine.toLowerCase();

if (lowerCase.startsWith("content-type:")) continue;
if (lowerCase.startsWith("content-length:")) continue;

if (contentTags.etag() != null && lowerCase.startsWith("etag:")) continue;
if (contentTags.lastMod() != null && lowerCase.startsWith("last-modified:")) continue;

syntheticHeadersBuilder.add(headerLine);
}

byte[] header = WarcProtocolReconstructor
.getResponseHeader(fakeHeadersBuilder.toString(), statusCode)
.getResponseHeader(syntheticHeadersBuilder.toString(), statusCode)
.getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
responseDataBuffer.put(header);
@ -244,25 +263,25 @@ public class WarcRecorder implements AutoCloseable {
|
||||
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
|
||||
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
|
||||
*/
|
||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, ctags);
|
||||
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) {
|
||||
saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
|
||||
}
|
||||
|
||||
public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException {
|
||||
public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.ProbeResult result) throws IOException {
|
||||
|
||||
Map<String, List<String>> fields = new HashMap<>();
|
||||
fields.put("ip", List.of(ip));
|
||||
fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}"));
|
||||
fields.put("software", List.of("search.marginalia.nu/" + warcRecorderVersion));
|
||||
fields.put("domain", List.of(domain.toString()));
|
||||
|
||||
switch (result) {
|
||||
case DomainProber.ProbeResultRedirect redirectDomain:
|
||||
fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}"));
|
||||
case HttpFetcherImpl.ProbeResultRedirect redirectDomain:
|
||||
fields.put("X-WARC-Probe-Status", List.of("REDIRECT;" + redirectDomain.domain()));
|
||||
break;
|
||||
case DomainProber.ProbeResultError error:
|
||||
fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}"));
|
||||
case HttpFetcherImpl.ProbeResultError error:
|
||||
fields.put("X-WARC-Probe-Status", List.of(error.status().toString() + ";" + error.desc()));
|
||||
break;
|
||||
case DomainProber.ProbeResultOk ok:
|
||||
case HttpFetcherImpl.ProbeResultOk ok:
|
||||
fields.put("X-WARC-Probe-Status", List.of("OK"));
|
||||
break;
|
||||
}
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
package nu.marginalia.crawl.logic;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.ContentTypeLogic;
|
||||
@ -7,7 +7,7 @@ import okhttp3.Request;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.SocketTimeoutException;
|
||||
import java.io.InterruptedIOException;
|
||||
import java.util.Objects;
|
||||
|
||||
public class ContentTypeProber {
|
||||
@ -68,7 +68,7 @@ public class ContentTypeProber {
|
||||
|
||||
return new ContentTypeProbeResult.Ok(ret);
|
||||
|
||||
} catch (SocketTimeoutException ex) {
|
||||
} catch (InterruptedIOException ex) {
|
||||
return new ContentTypeProbeResult.Timeout(ex);
|
||||
} catch (Exception ex) {
|
||||
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
package nu.marginalia.crawl.logic;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
@ -1,4 +1,4 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
package nu.marginalia.crawl.logic;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
@ -26,6 +26,20 @@ public class LinkFilterSelector {
|
||||
if (isDiscourse(head)) {
|
||||
return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
|
||||
}
|
||||
if (isMediawiki(head)) {
|
||||
return url -> {
|
||||
if (url.path.endsWith(".php")) {
|
||||
return false;
|
||||
}
|
||||
if (url.path.contains("Special:")) {
|
||||
return false;
|
||||
}
|
||||
if (url.path.contains("Talk:")) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
}
|
||||
|
||||
return LinkFilterSelector::defaultFilter;
|
||||
}
|
@ -1,6 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
package nu.marginalia.crawl.logic;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import okhttp3.OkHttpClient;
|
||||
import okhttp3.Request;
|
@ -1,6 +1,9 @@
package nu.marginalia.crawl.retreival;

import lombok.SneakyThrows;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;

import java.time.Duration;

import static java.lang.Math.max;
import static java.lang.Math.min;
@ -22,12 +25,21 @@ public class CrawlDelayTimer {

/** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
* set a flag that slows down the main crawl delay as well. */
public void waitRetryDelay(RateLimitException ex) throws InterruptedException {
public void waitRetryDelay(HttpFetcherImpl.RateLimitException ex) throws InterruptedException {
slowDown = true;

int delay = ex.retryAfter();
Duration delay = ex.retryAfter();

Thread.sleep(Math.clamp(delay, 100, 5000));
if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
// If the server wants us to retry in less than a second, we'll just wait a bit
delay = Duration.ofSeconds(1);
}
else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
// If the server wants us to retry in more than a minute, we'll wait a bit
delay = Duration.ofSeconds(5);
}

Thread.sleep(delay.toMillis());
}

@SneakyThrows
@ -1,91 +0,0 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
|
||||
|
||||
import java.time.LocalDateTime;
|
||||
import java.util.Objects;
|
||||
|
||||
public class CrawledDocumentFactory {
|
||||
|
||||
public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
|
||||
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
public static CrawledDocument createUnknownHostError(EdgeUrl url) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
|
||||
.crawlerStatusDesc("Unknown Host")
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus("Timeout")
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus(status.toString())
|
||||
.crawlerStatusDesc(why)
|
||||
.headers(rsp.headers().toString())
|
||||
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.httpStatus(rsp.statusCode())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus(status.toString())
|
||||
.crawlerStatusDesc(why)
|
||||
.headers("")
|
||||
.contentType(contentType)
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.httpStatus(statusCode)
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
|
||||
|
||||
return CrawledDocument.builder()
|
||||
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
|
||||
.redirectUrl(responseUrl.toString())
|
||||
.headers(rsp.headers().toString())
|
||||
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.httpStatus(rsp.statusCode())
|
||||
.url(url.toString())
|
||||
.build();
|
||||
}
|
||||
|
||||
public static CrawledDocument createRobotsError(EdgeUrl url) {
|
||||
return CrawledDocument.builder()
|
||||
.url(url.toString())
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.httpStatus(-1)
|
||||
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
|
||||
.build();
|
||||
}
|
||||
public static CrawledDocument createRetryError(EdgeUrl url) {
|
||||
return CrawledDocument.builder()
|
||||
.url(url.toString())
|
||||
.timestamp(LocalDateTime.now().toString())
|
||||
.httpStatus(429)
|
||||
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
|
||||
.build();
|
||||
}
|
||||
}
|
@ -3,9 +3,11 @@ package nu.marginalia.crawl.retreival;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.contenttype.ContentType;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.logic.LinkFilterSelector;
|
||||
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
|
||||
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
|
||||
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
|
||||
@ -14,8 +16,6 @@ import nu.marginalia.link_parser.LinkParser;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDomain;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import nu.marginalia.model.crawlspec.CrawlSpecRecord;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.slf4j.Logger;
|
||||
@ -25,7 +25,6 @@ import java.io.IOException;
|
||||
import java.net.InetAddress;
|
||||
import java.net.UnknownHostException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
@ -84,11 +83,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
return crawlFrontier;
|
||||
}
|
||||
|
||||
public int fetch() {
|
||||
return fetch(new DomainLinks(), new CrawlDataReference());
|
||||
public int crawlDomain() {
|
||||
return crawlDomain(new DomainLinks(), new CrawlDataReference());
|
||||
}
|
||||
|
||||
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
||||
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
||||
try {
|
||||
return crawlDomain(oldCrawlData, domainLinks);
|
||||
}
|
||||
@ -98,28 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
public void syncAbortedRun(Path warcFile) {
|
||||
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
|
||||
|
||||
resync.run(warcFile);
|
||||
}
|
||||
|
||||
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
|
||||
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
|
||||
// start with http and then try https if that fails
|
||||
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
|
||||
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
|
||||
|
||||
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
|
||||
|
||||
return probeResult;
|
||||
}
|
||||
|
||||
private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
|
||||
String ip = findIp(domain);
|
||||
EdgeUrl rootUrl;
|
||||
|
||||
if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
|
||||
if (probeRootUrl(ip) instanceof HttpFetcherImpl.ProbeResultOk ok) rootUrl = ok.probedUrl();
|
||||
else return 1;
|
||||
|
||||
// Sleep after the initial probe, we don't have access to the robots.txt yet
|
||||
@ -130,12 +112,13 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
|
||||
|
||||
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
||||
|
||||
sniffRootDocument(rootUrl, delayTimer);
|
||||
|
||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
|
||||
int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
|
||||
|
||||
if (recrawled > 0) {
|
||||
if (fetchedCount > 0) {
|
||||
// If we have reference data, we will always grow the crawl depth a bit
|
||||
crawlFrontier.increaseDepth(1.5, 2500);
|
||||
}
|
||||
@ -146,15 +129,6 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
// Add links from the sitemap to the crawl frontier
|
||||
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
|
||||
|
||||
CrawledDomain ret = new CrawledDomain(domain,
|
||||
null,
|
||||
CrawlerDomainStatus.OK.name(),
|
||||
null,
|
||||
ip,
|
||||
new ArrayList<>(),
|
||||
null);
|
||||
|
||||
int fetchedCount = recrawled;
|
||||
|
||||
while (!crawlFrontier.isEmpty()
|
||||
&& !crawlFrontier.isCrawlDepthReached()
|
||||
@ -186,7 +160,6 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
if (!crawlFrontier.addVisited(top))
|
||||
continue;
|
||||
|
||||
|
||||
try {
|
||||
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
|
||||
fetchedCount++;
|
||||
@ -198,15 +171,28 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
ret.cookies = fetcher.getCookies();
|
||||
|
||||
return fetchedCount;
|
||||
}
|
||||
|
||||
public void syncAbortedRun(Path warcFile) {
|
||||
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
|
||||
|
||||
resync.run(warcFile);
|
||||
}
|
||||
|
||||
private HttpFetcherImpl.ProbeResult probeRootUrl(String ip) throws IOException {
|
||||
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
|
||||
// start with http and then try https if that fails
|
||||
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
|
||||
final HttpFetcherImpl.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
|
||||
|
||||
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
|
||||
|
||||
return probeResult;
|
||||
}
|
||||
|
||||
private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
|
||||
try {
|
||||
logger.debug("Configuring link filter");
|
||||
|
||||
var url = rootUrl.withPathAndParam("/", null);
|
||||
|
||||
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||
@ -291,7 +277,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
|
||||
var doc = reference.doc();
|
||||
|
||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags);
|
||||
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags);
|
||||
|
||||
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
|
||||
new ContentType(doc.contentType, "UTF-8"),
|
||||
@ -326,7 +312,7 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
try {
|
||||
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
|
||||
}
|
||||
catch (RateLimitException ex) {
|
||||
catch (HttpFetcherImpl.RateLimitException ex) {
|
||||
timer.waitRetryDelay(ex);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
|
@ -1,6 +1,6 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
@ -38,7 +38,7 @@ public class CrawlerWarcResynchronizer {
|
||||
accept(item);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
|
||||
logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
|
||||
}
|
||||
|
||||
// Second pass, copy records to the new warc file
|
||||
@ -47,7 +47,7 @@ public class CrawlerWarcResynchronizer {
|
||||
recorder.resync(item);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}");
|
||||
logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@ -63,7 +63,7 @@ public class CrawlerWarcResynchronizer {
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.info(STR."Failed to process warc record \{item}", ex);
|
||||
logger.info("Failed to process warc record " + item, ex);
|
||||
}
|
||||
}
|
||||
|
||||
@ -78,7 +78,8 @@ public class CrawlerWarcResynchronizer {
|
||||
}
|
||||
|
||||
private void request(WarcRequest request) {
|
||||
EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited);
|
||||
var url = new EdgeUrl(request.targetURI());
|
||||
crawlFrontier.addVisited(url);
|
||||
}
|
||||
|
||||
private void response(WarcResponse rsp) {
|
||||
@ -97,7 +98,7 @@ public class CrawlerWarcResynchronizer {
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.info(STR."Failed to parse response body for \{url}", e);
|
||||
logger.info("Failed to parse response body for " + url, e);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9,9 +9,14 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import org.jsoup.nodes.Document;
|
||||
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayDeque;
|
||||
import java.util.Collection;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
/** Encapsulates the crawl frontier for a single domain,
|
||||
* that is information about known and visited URLs
|
||||
*/
|
||||
public class DomainCrawlFrontier {
|
||||
|
||||
private static final LinkParser linkParser = new LinkParser();
|
||||
|
@ -2,8 +2,8 @@ package nu.marginalia.crawl.retreival;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.ip_blocklist.IpBlockList;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
@ -34,43 +34,18 @@ public class DomainProber {
|
||||
* doesn't immediately redirect to another domain (which should be crawled separately, not under the name
|
||||
* of this domain).
|
||||
*/
|
||||
public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
|
||||
public HttpFetcherImpl.ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
|
||||
|
||||
if (firstUrlInQueue == null) {
|
||||
logger.warn("No valid URLs for domain {}", domain);
|
||||
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
|
||||
return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
|
||||
}
|
||||
|
||||
if (!domainBlacklist.test(firstUrlInQueue.domain))
|
||||
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
|
||||
return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
|
||||
|
||||
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
|
||||
|
||||
if (fetchResult.ok())
|
||||
return new ProbeResultOk(fetchResult.url);
|
||||
|
||||
if (fetchResult.state == FetchResultState.REDIRECT)
|
||||
return new ProbeResultRedirect(fetchResult.domain);
|
||||
|
||||
return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
|
||||
return fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
|
||||
}
|
||||
|
||||
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
|
||||
|
||||
/** The probing failed for one reason or another
|
||||
* @param status Machine readable status
|
||||
* @param desc Human-readable description of the error
|
||||
*/
|
||||
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
|
||||
|
||||
/** This domain redirects to another domain */
|
||||
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
|
||||
|
||||
/** If the retrieval of the probed url was successful, return the url as it was fetched
|
||||
* (which may be different from the url we probed, if we attempted another URL schema).
|
||||
*
|
||||
* @param probedUrl The url we successfully probed
|
||||
*/
|
||||
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
|
||||
}
|
||||
|
@ -1,21 +0,0 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
public class RateLimitException extends Exception {
|
||||
private final String retryAfter;
|
||||
|
||||
public RateLimitException(String retryAfter) {
|
||||
this.retryAfter = retryAfter;
|
||||
}
|
||||
|
||||
@Override
|
||||
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
|
||||
|
||||
public int retryAfter() {
|
||||
try {
|
||||
return Integer.parseInt(retryAfter);
|
||||
}
|
||||
catch (NumberFormatException ex) {
|
||||
return 1000;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,24 +0,0 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
@AllArgsConstructor
|
||||
@ToString
|
||||
public class FetchResult {
|
||||
public final FetchResultState state;
|
||||
public final EdgeUrl url;
|
||||
public final EdgeDomain domain;
|
||||
|
||||
public FetchResult(FetchResultState state, EdgeUrl url) {
|
||||
this.state = state;
|
||||
this.url = url;
|
||||
this.domain = url.domain;
|
||||
}
|
||||
|
||||
public boolean ok() {
|
||||
return state == FetchResultState.OK;
|
||||
}
|
||||
}
|
@ -1,7 +0,0 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
public enum FetchResultState {
|
||||
OK,
|
||||
REDIRECT,
|
||||
ERROR
|
||||
}
|
@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit;
|
||||
|
||||
import com.google.common.base.Strings;
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.CrawlDelayTimer;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
@ -125,6 +125,7 @@ public class CrawlerRevisitor {
|
||||
doc.contentType,
|
||||
doc.httpStatus,
|
||||
doc.documentBody,
|
||||
doc.headers,
|
||||
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
|
||||
);
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival.revisit;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
import nu.marginalia.model.body.DocumentBodyResult;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.crawl.retreival.sitemap;
|
||||
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import nu.marginalia.crawl.fetcher.SitemapRetriever;
|
||||
import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
|
||||
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -144,7 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
||||
nextRecord.httpStatus,
|
||||
status.toString(),
|
||||
"",
|
||||
"",
|
||||
nextRecord.headers,
|
||||
bodyString,
|
||||
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
|
||||
nextRecord.url,
|
||||
|
@ -23,7 +23,6 @@ public class CrawledDocument implements SerializableCrawlData {
|
||||
public String crawlerStatusDesc;
|
||||
|
||||
@Nullable
|
||||
@Deprecated // use getETag() or getLastModified() instead
|
||||
public String headers;
|
||||
|
||||
public String documentBody;
|
||||
|
@ -29,7 +29,11 @@ public class CrawledDocumentParquetRecord {
|
||||
public String contentType;
|
||||
public byte[] body;
|
||||
|
||||
public String headers;
|
||||
|
||||
@Deprecated // will be replaced with the full headers field in the future
|
||||
public String etagHeader;
|
||||
@Deprecated // will be replaced with the full headers field in the future
|
||||
public String lastModifiedHeader;
|
||||
|
||||
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
|
||||
@ -51,7 +55,8 @@ public class CrawledDocumentParquetRecord {
|
||||
Types.required(BINARY).as(stringType()).named("contentType"),
|
||||
Types.required(BINARY).named("body"),
|
||||
Types.optional(BINARY).as(stringType()).named("etagHeader"),
|
||||
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader")
|
||||
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"),
|
||||
Types.optional(BINARY).as(stringType()).named("headers")
|
||||
);
|
||||
|
||||
|
||||
@ -67,6 +72,7 @@ public class CrawledDocumentParquetRecord {
|
||||
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
|
||||
case "etagHeader" -> etagHeader = (String) value;
|
||||
case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
|
||||
case "headers" -> headers = (String) value;
|
||||
|
||||
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
|
||||
}
|
||||
@ -82,6 +88,9 @@ public class CrawledDocumentParquetRecord {
|
||||
valueWriter.write("cookies", cookies);
|
||||
valueWriter.write("contentType", contentType);
|
||||
valueWriter.write("body", body);
|
||||
if (headers != null) {
|
||||
valueWriter.write("headers", headers);
|
||||
}
|
||||
if (etagHeader != null) {
|
||||
valueWriter.write("etagHeader", etagHeader);
|
||||
}
|
||||
|
@ -16,6 +16,7 @@ import java.nio.file.Path;
|
||||
import java.time.Instant;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
private final ParquetWriter<CrawledDocumentParquetRecord> writer;
|
||||
@ -150,6 +151,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
contentType = "";
|
||||
}
|
||||
|
||||
String headersStr = null;
|
||||
StringJoiner headersStrBuilder = new StringJoiner("\n");
|
||||
for (var header : headers) {
|
||||
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
|
||||
}
|
||||
headersStr = headersStrBuilder.toString();
|
||||
|
||||
|
||||
write(new CrawledDocumentParquetRecord(
|
||||
domain,
|
||||
response.target(),
|
||||
@ -159,6 +168,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
response.date(),
|
||||
contentType,
|
||||
bodyBytes,
|
||||
headersStr,
|
||||
headers.get("ETag"),
|
||||
headers.get("Last-Modified"))
|
||||
);
|
||||
@ -179,6 +189,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
"x-marginalia/advisory;state=redirect",
|
||||
new byte[0],
|
||||
null,
|
||||
null,
|
||||
null
|
||||
);
|
||||
}
|
||||
@ -192,6 +203,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
"x-marginalia/advisory;state=error",
|
||||
errorStatus.getBytes(),
|
||||
null,
|
||||
null,
|
||||
null
|
||||
);
|
||||
}
|
||||
@ -206,6 +218,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
|
||||
errorStatus,
|
||||
new byte[0],
|
||||
null,
|
||||
null,
|
||||
null
|
||||
);
|
||||
}
|
||||
|
@ -0,0 +1,124 @@
|
||||
package nu.marginalia.crawl.logic;
|
||||
|
||||
import com.sun.net.httpserver.HttpServer;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import okhttp3.OkHttpClient;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.util.Random;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
|
||||
|
||||
class ContentTypeProberTest {
|
||||
|
||||
private static int port;
|
||||
private static HttpServer server;
|
||||
private static OkHttpClient client;
|
||||
|
||||
static EdgeUrl htmlEndpoint;
|
||||
static EdgeUrl htmlRedirEndpoint;
|
||||
static EdgeUrl binaryEndpoint;
|
||||
static EdgeUrl timeoutEndpoint;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws IOException {
|
||||
Random r = new Random();
|
||||
port = r.nextInt(10000) + 8000;
|
||||
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
|
||||
|
||||
server.createContext("/html", exchange -> {
|
||||
exchange.getResponseHeaders().add("Content-Type", "text/html");
|
||||
exchange.sendResponseHeaders(200, -1);
|
||||
exchange.close();
|
||||
});
|
||||
server.createContext("/redir", exchange -> {
|
||||
exchange.getResponseHeaders().add("Location", "/html");
|
||||
exchange.sendResponseHeaders(301, -1);
|
||||
exchange.close();
|
||||
});
|
||||
|
||||
server.createContext("/bin", exchange -> {
|
||||
exchange.getResponseHeaders().add("Content-Type", "application/binary");
|
||||
exchange.sendResponseHeaders(200, -1);
|
||||
exchange.close();
|
||||
});
|
||||
|
||||
server.createContext("/timeout", exchange -> {
|
||||
try {
|
||||
Thread.sleep(2000);
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
exchange.getResponseHeaders().add("Content-Type", "application/binary");
|
||||
exchange.sendResponseHeaders(200, -1);
|
||||
exchange.close();
|
||||
});
|
||||
|
||||
server.start();
|
||||
|
||||
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
|
||||
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
|
||||
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
|
||||
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
|
||||
|
||||
client = new OkHttpClient.Builder()
|
||||
.readTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
|
||||
.connectTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
|
||||
.callTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
|
||||
.writeTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
|
||||
.build();
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
void tearDown() {
|
||||
server.stop(0);
|
||||
client.dispatcher().executorService().shutdown();
|
||||
client.connectionPool().evictAll();
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeOk() {
|
||||
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
|
||||
.probeContentType(htmlEndpoint);
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeRedir() {
|
||||
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
|
||||
.probeContentType(htmlRedirEndpoint);
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeBad() {
|
||||
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
|
||||
.probeContentType(binaryEndpoint);
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.BadContentType.class, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
void probeContentTypeTimeout() {
|
||||
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
|
||||
.probeContentType(timeoutEndpoint);
|
||||
|
||||
System.out.println(result);
|
||||
|
||||
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.Timeout.class, result);
|
||||
}
|
||||
}
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival;
|
||||
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import okhttp3.OkHttpClient;
|
||||
@ -21,7 +21,8 @@ import java.security.NoSuchAlgorithmException;
|
||||
import java.util.List;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
class CrawlerWarcResynchronizerTest {
|
||||
Path fileName;
|
||||
|
@ -1,7 +1,8 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
|
||||
import nu.marginalia.crawl.logic.ContentTypeProber;
|
||||
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.BadContentType;
|
||||
import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.Ok;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import okhttp3.ConnectionPool;
|
||||
import okhttp3.Dispatcher;
|
||||
@ -13,7 +14,7 @@ import java.net.URISyntaxException;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class ContentTypeProberTest {
|
||||
|
||||
|
@ -1,8 +1,9 @@
|
||||
package nu.marginalia.crawl.retreival.fetcher;
|
||||
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
|
||||
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
|
||||
@ -80,6 +81,7 @@ class WarcRecorderTest {
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>",
|
||||
null,
|
||||
ContentTags.empty());
|
||||
}
|
||||
|
||||
@ -103,7 +105,7 @@ class WarcRecorderTest {
|
||||
"text/html",
|
||||
200,
|
||||
null,
|
||||
ContentTags.empty());
|
||||
null, ContentTags.empty());
|
||||
}
|
||||
|
||||
}
|
||||
@ -115,7 +117,7 @@ class WarcRecorderTest {
|
||||
"text/html",
|
||||
200,
|
||||
"<?doctype html><html><body>test</body></html>",
|
||||
ContentTags.empty());
|
||||
null, ContentTags.empty());
|
||||
}
|
||||
|
||||
try (var reader = new WarcReader(fileNameWarc)) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
package nu.marginalia.crawl.retreival.revisit;
|
||||
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.CrawlDataReference;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
|
@ -1,11 +1,10 @@
|
||||
package nu.marginalia.crawling;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.retreival.RateLimitException;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.ContentTypeLogic;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
@ -33,7 +32,7 @@ class HttpFetcherTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
|
||||
void fetchUTF8() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
|
||||
@ -44,7 +43,7 @@ class HttpFetcherTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
void fetchText() throws URISyntaxException, RateLimitException, IOException {
|
||||
void fetchText() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
|
||||
var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
|
||||
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
|
@ -2,10 +2,13 @@ package nu.marginalia.crawling.retreival;
|
||||
|
||||
import crawlercommons.robots.SimpleRobotRules;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.SitemapRetriever;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.CrawlerRetreiver;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.*;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
@ -68,7 +71,7 @@ public class CrawlerMockFetcherTest {
|
||||
void crawl(CrawlSpecRecord spec) throws IOException {
|
||||
try (var recorder = new WarcRecorder()) {
|
||||
new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
|
||||
.fetch();
|
||||
.crawlDomain();
|
||||
}
|
||||
}
|
||||
|
||||
@ -115,9 +118,9 @@ public class CrawlerMockFetcherTest {
|
||||
public void clearCookies() {}
|
||||
|
||||
@Override
|
||||
public FetchResult probeDomain(EdgeUrl url) {
|
||||
public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
|
||||
logger.info("Probing {}", url);
|
||||
return new FetchResult(FetchResultState.OK, url);
|
||||
return new HttpFetcherImpl.ProbeResultOk(url);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
|
@ -4,10 +4,10 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.UserAgent;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.retreival.*;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
|
||||
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.io.crawldata.CrawledDomainReader;
|
||||
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -468,7 +468,7 @@ class CrawlerRetreiverTest {
|
||||
|
||||
private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
|
||||
try (var recorder = new WarcRecorder(tempFileWarc2)) {
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
|
||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
|
||||
new CrawlDataReference(stream));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
@ -480,7 +480,7 @@ class CrawlerRetreiverTest {
|
||||
private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
|
||||
try (var recorder = new WarcRecorder(tempFileWarc1)) {
|
||||
var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
|
||||
crawler.fetch();
|
||||
crawler.crawlDomain();
|
||||
return crawler.getCrawlFrontier();
|
||||
} catch (IOException ex) {
|
||||
Assertions.fail(ex);
|
||||
|
@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.RpcQueryLimits;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.crawl.retreival.DomainProber;
|
||||
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.crawl.fetcher.ContentTags;
|
||||
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
|
||||
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
|
||||
import nu.marginalia.functions.searchquery.QueryFactory;
|
||||
import nu.marginalia.index.IndexGrpcService;
|
||||
import nu.marginalia.index.ReverseIndexFullFileNames;
|
||||
@ -120,7 +120,7 @@ public class IntegrationTest {
|
||||
/** CREATE WARC */
|
||||
try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
|
||||
warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
|
||||
new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
|
||||
new HttpFetcherImpl.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
|
||||
|
||||
warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
|
||||
"text/html", 200,
|
||||
@ -134,6 +134,7 @@ public class IntegrationTest {
|
||||
</body>
|
||||
</html>
|
||||
""",
|
||||
"",
|
||||
ContentTags.empty()
|
||||
);
|
||||
}
|
||||
@ -204,7 +205,7 @@ public class IntegrationTest {
|
||||
.setFetchSize(1000)
|
||||
.build())
|
||||
.setQueryStrategy("AUTO")
|
||||
.setHumanQuery("\"This is how thinking works\"")
|
||||
.setHumanQuery("\"is that there is\"")
|
||||
.build();
|
||||
|
||||
var params = QueryProtobufCodec.convertRequest(request);
|
||||
|
@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
|
||||
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
|
||||
try {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", domain.toRootUrl().toString(),
|
||||
"url", domain.toRootUrlHttp().toString(),
|
||||
"options",
|
||||
Map.of("fullPage", false,
|
||||
"type", "png"),