(crawler) Refactor

* Restructure the code to make a bit more sense
* Store full headers in crawl data
* Fix a bug in Retry-After header handling that assumed the timeout was in milliseconds and then clamped it to a lower bound of 500ms, meaning the header was almost always handled incorrectly (see the sketch below)
Viktor Lofgren 2024-09-23 17:51:07 +02:00
parent 9c292a4f62
commit e9854f194c
44 changed files with 398 additions and 337 deletions
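
Before the file-by-file diff, here is a minimal sketch of the corrected Retry-After handling from the third bullet: the header value is a number of seconds (the HTTP-date form of Retry-After is not handled, matching the commit), and the resulting delay is bounded to the same one-to-five-second window CrawlDelayTimer uses below. Class and method names here are illustrative only, not project code.

import java.time.Duration;

class RetryAfterSketch {
    // Retry-After carries seconds, not milliseconds; parse it accordingly and
    // fall back to one second when the header is missing or malformed.
    static Duration parseRetryAfter(String headerValue) {
        try {
            return Duration.ofSeconds(Integer.parseInt(headerValue));
        }
        catch (NumberFormatException ex) {
            return Duration.ofSeconds(1);
        }
    }

    // Bound the wait to the 1..5 second window used by CrawlDelayTimer in this commit.
    static void waitRetryDelay(String retryAfterHeader) throws InterruptedException {
        Duration delay = parseRetryAfter(retryAfterHeader);

        if (delay.compareTo(Duration.ofSeconds(1)) < 0) delay = Duration.ofSeconds(1);
        else if (delay.compareTo(Duration.ofSeconds(5)) > 0) delay = Duration.ofSeconds(5);

        Thread.sleep(delay.toMillis());
    }
}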

View File

@ -7,11 +7,11 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream; import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
@ -266,7 +266,7 @@ public class CrawlingThenConvertingIntegrationTest {
List<SerializableCrawlData> data = new ArrayList<>(); List<SerializableCrawlData> data = new ArrayList<>();
try (var recorder = new WarcRecorder(fileName)) { try (var recorder = new WarcRecorder(fileName)) {
new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch(); new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
} }
CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,

View File

@ -10,12 +10,12 @@ import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSource;
import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.atags.source.AnchorTagsSourceFactory;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.DomainLocks;
import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainLocks;
import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.spec.CrawlSpecProvider; import nu.marginalia.crawl.spec.CrawlSpecProvider;
import nu.marginalia.crawl.spec.DbCrawlSpecProvider; import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider; import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
@ -294,7 +294,7 @@ public class CrawlerMain extends ProcessMainClass {
Files.delete(tempFile); Files.delete(tempFile);
} }
int size = retriever.fetch(domainLinks, reference); int size = retriever.crawlDomain(domainLinks, reference);
// Delete the reference crawl data if it's not the same as the new one // Delete the reference crawl data if it's not the same as the new one
// (mostly a case when migrating from legacy->warc) // (mostly a case when migrating from legacy->warc)

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher; package nu.marginalia.crawl.fetcher;
import okhttp3.Request; import okhttp3.Request;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.fetcher;
import okhttp3.Cookie; import okhttp3.Cookie;
import okhttp3.CookieJar; import okhttp3.CookieJar;

View File

@ -1,9 +1,8 @@
package nu.marginalia.crawl.retreival.fetcher; package nu.marginalia.crawl.fetcher;
import com.google.inject.ImplementedBy; import com.google.inject.ImplementedBy;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
@ -17,12 +16,12 @@ public interface HttpFetcher {
List<String> getCookies(); List<String> getCookies();
void clearCookies(); void clearCookies();
FetchResult probeDomain(EdgeUrl url); HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url);
HttpFetchResult fetchContent(EdgeUrl url, HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder, WarcRecorder recorder,
ContentTags tags, ContentTags tags,
ProbeType probeType) throws RateLimitException; ProbeType probeType) throws HttpFetcherImpl.RateLimitException;
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);

View File

@ -1,22 +1,23 @@
package nu.marginalia.crawl.retreival.fetcher; package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject; import com.google.inject.Inject;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser; import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.UserAgent; import nu.marginalia.UserAgent;
import nu.marginalia.crawl.retreival.Cookies; import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.retreival.RateLimitException; import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult; import nu.marginalia.crawl.fetcher.socket.NoSecuritySSL;
import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory; import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.logic.ContentTypeProber;
import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL; import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.logic.SoftIfModifiedSinceProber;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic; import nu.marginalia.model.body.ContentTypeLogic;
import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import okhttp3.ConnectionPool; import okhttp3.ConnectionPool;
import okhttp3.Dispatcher; import okhttp3.Dispatcher;
import okhttp3.OkHttpClient; import okhttp3.OkHttpClient;
@ -25,6 +26,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import javax.net.ssl.X509TrustManager; import javax.net.ssl.X509TrustManager;
import java.time.Duration;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
@ -114,7 +116,7 @@ public class HttpFetcherImpl implements HttpFetcher {
*/ */
@Override @Override
@SneakyThrows @SneakyThrows
public FetchResult probeDomain(EdgeUrl url) { public ProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString) var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString()) .url(url.toString())
.build(); .build();
@ -125,9 +127,9 @@ public class HttpFetcherImpl implements HttpFetcher {
EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString()); EdgeUrl requestUrl = new EdgeUrl(rsp.request().url().toString());
if (!Objects.equals(requestUrl.domain, url.domain)) { if (!Objects.equals(requestUrl.domain, url.domain)) {
return new FetchResult(FetchResultState.REDIRECT, requestUrl); return new ProbeResultRedirect(url.domain);
} }
return new FetchResult(FetchResultState.OK, requestUrl); return new ProbeResultOk(requestUrl);
} }
catch (Exception ex) { catch (Exception ex) {
@ -136,7 +138,7 @@ public class HttpFetcherImpl implements HttpFetcher {
} }
logger.info("Error during fetching {}", ex.getMessage()); logger.info("Error during fetching {}", ex.getMessage());
return new FetchResult(FetchResultState.ERROR, url); return new ProbeResultError(CrawlerDomainStatus.ERROR, ex.getMessage());
} }
} }
@ -196,8 +198,7 @@ public class HttpFetcherImpl implements HttpFetcher {
if (result instanceof HttpFetchResult.ResultOk ok) { if (result instanceof HttpFetchResult.ResultOk ok) {
if (ok.statusCode() == 429) { if (ok.statusCode() == 429) {
String retryAfter = Objects.requireNonNullElse(ok.header("Retry-After"), "1000"); throw new RateLimitException(Objects.requireNonNullElse(ok.header("Retry-After"), "1"));
throw new RateLimitException(retryAfter);
} }
if (ok.statusCode() == 304) { if (ok.statusCode() == 304) {
return new HttpFetchResult.Result304Raw(); return new HttpFetchResult.Result304Raw();
@ -249,5 +250,44 @@ public class HttpFetcherImpl implements HttpFetcher {
} }
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retrieval of the probed url was successful, return the url as it was fetched
 * (which may be different from the url we probed, if we attempted another URL scheme).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
/** Exception thrown when the server signals the rate limit is exceeded */
public static class RateLimitException extends Exception {
private final String retryAfter;
public RateLimitException(String retryAfterHeader) {
this.retryAfter = retryAfterHeader;
}
@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
public Duration retryAfter() {
try {
return Duration.ofSeconds(Integer.parseInt(retryAfter));
}
catch (NumberFormatException ex) {
return Duration.ofSeconds(1);
}
}
}
} }
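
Since probe results now live on HttpFetcherImpl as a sealed ProbeResult hierarchy, call sites can switch over them exhaustively (WarcRecorder.writeWarcinfoHeader below does this with a statement switch). A rough sketch of a pattern-matching consumer; the describe method is hypothetical and not part of the commit:

// Exhaustive switch over the sealed ProbeResult: the compiler checks that all three
// permitted records are covered, so no default branch is needed.
static String describe(HttpFetcherImpl.ProbeResult result) {
    return switch (result) {
        case HttpFetcherImpl.ProbeResultOk ok -> "ok: " + ok.probedUrl();
        case HttpFetcherImpl.ProbeResultRedirect redirect -> "redirects to " + redirect.domain();
        case HttpFetcherImpl.ProbeResultError error -> error.status() + ": " + error.desc();
    };
}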

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher; package nu.marginalia.crawl.fetcher;
import crawlercommons.sitemaps.*; import crawlercommons.sitemaps.*;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket; package nu.marginalia.crawl.fetcher.socket;
import javax.net.SocketFactory; import javax.net.SocketFactory;
import java.io.IOException; import java.io.IOException;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket; package nu.marginalia.crawl.fetcher.socket;
import okhttp3.Interceptor; import okhttp3.Interceptor;
import okhttp3.Response; import okhttp3.Response;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.socket; package nu.marginalia.crawl.fetcher.socket;
import lombok.SneakyThrows; import lombok.SneakyThrows;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.warc; package nu.marginalia.crawl.fetcher.warc;
import org.netpreserve.jwarc.WarcDigest; import org.netpreserve.jwarc.WarcDigest;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher.warc; package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Headers; import okhttp3.Headers;
import okhttp3.Response; import okhttp3.Response;

View File

@ -1,7 +1,6 @@
package nu.marginalia.crawl.retreival.fetcher.warc; package nu.marginalia.crawl.fetcher.warc;
import okhttp3.Protocol; import okhttp3.Protocol;
import okhttp3.Request;
import okhttp3.Response; import okhttp3.Response;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -73,7 +72,7 @@ public class WarcProtocolReconstructor {
String headerString = getHeadersAsString(headersAsString); String headerString = getHeadersAsString(headersAsString);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
} }
static String getResponseHeader(Response response, long size) { static String getResponseHeader(Response response, long size) {
@ -84,7 +83,7 @@ public class WarcProtocolReconstructor {
String headerString = getHeadersAsString(response, size); String headerString = getHeadersAsString(response, size);
return STR."HTTP/\{version} \{statusCode} \{statusMessage}\r\n\{headerString}\r\n\r\n"; return "HTTP/" + version + " " + statusCode + " " + statusMessage + "\r\n" + headerString + "\r\n\r\n";
} }
private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries( private static final Map<Integer, String> STATUS_CODE_MAP = Map.ofEntries(

View File

@ -1,13 +1,14 @@
package nu.marginalia.crawl.retreival.fetcher.warc; package nu.marginalia.crawl.fetcher.warc;
import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
import okhttp3.OkHttpClient; import okhttp3.OkHttpClient;
import okhttp3.Request; import okhttp3.Request;
import org.jetbrains.annotations.Nullable;
import org.netpreserve.jwarc.*; import org.netpreserve.jwarc.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -183,7 +184,7 @@ public class WarcRecorder implements AutoCloseable {
writer.write(item); writer.write(item);
} }
private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags contentTags) { private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) {
try { try {
WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder();
WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder();
@ -192,24 +193,42 @@ public class WarcRecorder implements AutoCloseable {
if (documentBody == null) { if (documentBody == null) {
bytes = new byte[0]; bytes = new byte[0];
} } else {
else {
bytes = documentBody.getBytes(); bytes = documentBody.getBytes();
} }
StringJoiner fakeHeadersBuilder = new StringJoiner("\n"); // Create a synthesis of custom headers and the original headers
// to create a new set of headers that will be written to the WARC file.
fakeHeadersBuilder.add(STR."Content-Type: \{contentType}"); StringJoiner syntheticHeadersBuilder = new StringJoiner("\n");
fakeHeadersBuilder.add(STR."Content-Length: \{bytes.length}");
syntheticHeadersBuilder.add("Content-Type: " + contentType);
syntheticHeadersBuilder.add("Content-Length: " + bytes.length);
if (contentTags.etag() != null) { if (contentTags.etag() != null) {
fakeHeadersBuilder.add(STR."ETag: \{contentTags.etag()}"); syntheticHeadersBuilder.add("ETag: " + contentTags.etag());
} }
if (contentTags.lastMod() != null) { if (contentTags.lastMod() != null) {
fakeHeadersBuilder.add(STR."Last-Modified: \{contentTags.lastMod()}"); syntheticHeadersBuilder.add("Last-Modified: " + contentTags.lastMod());
}
// Grab the headers from the original response and add them to the synthetic set, skipping any
// Content-Type, Content-Length, ETag, or Last-Modified values that were already provided above
for (String headerLine : Objects.requireNonNullElse(headers, "").split("\n")) {
if (headerLine.isBlank()) continue;
var lowerCase = headerLine.toLowerCase();
if (lowerCase.startsWith("content-type:")) continue;
if (lowerCase.startsWith("content-length:")) continue;
if (contentTags.etag() != null && lowerCase.startsWith("etag:")) continue;
if (contentTags.lastMod() != null && lowerCase.startsWith("last-modified:")) continue;
syntheticHeadersBuilder.add(headerLine);
} }
byte[] header = WarcProtocolReconstructor byte[] header = WarcProtocolReconstructor
.getResponseHeader(fakeHeadersBuilder.toString(), statusCode) .getResponseHeader(syntheticHeadersBuilder.toString(), statusCode)
.getBytes(StandardCharsets.UTF_8); .getBytes(StandardCharsets.UTF_8);
ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length); ResponseDataBuffer responseDataBuffer = new ResponseDataBuffer(bytes.length + header.length);
responseDataBuffer.put(header); responseDataBuffer.put(header);
@ -244,25 +263,25 @@ public class WarcRecorder implements AutoCloseable {
* an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this
* scenario we want to record the data as it was in the previous crawl, but not re-fetch it. * scenario we want to record the data as it was in the previous crawl, but not re-fetch it.
*/ */
public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, ContentTags ctags) { public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) {
saveOldResponse(url, contentType, statusCode, documentBody, ctags); saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags);
} }
public void writeWarcinfoHeader(String ip, EdgeDomain domain, DomainProber.ProbeResult result) throws IOException { public void writeWarcinfoHeader(String ip, EdgeDomain domain, HttpFetcherImpl.ProbeResult result) throws IOException {
Map<String, List<String>> fields = new HashMap<>(); Map<String, List<String>> fields = new HashMap<>();
fields.put("ip", List.of(ip)); fields.put("ip", List.of(ip));
fields.put("software", List.of(STR."search.marginalia.nu/\{warcRecorderVersion}")); fields.put("software", List.of("search.marginalia.nu/" + warcRecorderVersion));
fields.put("domain", List.of(domain.toString())); fields.put("domain", List.of(domain.toString()));
switch (result) { switch (result) {
case DomainProber.ProbeResultRedirect redirectDomain: case HttpFetcherImpl.ProbeResultRedirect redirectDomain:
fields.put("X-WARC-Probe-Status", List.of(STR."REDIRECT;\{redirectDomain.domain()}")); fields.put("X-WARC-Probe-Status", List.of("REDIRECT;" + redirectDomain.domain()));
break; break;
case DomainProber.ProbeResultError error: case HttpFetcherImpl.ProbeResultError error:
fields.put("X-WARC-Probe-Status", List.of(STR."\{error.status().toString()};\{error.desc()}")); fields.put("X-WARC-Probe-Status", List.of(error.status().toString() + ";" + error.desc()));
break; break;
case DomainProber.ProbeResultOk ok: case HttpFetcherImpl.ProbeResultOk ok:
fields.put("X-WARC-Probe-Status", List.of("OK")); fields.put("X-WARC-Probe-Status", List.of("OK"));
break; break;
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival.fetcher; package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.ContentTypeLogic; import nu.marginalia.model.body.ContentTypeLogic;
@ -7,7 +7,7 @@ import okhttp3.Request;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.net.SocketTimeoutException; import java.io.InterruptedIOException;
import java.util.Objects; import java.util.Objects;
public class ContentTypeProber { public class ContentTypeProber {
@ -68,7 +68,7 @@ public class ContentTypeProber {
return new ContentTypeProbeResult.Ok(ret); return new ContentTypeProbeResult.Ok(ret);
} catch (SocketTimeoutException ex) { } catch (InterruptedIOException ex) {
return new ContentTypeProbeResult.Timeout(ex); return new ContentTypeProbeResult.Timeout(ex);
} catch (Exception ex) { } catch (Exception ex) {
logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage()); logger.error("Error during fetching {}[{}]", ex.getClass().getSimpleName(), ex.getMessage());

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;

View File

@ -1,4 +1,4 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.logic;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -26,6 +26,20 @@ public class LinkFilterSelector {
if (isDiscourse(head)) { if (isDiscourse(head)) {
return url -> url.path.startsWith("/t/") || url.path.contains("/latest"); return url -> url.path.startsWith("/t/") || url.path.contains("/latest");
} }
if (isMediawiki(head)) {
return url -> {
if (url.path.endsWith(".php")) {
return false;
}
if (url.path.contains("Special:")) {
return false;
}
if (url.path.contains("Talk:")) {
return false;
}
return true;
};
}
return LinkFilterSelector::defaultFilter; return LinkFilterSelector::defaultFilter;
} }

View File

@ -1,6 +1,7 @@
package nu.marginalia.crawl.retreival.fetcher; package nu.marginalia.crawl.logic;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient; import okhttp3.OkHttpClient;
import okhttp3.Request; import okhttp3.Request;

View File

@ -1,6 +1,9 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.retreival;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import java.time.Duration;
import static java.lang.Math.max; import static java.lang.Math.max;
import static java.lang.Math.min; import static java.lang.Math.min;
@ -22,12 +25,21 @@ public class CrawlDelayTimer {
/** Call when we've gotten an HTTP 429 response. This will wait a moment, and then /** Call when we've gotten an HTTP 429 response. This will wait a moment, and then
* set a flag that slows down the main crawl delay as well. */ * set a flag that slows down the main crawl delay as well. */
public void waitRetryDelay(RateLimitException ex) throws InterruptedException { public void waitRetryDelay(HttpFetcherImpl.RateLimitException ex) throws InterruptedException {
slowDown = true; slowDown = true;
int delay = ex.retryAfter(); Duration delay = ex.retryAfter();
Thread.sleep(Math.clamp(delay, 100, 5000)); if (delay.compareTo(Duration.ofSeconds(1)) < 0) {
// If the server asks us to retry in less than a second, wait at least one second
delay = Duration.ofSeconds(1);
}
else if (delay.compareTo(Duration.ofSeconds(5)) > 0) {
// If the server asks us to retry in more than five seconds, cap the wait at five seconds
delay = Duration.ofSeconds(5);
}
Thread.sleep(delay.toMillis());
} }
@SneakyThrows @SneakyThrows

View File

@ -1,91 +0,0 @@
package nu.marginalia.crawl.retreival;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
import java.time.LocalDateTime;
import java.util.Objects;
public class CrawledDocumentFactory {
public static CrawledDocument createHardErrorRsp(EdgeUrl url, Exception why) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc(why.getClass().getSimpleName() + ": " + why.getMessage())
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createUnknownHostError(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.ERROR.toString())
.crawlerStatusDesc("Unknown Host")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createTimeoutErrorRsp(EdgeUrl url) {
return CrawledDocument.builder()
.crawlerStatus("Timeout")
.timestamp(LocalDateTime.now().toString())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createErrorResponse(EdgeUrl url, String contentType, int statusCode, CrawlerDocumentStatus status, String why) {
return CrawledDocument.builder()
.crawlerStatus(status.toString())
.crawlerStatusDesc(why)
.headers("")
.contentType(contentType)
.timestamp(LocalDateTime.now().toString())
.httpStatus(statusCode)
.url(url.toString())
.build();
}
public static CrawledDocument createRedirectResponse(EdgeUrl url, HttpFetchResult.ResultOk rsp, EdgeUrl responseUrl) {
return CrawledDocument.builder()
.crawlerStatus(CrawlerDocumentStatus.REDIRECT.name())
.redirectUrl(responseUrl.toString())
.headers(rsp.headers().toString())
.contentType(Objects.requireNonNullElse(rsp.headers().get("Content-Type"), ""))
.timestamp(LocalDateTime.now().toString())
.httpStatus(rsp.statusCode())
.url(url.toString())
.build();
}
public static CrawledDocument createRobotsError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(-1)
.crawlerStatus(CrawlerDocumentStatus.ROBOTS_TXT.name())
.build();
}
public static CrawledDocument createRetryError(EdgeUrl url) {
return CrawledDocument.builder()
.url(url.toString())
.timestamp(LocalDateTime.now().toString())
.httpStatus(429)
.crawlerStatus(CrawlerDocumentStatus.ERROR.name())
.build();
}
}

View File

@ -3,9 +3,11 @@ package nu.marginalia.crawl.retreival;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentType; import nu.marginalia.contenttype.ContentType;
import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.logic.LinkFilterSelector;
import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor; import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
import nu.marginalia.crawl.retreival.revisit.DocumentWithReference; import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher; import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
@ -14,8 +16,6 @@ import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.model.crawlspec.CrawlSpecRecord;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -25,7 +25,6 @@ import java.io.IOException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
@ -84,11 +83,11 @@ public class CrawlerRetreiver implements AutoCloseable {
return crawlFrontier; return crawlFrontier;
} }
public int fetch() { public int crawlDomain() {
return fetch(new DomainLinks(), new CrawlDataReference()); return crawlDomain(new DomainLinks(), new CrawlDataReference());
} }
public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
try { try {
return crawlDomain(oldCrawlData, domainLinks); return crawlDomain(oldCrawlData, domainLinks);
} }
@ -98,28 +97,11 @@ public class CrawlerRetreiver implements AutoCloseable {
} }
} }
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct an URL to the root of the domain, we don't know the schema yet so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
return probeResult;
}
private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException { private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
String ip = findIp(domain); String ip = findIp(domain);
EdgeUrl rootUrl; EdgeUrl rootUrl;
if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl(); if (probeRootUrl(ip) instanceof HttpFetcherImpl.ProbeResultOk ok) rootUrl = ok.probedUrl();
else return 1; else return 1;
// Sleep after the initial probe, we don't have access to the robots.txt yet // Sleep after the initial probe, we don't have access to the robots.txt yet
@ -130,12 +112,13 @@ public class CrawlerRetreiver implements AutoCloseable {
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
delayTimer.waitFetchDelay(0); // initial delay after robots.txt delayTimer.waitFetchDelay(0); // initial delay after robots.txt
sniffRootDocument(rootUrl, delayTimer); sniffRootDocument(rootUrl, delayTimer);
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); int fetchedCount = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
if (recrawled > 0) { if (fetchedCount > 0) {
// If we have reference data, we will always grow the crawl depth a bit // If we have reference data, we will always grow the crawl depth a bit
crawlFrontier.increaseDepth(1.5, 2500); crawlFrontier.increaseDepth(1.5, 2500);
} }
@ -146,15 +129,6 @@ public class CrawlerRetreiver implements AutoCloseable {
// Add links from the sitemap to the crawl frontier // Add links from the sitemap to the crawl frontier
sitemapFetcher.downloadSitemaps(robotsRules, rootUrl); sitemapFetcher.downloadSitemaps(robotsRules, rootUrl);
CrawledDomain ret = new CrawledDomain(domain,
null,
CrawlerDomainStatus.OK.name(),
null,
ip,
new ArrayList<>(),
null);
int fetchedCount = recrawled;
while (!crawlFrontier.isEmpty() while (!crawlFrontier.isEmpty()
&& !crawlFrontier.isCrawlDepthReached() && !crawlFrontier.isCrawlDepthReached()
@ -186,7 +160,6 @@ public class CrawlerRetreiver implements AutoCloseable {
if (!crawlFrontier.addVisited(top)) if (!crawlFrontier.addVisited(top))
continue; continue;
try { try {
if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) { if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
fetchedCount++; fetchedCount++;
@ -198,15 +171,28 @@ public class CrawlerRetreiver implements AutoCloseable {
} }
} }
ret.cookies = fetcher.getCookies();
return fetchedCount; return fetchedCount;
} }
public void syncAbortedRun(Path warcFile) {
var resync = new CrawlerWarcResynchronizer(crawlFrontier, warcRecorder);
resync.run(warcFile);
}
private HttpFetcherImpl.ProbeResult probeRootUrl(String ip) throws IOException {
// Construct a URL to the root of the domain; we don't know the scheme yet, so we'll
// start with http and then try https if that fails
var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
final HttpFetcherImpl.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
return probeResult;
}
private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) { private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
try { try {
logger.debug("Configuring link filter");
var url = rootUrl.withPathAndParam("/", null); var url = rootUrl.withPathAndParam("/", null);
HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()); HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
@ -291,7 +277,7 @@ public class CrawlerRetreiver implements AutoCloseable {
else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) {
var doc = reference.doc(); var doc = reference.doc();
warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, contentTags); warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags);
fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url,
new ContentType(doc.contentType, "UTF-8"), new ContentType(doc.contentType, "UTF-8"),
@ -326,7 +312,7 @@ public class CrawlerRetreiver implements AutoCloseable {
try { try {
return fetcher.fetchContent(url, warcRecorder, contentTags, probeType); return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
} }
catch (RateLimitException ex) { catch (HttpFetcherImpl.RateLimitException ex) {
timer.waitRetryDelay(ex); timer.waitRetryDelay(ex);
} }
catch (Exception ex) { catch (Exception ex) {

View File

@ -1,6 +1,6 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
@ -38,7 +38,7 @@ public class CrawlerWarcResynchronizer {
accept(item); accept(item);
} }
} catch (Exception e) { } catch (Exception e) {
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}"); logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
} }
// Second pass, copy records to the new warc file // Second pass, copy records to the new warc file
@ -47,7 +47,7 @@ public class CrawlerWarcResynchronizer {
recorder.resync(item); recorder.resync(item);
} }
} catch (Exception e) { } catch (Exception e) {
logger.info(STR."(Expected) Failed read full warc file \{tempFile}: \{e.getClass().getSimpleName()} \{e.getMessage()}"); logger.info("(Expected) Failed read full warc file " + tempFile + ": " + e.getClass().getSimpleName() + " " + e.getMessage());
} }
} }
@ -63,7 +63,7 @@ public class CrawlerWarcResynchronizer {
} }
catch (Exception ex) { catch (Exception ex) {
logger.info(STR."Failed to process warc record \{item}", ex); logger.info("Failed to process warc record " + item, ex);
} }
} }
@ -78,7 +78,8 @@ public class CrawlerWarcResynchronizer {
} }
private void request(WarcRequest request) { private void request(WarcRequest request) {
EdgeUrl.parse(request.target()).ifPresent(crawlFrontier::addVisited); var url = new EdgeUrl(request.targetURI());
crawlFrontier.addVisited(url);
} }
private void response(WarcResponse rsp) { private void response(WarcResponse rsp) {
@ -97,7 +98,7 @@ public class CrawlerWarcResynchronizer {
}); });
} }
catch (Exception e) { catch (Exception e) {
logger.info(STR."Failed to parse response body for \{url}", e); logger.info("Failed to parse response body for " + url, e);
} }
} }

View File

@ -9,9 +9,14 @@ import nu.marginalia.model.EdgeUrl;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.util.*; import java.util.ArrayDeque;
import java.util.Collection;
import java.util.Objects;
import java.util.function.Predicate; import java.util.function.Predicate;
/** Encapsulates the crawl frontier for a single domain,
* that is information about known and visited URLs
*/
public class DomainCrawlFrontier { public class DomainCrawlFrontier {
private static final LinkParser linkParser = new LinkParser(); private static final LinkParser linkParser = new LinkParser();

View File

@ -2,8 +2,8 @@ package nu.marginalia.crawl.retreival;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import nu.marginalia.crawl.retreival.fetcher.FetchResultState; import nu.marginalia.crawl.fetcher.HttpFetcher;
import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.ip_blocklist.IpBlockList; import nu.marginalia.ip_blocklist.IpBlockList;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
@ -34,43 +34,18 @@ public class DomainProber {
* doesn't immediately redirect to another domain (which should be crawled separately, not under the name * doesn't immediately redirect to another domain (which should be crawled separately, not under the name
* of this domain). * of this domain).
*/ */
public ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) { public HttpFetcherImpl.ProbeResult probeDomain(HttpFetcher fetcher, String domain, @Nullable EdgeUrl firstUrlInQueue) {
if (firstUrlInQueue == null) { if (firstUrlInQueue == null) {
logger.warn("No valid URLs for domain {}", domain); logger.warn("No valid URLs for domain {}", domain);
return new ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs"); return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.ERROR, "No known URLs");
} }
if (!domainBlacklist.test(firstUrlInQueue.domain)) if (!domainBlacklist.test(firstUrlInQueue.domain))
return new ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed"); return new HttpFetcherImpl.ProbeResultError(CrawlerDomainStatus.BLOCKED, "IP not allowed");
var fetchResult = fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null)); return fetcher.probeDomain(firstUrlInQueue.withPathAndParam("/", null));
if (fetchResult.ok())
return new ProbeResultOk(fetchResult.url);
if (fetchResult.state == FetchResultState.REDIRECT)
return new ProbeResultRedirect(fetchResult.domain);
return new ProbeResultError(CrawlerDomainStatus.ERROR, "Bad status");
} }
public sealed interface ProbeResult permits ProbeResultError, ProbeResultRedirect, ProbeResultOk {}
/** The probing failed for one reason or another
* @param status Machine readable status
* @param desc Human-readable description of the error
*/
public record ProbeResultError(CrawlerDomainStatus status, String desc) implements ProbeResult {}
/** This domain redirects to another domain */
public record ProbeResultRedirect(EdgeDomain domain) implements ProbeResult {}
/** If the retrieval of the probed url was successful, return the url as it was fetched
* (which may be different from the url we probed, if we attempted another URL schema).
*
* @param probedUrl The url we successfully probed
*/
public record ProbeResultOk(EdgeUrl probedUrl) implements ProbeResult {}
} }

View File

@ -1,21 +0,0 @@
package nu.marginalia.crawl.retreival;
public class RateLimitException extends Exception {
private final String retryAfter;
public RateLimitException(String retryAfter) {
this.retryAfter = retryAfter;
}
@Override
public StackTraceElement[] getStackTrace() { return new StackTraceElement[0]; }
public int retryAfter() {
try {
return Integer.parseInt(retryAfter);
}
catch (NumberFormatException ex) {
return 1000;
}
}
}

View File

@ -1,24 +0,0 @@
package nu.marginalia.crawl.retreival.fetcher;
import lombok.AllArgsConstructor;
import lombok.ToString;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
@AllArgsConstructor
@ToString
public class FetchResult {
public final FetchResultState state;
public final EdgeUrl url;
public final EdgeDomain domain;
public FetchResult(FetchResultState state, EdgeUrl url) {
this.state = state;
this.url = url;
this.domain = url.domain;
}
public boolean ok() {
return state == FetchResultState.OK;
}
}

View File

@ -1,7 +0,0 @@
package nu.marginalia.crawl.retreival.fetcher;
public enum FetchResultState {
OK,
REDIRECT,
ERROR
}

View File

@ -2,12 +2,12 @@ package nu.marginalia.crawl.retreival.revisit;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.CrawlDelayTimer; import nu.marginalia.crawl.retreival.CrawlDelayTimer;
import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.CrawlerRetreiver;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier; import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;
import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.crawldata.CrawledDocument;
@ -125,6 +125,7 @@ public class CrawlerRevisitor {
doc.contentType, doc.contentType,
doc.httpStatus, doc.httpStatus,
doc.documentBody, doc.documentBody,
doc.headers,
new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe) new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe)
); );

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival.revisit; package nu.marginalia.crawl.retreival.revisit;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDataReference;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.DocumentBodyExtractor;
import nu.marginalia.model.body.DocumentBodyResult; import nu.marginalia.model.body.DocumentBodyResult;
import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.body.HttpFetchResult;

View File

@ -1,8 +1,8 @@
package nu.marginalia.crawl.retreival.sitemap; package nu.marginalia.crawl.retreival.sitemap;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import nu.marginalia.crawl.fetcher.SitemapRetriever;
import nu.marginalia.crawl.retreival.DomainCrawlFrontier; import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
import nu.marginalia.crawl.retreival.fetcher.SitemapRetriever;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;

View File

@ -144,7 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
nextRecord.httpStatus, nextRecord.httpStatus,
status.toString(), status.toString(),
"", "",
"", nextRecord.headers,
bodyString, bodyString,
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
nextRecord.url, nextRecord.url,

View File

@ -23,7 +23,6 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatusDesc; public String crawlerStatusDesc;
@Nullable @Nullable
@Deprecated // use getETag() or getLastModified() instead
public String headers; public String headers;
public String documentBody; public String documentBody;

View File

@ -29,7 +29,11 @@ public class CrawledDocumentParquetRecord {
public String contentType; public String contentType;
public byte[] body; public byte[] body;
public String headers;
@Deprecated // will be replaced with the full headers field in the future
public String etagHeader; public String etagHeader;
@Deprecated // will be replaced with the full headers field in the future
public String lastModifiedHeader; public String lastModifiedHeader;
public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() { public static Hydrator<CrawledDocumentParquetRecord, CrawledDocumentParquetRecord> newHydrator() {
@ -51,7 +55,8 @@ public class CrawledDocumentParquetRecord {
Types.required(BINARY).as(stringType()).named("contentType"), Types.required(BINARY).as(stringType()).named("contentType"),
Types.required(BINARY).named("body"), Types.required(BINARY).named("body"),
Types.optional(BINARY).as(stringType()).named("etagHeader"), Types.optional(BINARY).as(stringType()).named("etagHeader"),
Types.optional(BINARY).as(stringType()).named("lastModifiedHeader") Types.optional(BINARY).as(stringType()).named("lastModifiedHeader"),
Types.optional(BINARY).as(stringType()).named("headers")
); );
@ -67,6 +72,7 @@ public class CrawledDocumentParquetRecord {
case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value); case "epochSeconds" -> timestamp = Instant.ofEpochSecond((Long) value);
case "etagHeader" -> etagHeader = (String) value; case "etagHeader" -> etagHeader = (String) value;
case "lastModifiedHeader" -> lastModifiedHeader = (String) value; case "lastModifiedHeader" -> lastModifiedHeader = (String) value;
case "headers" -> headers = (String) value;
default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"');
} }
@ -82,6 +88,9 @@ public class CrawledDocumentParquetRecord {
valueWriter.write("cookies", cookies); valueWriter.write("cookies", cookies);
valueWriter.write("contentType", contentType); valueWriter.write("contentType", contentType);
valueWriter.write("body", body); valueWriter.write("body", body);
if (headers != null) {
valueWriter.write("headers", headers);
}
if (etagHeader != null) { if (etagHeader != null) {
valueWriter.write("etagHeader", etagHeader); valueWriter.write("etagHeader", etagHeader);
} }

View File

@ -16,6 +16,7 @@ import java.nio.file.Path;
import java.time.Instant; import java.time.Instant;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.StringJoiner;
public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
private final ParquetWriter<CrawledDocumentParquetRecord> writer; private final ParquetWriter<CrawledDocumentParquetRecord> writer;
@ -150,6 +151,14 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
contentType = ""; contentType = "";
} }
String headersStr = null;
StringJoiner headersStrBuilder = new StringJoiner("\n");
for (var header : headers) {
headersStrBuilder.add(header.getFirst() + ": " + header.getSecond());
}
headersStr = headersStrBuilder.toString();
write(new CrawledDocumentParquetRecord( write(new CrawledDocumentParquetRecord(
domain, domain,
response.target(), response.target(),
@ -159,6 +168,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
response.date(), response.date(),
contentType, contentType,
bodyBytes, bodyBytes,
headersStr,
headers.get("ETag"), headers.get("ETag"),
headers.get("Last-Modified")) headers.get("Last-Modified"))
); );
@ -179,6 +189,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
"x-marginalia/advisory;state=redirect", "x-marginalia/advisory;state=redirect",
new byte[0], new byte[0],
null, null,
null,
null null
); );
} }
@ -192,6 +203,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
"x-marginalia/advisory;state=error", "x-marginalia/advisory;state=error",
errorStatus.getBytes(), errorStatus.getBytes(),
null, null,
null,
null null
); );
} }
@ -206,6 +218,7 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
errorStatus, errorStatus,
new byte[0], new byte[0],
null, null,
null,
null null
); );
} }
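
For the "store full headers" change, the writer above serializes the response headers into the new parquet column as one header per line ("Name: value", joined with "\n"). A consumer could split that string back into pairs along these lines; this helper is illustrative and not part of the project:

import java.util.LinkedHashMap;
import java.util.Map;

class StoredHeadersExample {
    // Parse the "Name: value" lines produced by CrawledDocumentParquetRecordFileWriter
    // back into an ordered map. Duplicate header names overwrite each other here;
    // a real consumer might prefer a multimap.
    static Map<String, String> parseStoredHeaders(String headersStr) {
        Map<String, String> result = new LinkedHashMap<>();
        if (headersStr == null || headersStr.isBlank()) return result;

        for (String line : headersStr.split("\n")) {
            int idx = line.indexOf(':');
            if (idx <= 0) continue; // skip blank or malformed lines
            result.put(line.substring(0, idx).trim(), line.substring(idx + 1).trim());
        }
        return result;
    }
}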

View File

@ -0,0 +1,124 @@
package nu.marginalia.crawl.logic;
import com.sun.net.httpserver.HttpServer;
import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.Random;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
class ContentTypeProberTest {
private static int port;
private static HttpServer server;
private static OkHttpClient client;
static EdgeUrl htmlEndpoint;
static EdgeUrl htmlRedirEndpoint;
static EdgeUrl binaryEndpoint;
static EdgeUrl timeoutEndpoint;
@BeforeEach
void setUp() throws IOException {
Random r = new Random();
port = r.nextInt(10000) + 8000;
server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10);
server.createContext("/html", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "text/html");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.createContext("/redir", exchange -> {
exchange.getResponseHeaders().add("Location", "/html");
exchange.sendResponseHeaders(301, -1);
exchange.close();
});
server.createContext("/bin", exchange -> {
exchange.getResponseHeaders().add("Content-Type", "application/binary");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.createContext("/timeout", exchange -> {
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
exchange.getResponseHeaders().add("Content-Type", "application/binary");
exchange.sendResponseHeaders(200, -1);
exchange.close();
});
server.start();
htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get();
binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get();
timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get();
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get();
client = new OkHttpClient.Builder()
.readTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.connectTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.callTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.writeTimeout(1, java.util.concurrent.TimeUnit.SECONDS)
.build();
}
@AfterEach
void tearDown() {
server.stop(0);
client.dispatcher().executorService().shutdown();
client.connectionPool().evictAll();
}
@Test
void probeContentTypeOk() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(htmlEndpoint);
System.out.println(result);
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
}
@Test
void probeContentTypeRedir() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(htmlRedirEndpoint);
System.out.println(result);
assertEquals(result, new ContentTypeProber.ContentTypeProbeResult.Ok(htmlEndpoint));
}
@Test
void probeContentTypeBad() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(binaryEndpoint);
System.out.println(result);
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.BadContentType.class, result);
}
@Test
void probeContentTypeTimeout() {
ContentTypeProber.ContentTypeProbeResult result = new ContentTypeProber("test", client)
.probeContentType(timeoutEndpoint);
System.out.println(result);
assertInstanceOf(ContentTypeProber.ContentTypeProbeResult.Timeout.class, result);
}
}

View File

@ -1,7 +1,7 @@
package nu.marginalia.crawl.retreival; package nu.marginalia.crawl.retreival;
import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import okhttp3.OkHttpClient; import okhttp3.OkHttpClient;
@ -21,7 +21,8 @@ import java.security.NoSuchAlgorithmException;
import java.util.List; import java.util.List;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
class CrawlerWarcResynchronizerTest { class CrawlerWarcResynchronizerTest {
Path fileName; Path fileName;

View File

@@ -1,7 +1,8 @@
 package nu.marginalia.crawl.retreival.fetcher;
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.BadContentType;
-import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeResult.Ok;
+import nu.marginalia.crawl.logic.ContentTypeProber;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.BadContentType;
+import nu.marginalia.crawl.logic.ContentTypeProber.ContentTypeProbeResult.Ok;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
@@ -13,7 +14,7 @@ import java.net.URISyntaxException;
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
 class ContentTypeProberTest {

View File

@@ -1,8 +1,9 @@
 package nu.marginalia.crawl.retreival.fetcher;
 import nu.marginalia.UserAgent;
-import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
@@ -80,6 +81,7 @@ class WarcRecorderTest {
 "text/html",
 200,
 "<?doctype html><html><body>test</body></html>",
+null,
 ContentTags.empty());
 }
@@ -103,7 +105,7 @@ class WarcRecorderTest {
 "text/html",
 200,
 null,
-ContentTags.empty());
+null, ContentTags.empty());
 }
 }
@@ -115,7 +117,7 @@ class WarcRecorderTest {
 "text/html",
 200,
 "<?doctype html><html><body>test</body></html>",
-ContentTags.empty());
+null, ContentTags.empty());
 }
 try (var reader = new WarcReader(fileNameWarc)) {
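The null threaded into these calls is a new argument between the document body and the ContentTags; the IntegrationTest hunk further down passes an empty string in the same position, so a string of stored response headers is assumed here. A hedged sketch of the updated call, given a WarcRecorder named recorder (the URL is a placeholder, the body string is the one visible above):

recorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
        "text/html",
        200,
        "<?doctype html><html><body>test</body></html>",
        null,                 // new argument: captured response headers, null when none were stored
        ContentTags.empty());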

View File

@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.revisit;
+import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.CrawlDataReference;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.model.crawldata.CrawledDocument;
 import org.junit.jupiter.api.Test;

View File

@@ -1,11 +1,10 @@
 package nu.marginalia.crawling;
 import lombok.SneakyThrows;
-import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.ContentTypeLogic;
 import nu.marginalia.model.body.DocumentBodyExtractor;
@@ -33,7 +32,7 @@ class HttpFetcherTest {
 }
 @Test
-void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
+void fetchUTF8() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
 var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 try (var recorder = new WarcRecorder()) {
 var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
@@ -44,7 +43,7 @@ class HttpFetcherTest {
 }
 @Test
-void fetchText() throws URISyntaxException, RateLimitException, IOException {
+void fetchText() throws URISyntaxException, HttpFetcherImpl.RateLimitException, IOException {
 var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 try (var recorder = new WarcRecorder()) {
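RateLimitException has moved from a top-level type in nu.marginalia.crawl.retreival to a nested type on HttpFetcherImpl, so call sites now use the qualified name. A minimal sketch of a caller, assuming the same fetchContent signature as in the test above (the method name fetchExample and the retry comment are illustrative):

static void fetchExample() throws Exception {
    var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
    try (var recorder = new WarcRecorder()) {
        var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"),
                recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
        // inspect result here, e.g. via DocumentBodyExtractor as the test imports suggest
    }
    catch (HttpFetcherImpl.RateLimitException ex) {
        // the server asked us to slow down; back off before retrying
    }
}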

View File

@@ -2,10 +2,13 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import lombok.SneakyThrows;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.SitemapRetriever;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.*;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.body.HttpFetchResult;
@@ -68,7 +71,7 @@ public class CrawlerMockFetcherTest {
 void crawl(CrawlSpecRecord spec) throws IOException {
 try (var recorder = new WarcRecorder()) {
 new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
-.fetch();
+.crawlDomain();
 }
 }
@@ -115,9 +118,9 @@ public class CrawlerMockFetcherTest {
 public void clearCookies() {}
 @Override
-public FetchResult probeDomain(EdgeUrl url) {
+public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
 logger.info("Probing {}", url);
-return new FetchResult(FetchResultState.OK, url);
+return new HttpFetcherImpl.ProbeResultOk(url);
 }
 @SneakyThrows
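probeDomain now returns HttpFetcherImpl.ProbeResult instead of the removed FetchResult/FetchResultState pair, with ProbeResultOk as the success case. The mock's override as it reads after this change, with a note added; only the Ok variant is visible in this diff, other variants are presumed to exist for failure cases:

@Override
public HttpFetcherImpl.ProbeResult probeDomain(EdgeUrl url) {
    logger.info("Probing {}", url);
    // the mock always reports success; a real fetcher would presumably return
    // other ProbeResult variants on redirects or errors (not shown in this diff)
    return new HttpFetcherImpl.ProbeResultOk(url);
}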

View File

@@ -4,10 +4,10 @@ import lombok.SneakyThrows;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
+import nu.marginalia.crawl.fetcher.HttpFetcher;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.*;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.io.crawldata.CrawledDomainReader;
 import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
@@ -468,7 +468,7 @@ class CrawlerRetreiverTest {
 private void doCrawlWithReferenceStream(CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
 try (var recorder = new WarcRecorder(tempFileWarc2)) {
-new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).fetch(new DomainLinks(),
+new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
 new CrawlDataReference(stream));
 }
 catch (IOException ex) {
@@ -480,7 +480,7 @@ class CrawlerRetreiverTest {
 private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlSpecRecord specs) {
 try (var recorder = new WarcRecorder(tempFileWarc1)) {
 var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
-crawler.fetch();
+crawler.crawlDomain();
 return crawler.getCrawlFrontier();
 } catch (IOException ex) {
 Assertions.fail(ex);
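The crawl entry point is renamed from fetch() to crawlDomain(), in both the no-argument form and the recrawl form that takes anchor-tag data and a reference to a previous crawl. A compact sketch of the two call shapes, reusing the test's own fields and parameters (httpFetcher, specs, tempFileWarc1, tempFileWarc2, stream):

// fresh crawl of a domain
try (var recorder = new WarcRecorder(tempFileWarc1)) {
    var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
    crawler.crawlDomain();
}

// recrawl guided by external anchor tags and data from an earlier crawl
try (var recorder = new WarcRecorder(tempFileWarc2)) {
    new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder)
            .crawlDomain(new DomainLinks(), new CrawlDataReference(stream));
}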

View File

@@ -8,9 +8,9 @@ import nu.marginalia.api.searchquery.RpcQueryLimits;
 import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.converting.writer.ConverterBatchWriter;
-import nu.marginalia.crawl.retreival.DomainProber;
-import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawl.fetcher.ContentTags;
+import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
 import nu.marginalia.functions.searchquery.QueryFactory;
 import nu.marginalia.index.IndexGrpcService;
 import nu.marginalia.index.ReverseIndexFullFileNames;
@@ -120,7 +120,7 @@ public class IntegrationTest {
 /** CREATE WARC */
 try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
 warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
-new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
+new HttpFetcherImpl.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
 warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
 "text/html", 200,
@@ -134,6 +134,7 @@ public class IntegrationTest {
 </body>
 </html>
 """,
+"",
 ContentTags.empty()
 );
 }
@@ -204,7 +205,7 @@ public class IntegrationTest {
 .setFetchSize(1000)
 .build())
 .setQueryStrategy("AUTO")
-.setHumanQuery("\"This is how thinking works\"")
+.setHumanQuery("\"is that there is\"")
 .build();
 var params = QueryProtobufCodec.convertRequest(request);

View File

@@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
 private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
 try {
 Map<String, Object> requestData = Map.of(
-"url", domain.toRootUrl().toString(),
+"url", domain.toRootUrlHttp().toString(),
 "options",
 Map.of("fullPage", false,
 "type", "png"),