Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 21:18:58 +00:00)

commit 5407da5650 (parent b1bfe6f76e)
(crawler) Grab favicons as part of root sniff
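
In short, this commit makes the crawler's root-document sniff also resolve and fetch the site's favicon, collects the page's RSS/Atom alternate link as a sitemap candidate, consolidates the inline HTTP 429 retry loops into a private fetchWithRetry helper, and adds a ProbeType parameter to HttpFetcher.fetchContent so these auxiliary fetches can skip the usual probing. The new interface surface, quoted from the HttpFetcher hunk further down:

    HttpFetchResult fetchContent(EdgeUrl url,
                                 WarcRecorder recorder,
                                 ContentTags tags,
                                 ProbeType probeType) throws RateLimitException;

    enum ProbeType {
        DISABLED,
        FULL,
        IF_MODIFIED_SINCE
    }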
@@ -10,7 +10,8 @@ import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
 import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.*;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
@@ -87,17 +88,8 @@ public class CrawlerRetreiver implements AutoCloseable {
     }
 
     public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
-        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(
-                fetcher,
-                domain,
-                new EdgeUrl("http", new EdgeDomain(domain), null, "/", null));
-
         try {
-            // Sleep a bit to avoid hammering the server with requests, we just probed it
-            TimeUnit.SECONDS.sleep(1);
-
-            // Fetch the domain
-            return crawlDomain(oldCrawlData, probeResult, domainLinks);
+            return crawlDomain(oldCrawlData, domainLinks);
        }
        catch (Exception ex) {
            logger.error("Error crawling domain {}", domain, ex);
@@ -111,25 +103,33 @@ public class CrawlerRetreiver implements AutoCloseable {
             resync.run(warcFile);
         }
 
-    private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException {
-        String ip = findIp(domain);
-        EdgeUrl rootUrl;
+    private DomainProber.ProbeResult probeRootUrl(String ip) throws IOException {
+        // Construct an URL to the root of the domain, we don't know the schema yet so we'll
+        // start with http and then try https if that fails
+        var httpUrl = new EdgeUrl("http", new EdgeDomain(domain), null, "/", null);
+        final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, httpUrl);
 
         warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult);
 
-        if (!(probeResult instanceof DomainProber.ProbeResultOk ok)) {
-            return 1;
-        }
-        else {
-            rootUrl = ok.probedUrl();
-        }
+        return probeResult;
+    }
+
+    private int crawlDomain(CrawlDataReference oldCrawlData, DomainLinks domainLinks) throws IOException, InterruptedException {
+        String ip = findIp(domain);
+        EdgeUrl rootUrl;
+
+        if (probeRootUrl(ip) instanceof DomainProber.ProbeResultOk ok) rootUrl = ok.probedUrl();
+        else return 1;
+
+        // Sleep after the initial probe, we don't have access to the robots.txt yet
+        // so we don't know the crawl delay
+        TimeUnit.SECONDS.sleep(1);
 
         final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
         final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
 
         delayTimer.waitFetchDelay(0); // initial delay after robots.txt
         sniffRootDocument(rootUrl, delayTimer);
-        delayTimer.waitFetchDelay(0); // delay after sniffing
 
         // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
         int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer);
@@ -187,7 +187,7 @@ public class CrawlerRetreiver implements AutoCloseable {
 
 
             try {
-                if (fetchWriteAndSleep(top, delayTimer, DocumentWithReference.empty()).isOk()) {
+                if (fetchContentWithReference(top, delayTimer, DocumentWithReference.empty()).isOk()) {
                     fetchedCount++;
                 }
             }
@@ -208,21 +208,8 @@ public class CrawlerRetreiver implements AutoCloseable {
 
         var url = rootUrl.withPathAndParam("/", null);
 
-        HttpFetchResult result = null;
-
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                result = fetcher.fetchContent(url, warcRecorder, ContentTags.empty());
-                break;
-            }
-            catch (RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", url, ex);
-                result = new HttpFetchResult.ResultException(ex);
-            }
-        }
+        HttpFetchResult result = fetchWithRetry(url, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        timer.waitFetchDelay(0);
 
         if (!(result instanceof HttpFetchResult.ResultOk ok))
             return;
@@ -235,24 +222,39 @@ public class CrawlerRetreiver implements AutoCloseable {
             var doc = optDoc.get();
             crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
 
+            EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
+            EdgeUrl sitemapUrl = url.withPathAndParam("/sitemap.xml", null);
+
             for (var link : doc.getElementsByTag("link")) {
                 String rel = link.attr("rel");
                 String type = link.attr("type");
 
-                if (!rel.equalsIgnoreCase("alternate"))
-                    continue;
+                if (rel.equals("icon") || rel.equals("shortcut icon")) {
+                    String href = link.attr("href");
 
-                if (!(type.equalsIgnoreCase("application/atom+xml")
-                    || type.equalsIgnoreCase("application/rss+xml")))
-                    continue;
+                    faviconUrl = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .orElse(faviconUrl);
+                }
 
-                String href = link.attr("href");
+                // Grab the RSS/Atom as a sitemap if it exists
+                if (rel.equalsIgnoreCase("alternate")
+                        && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
+                    String href = link.attr("href");
 
-                linkParser.parseLink(url, href)
+                    sitemapUrl = linkParser.parseLink(url, href)
                         .filter(crawlFrontier::isSameDomain)
-                        .map(List::of)
-                        .ifPresent(sitemapFetcher::downloadSitemaps);
+                        .orElse(sitemapUrl);
+                }
             }
+
+            // Download the sitemap if it exists
+            sitemapFetcher.downloadSitemaps(List.of(sitemapUrl));
+            timer.waitFetchDelay(0);
+
+            // Grab the favicon if it exists
+            fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+            timer.waitFetchDelay(0);
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
@@ -262,31 +264,16 @@ public class CrawlerRetreiver implements AutoCloseable {
         }
     }
 
-    public HttpFetchResult fetchWriteAndSleep(EdgeUrl top,
+    public HttpFetchResult fetchContentWithReference(EdgeUrl top,
                                               CrawlDelayTimer timer,
                                               DocumentWithReference reference) throws InterruptedException
     {
         logger.debug("Fetching {}", top);
 
-        HttpFetchResult fetchedDoc = new HttpFetchResult.ResultNone();
-
         long startTime = System.currentTimeMillis();
         var contentTags = reference.getContentTags();
 
-        // Fetch the document, retrying if we get a rate limit exception
-        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
-            try {
-                fetchedDoc = fetcher.fetchContent(top, warcRecorder, contentTags);
-                break;
-            }
-            catch (RateLimitException ex) {
-                timer.waitRetryDelay(ex);
-            }
-            catch (Exception ex) {
-                logger.warn("Failed to fetch {}", top, ex);
-                fetchedDoc = new HttpFetchResult.ResultException(ex);
-            }
-        }
+        HttpFetchResult fetchedDoc = fetchWithRetry(top, timer, HttpFetcher.ProbeType.FULL, contentTags);
 
         // Parse the document and enqueue links
         try {
@@ -328,6 +315,27 @@ public class CrawlerRetreiver implements AutoCloseable {
         return fetchedDoc;
     }
 
+    /** Fetch a document and retry on 429s */
+    private HttpFetchResult fetchWithRetry(EdgeUrl url,
+                                           CrawlDelayTimer timer,
+                                           HttpFetcher.ProbeType probeType,
+                                           ContentTags contentTags) throws InterruptedException {
+        for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
+            try {
+                return fetcher.fetchContent(url, warcRecorder, contentTags, probeType);
+            }
+            catch (RateLimitException ex) {
+                timer.waitRetryDelay(ex);
+            }
+            catch (Exception ex) {
+                logger.warn("Failed to fetch {}", url, ex);
+                return new HttpFetchResult.ResultException(ex);
+            }
+        }
+
+        return new HttpFetchResult.ResultNone();
+    }
+
     private boolean isAllowedProtocol(String proto) {
         return proto.equalsIgnoreCase("http")
             || proto.equalsIgnoreCase("https");
@@ -3,8 +3,8 @@ package nu.marginalia.crawl.retreival.fetcher;
 import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.retreival.RateLimitException;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 
@@ -19,9 +19,18 @@ public interface HttpFetcher {
 
     FetchResult probeDomain(EdgeUrl url);
 
-    HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) throws RateLimitException;
+    HttpFetchResult fetchContent(EdgeUrl url,
+                                 WarcRecorder recorder,
+                                 ContentTags tags,
+                                 ProbeType probeType) throws RateLimitException;
 
     SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);
 
     SitemapRetriever createSitemapRetriever();
+
+    enum ProbeType {
+        DISABLED,
+        FULL,
+        IF_MODIFIED_SINCE
+    }
 }
@@ -11,10 +11,10 @@ import nu.marginalia.crawl.retreival.fetcher.ContentTypeProber.ContentTypeProbeR
 import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.body.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import okhttp3.ConnectionPool;
@@ -145,12 +145,13 @@ public class HttpFetcherImpl implements HttpFetcher {
     @SneakyThrows
     public HttpFetchResult fetchContent(EdgeUrl url,
                                         WarcRecorder warcRecorder,
-                                        ContentTags contentTags)
+                                        ContentTags contentTags,
+                                        ProbeType probeType)
     {
 
         // We don't want to waste time and resources on URLs that are not HTML, so if the file ending
         // looks like it might be something else, we perform a HEAD first to check the content type
-        if (contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
+        if (probeType == ProbeType.FULL && contentTags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url))
         {
             ContentTypeProbeResult probeResult = contentTypeProber.probeContentType(url);
             if (probeResult instanceof ContentTypeProbeResult.Ok ok) {
@@ -174,7 +175,9 @@ public class HttpFetcherImpl implements HttpFetcher {
         else {
             // Possibly do a soft probe to see if the URL has been modified since the last time we crawled it
             // if we have reason to suspect ETags are not supported by the server.
-            if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) {
+            if (probeType == ProbeType.IF_MODIFIED_SINCE
+                && softIfModifiedSinceProber.probeModificationTime(url, contentTags))
+            {
                 return new HttpFetchResult.Result304Raw();
             }
         }
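
Taken together with the interface change, the probe modes now gate the two pre-fetch probes in HttpFetcherImpl as the hunks above show: FULL may issue the HEAD content-type probe for binary-looking URLs, IF_MODIFIED_SINCE may issue the soft if-modified-since probe when ETags look unsupported, and DISABLED (used for the favicon and sitemap fetches) skips both. A condensed, illustrative sketch of that gating, not the literal implementation:

    import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;

    // ProbeGatingSketch is illustrative only and not part of the codebase.
    class ProbeGatingSketch {
        /** Only a FULL fetch may issue the HEAD content-type probe for binary-looking URLs. */
        static boolean mayHeadProbeContentType(HttpFetcher.ProbeType probeType) {
            return probeType == HttpFetcher.ProbeType.FULL;
        }

        /** Only IF_MODIFIED_SINCE may issue the soft if-modified-since probe. */
        static boolean maySoftIfModifiedProbe(HttpFetcher.ProbeType probeType) {
            return probeType == HttpFetcher.ProbeType.IF_MODIFIED_SINCE;
        }

        // DISABLED skips both probes.
    }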
@@ -137,7 +137,7 @@ public class CrawlerRevisitor {
 
             DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData);
 
-            var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference);
+            var result = crawlerRetreiver.fetchContentWithReference(url, delayTimer, reference);
 
             if (reference.isSame(result)) {
                 retained++;
@@ -3,11 +3,12 @@ package nu.marginalia.crawling;
 import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
+import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.body.ContentTypeLogic;
+import nu.marginalia.crawling.body.DocumentBodyExtractor;
+import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.model.EdgeUrl;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
@@ -35,7 +36,7 @@ class HttpFetcherTest {
     void fetchUTF8() throws URISyntaxException, RateLimitException, IOException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty());
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
@@ -47,7 +48,7 @@
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
 
         try (var recorder = new WarcRecorder()) {
-            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty());
+            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), recorder, ContentTags.empty(), HttpFetcher.ProbeType.FULL);
             if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                 System.out.println(bodyOk.contentType());
             }
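
A standalone check of the new DISABLED mode could follow the same pattern as the two tests above; the method below is a hypothetical addition to the same test class (not part of this commit), fetching the favicon without the content-type probe, which mirrors how the root sniff now fetches /favicon.ico:

    @Test
    void fetchFaviconWithoutProbe() throws Exception {
        var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");

        try (var recorder = new WarcRecorder()) {
            // ProbeType.DISABLED skips the HEAD content-type probe entirely
            var result = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/favicon.ico"),
                    recorder, ContentTags.empty(), HttpFetcher.ProbeType.DISABLED);

            if (DocumentBodyExtractor.asString(result) instanceof DocumentBodyResult.Ok bodyOk) {
                System.out.println(bodyOk.contentType());
            }
        }
    }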
@@ -5,8 +5,8 @@ import lombok.SneakyThrows;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.*;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.crawling.model.SerializableCrawlData;
@@ -23,7 +23,10 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
 
 public class CrawlerMockFetcherTest {
 
@@ -119,7 +122,7 @@ public class CrawlerMockFetcherTest {
 
     @SneakyThrows
     @Override
-    public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags) {
+    public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) {
         logger.info("Fetching {}", url);
         if (mockData.containsKey(url)) {
             byte[] bodyBytes = mockData.get(url).documentBody.getBytes();
@@ -261,6 +261,7 @@ class CrawlerRetreiverTest {
                 .collect(Collectors.toSet());
 
         assertEquals(Set.of("https://www.marginalia.nu/",
+                        "https://www.marginalia.nu/favicon.ico",
                         "https://www.marginalia.nu/log/06-optimization.gmi/"),
                 fetchedUrls);
 