(crawler) Clean up crawl data reference and recrawl logic

Commit c069c8c182 (parent 9e4aa7da7c)
@@ -6,6 +6,7 @@ import lombok.SneakyThrows;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.model.gson.GsonFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -30,7 +31,10 @@ public class CrawledDomainReader {
     public CrawledDomainReader() {
     }

-    public Iterator<SerializableCrawlData> createIterator(Path path) throws IOException {
+    public Iterator<SerializableCrawlData> createIterator(Path basePath, CrawlingSpecification spec) throws IOException {
+
+        final var path = CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain);
+
         BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));

         return new Iterator<>() {
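For context: the reader is now handed the crawl output directory plus the crawling specification and resolves the on-disk file itself via CrawlerOutputFile, so callers no longer need to know the file naming scheme. A minimal caller sketch under those assumptions (the helper name and error handling are illustrative, not part of the commit):

    // Sketch: stream previously crawled records for a specification.
    static void printOldUrls(CrawledDomainReader reader,
                             Path crawlDataDir,
                             CrawlingSpecification spec) throws IOException {
        Iterator<SerializableCrawlData> it = reader.createIterator(crawlDataDir, spec);
        while (it.hasNext()) {
            // Domain and document records arrive interleaved; keep only documents here
            if (it.next() instanceof CrawledDocument doc) {
                System.out.println(doc.url);
            }
        }
    }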
@@ -18,8 +18,6 @@ public class CrawlingSpecification {
     public String domain;
     public List<String> urls;

-    public CrawledDomain oldData;
-
     @Override
     public String toString() {
         return String.format(getClass().getSimpleName() + "[" + id + "/" + domain + ": " + crawlDepth + "[ " + urls.size() + "]");
@@ -8,22 +8,15 @@ import java.util.concurrent.Semaphore;
 public class CrawlLimiter {
     public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);

-    public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}
-
     private final Semaphore taskSemCount = new Semaphore(maxPoolSize);

-    public CrawlTaskLimits getTaskLimits(Path fileName) {
-        return new CrawlTaskLimits(fileName, true, 1);
-    }
-
-    public void acquire(CrawlTaskLimits properties) throws InterruptedException {
+    public void acquire() throws InterruptedException {
         // It's very important that we acquire the RAM semaphore first to avoid a deadlock
         taskSemCount.acquire(1);
     }

-    public void release(CrawlTaskLimits properties) {
+    public void release() {
         taskSemCount.release(1);
     }
 }
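The limiter is reduced to a bare counting semaphore with no per-task limits object, so the expected usage is a plain acquire/release pair around each crawl task, which is exactly what CrawlerMain does in the hunks below. A small sketch of that pattern (the wrapper method is illustrative):

    // Sketch: gate one crawl task on the simplified CrawlLimiter.
    static void runGated(CrawlLimiter limiter, Runnable task) throws InterruptedException {
        limiter.acquire();      // blocks while maxPoolSize tasks are already in flight
        try {
            task.run();
        }
        finally {
            limiter.release();  // always release, even if the task throws
        }
    }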
@@ -6,6 +6,7 @@ import com.google.inject.Inject;
 import com.google.inject.Injector;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.io.CrawledDomainReader;
 import nu.marginalia.crawling.io.CrawlerOutputFile;
@@ -173,49 +174,37 @@ public class CrawlerMain implements AutoCloseable {
             return;
         }

-        var limits = crawlLimiter.getTaskLimits(CrawlerOutputFile.getOutputFile(crawlDataDir, crawlingSpecification));
-
         try {
-            crawlLimiter.acquire(limits);
+            crawlLimiter.acquire();
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }

         pool.execute(() -> {
             try {
-                fetchDomain(crawlingSpecification, limits);
+                fetchDomain(crawlingSpecification);
                 heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
             }
             finally {
-                crawlLimiter.release(limits);
+                crawlLimiter.release();
             }
         });
     }

-    private void fetchDomain(CrawlingSpecification specification, CrawlLimiter.CrawlTaskLimits limits) {
+    private void fetchDomain(CrawlingSpecification specification) {
         if (workLog.isJobFinished(specification.id))
             return;

         HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);

-        Iterator<SerializableCrawlData> iterator;
-        try {
-            if (limits.isRefreshable()) {
-                iterator = reader.createIterator(limits.refreshPath());
-            }
-            else {
-                iterator = Collections.emptyIterator();
-            }
-        } catch (IOException e) {
-            logger.warn("Failed to read previous crawl data for {}", specification.domain);
-            iterator = Collections.emptyIterator();
-        }
-
         try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
             var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);

-            int size = retreiver.fetch(iterator);
+            CrawlDataReference reference = getReference(specification);
+
+            int size = retreiver.fetch(reference);

             workLog.setJobToFinished(specification.id, writer.getOutputFile().toString(), size);

@@ -225,6 +214,16 @@ public class CrawlerMain implements AutoCloseable {
         }
     }

+    private CrawlDataReference getReference(CrawlingSpecification specification) {
+        try {
+            var iterator = reader.createIterator(crawlDataDir, specification);
+            return new CrawlDataReference(iterator);
+        } catch (IOException e) {
+            logger.warn("Failed to read previous crawl data for {}", specification.domain);
+            return new CrawlDataReference();
+        }
+    }
+
     private static class CrawlRequest {
         private final CrawlPlan plan;
         private final MqMessage message;
@@ -1,123 +1,73 @@
 package nu.marginalia.crawl.retreival;

+import com.google.common.hash.HashCode;
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import nu.marginalia.bigstring.BigString;
 import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.lsh.EasyLSH;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.net.URISyntaxException;
+import javax.annotation.Nullable;
 import java.util.*;
-import java.util.stream.Collectors;

 /** A reference to a domain that has been crawled before. */
 public class CrawlDataReference {
     private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
-    final Map<EdgeUrl, CrawledDocument> documents;
-    final Map<EdgeUrl, String> etags;
-    final Map<EdgeUrl, String> lastModified;
-    final Set<EdgeUrl> previouslyDeadUrls = new HashSet<>();

-    CrawlDataReference(CrawledDomain referenceDomain) {
-
-        if (referenceDomain == null || referenceDomain.doc == null) {
-            documents = Collections.emptyMap();
-            etags = Collections.emptyMap();
-            lastModified = Collections.emptyMap();
-            return;
-        }
-
-        documents = new HashMap<>(referenceDomain.doc.size());
-        etags = new HashMap<>(referenceDomain.doc.size());
-        lastModified = new HashMap<>(referenceDomain.doc.size());
-
-        for (var doc : referenceDomain.doc) {
-            try {
-                addReference(doc);
-            } catch (URISyntaxException ex) {
-                logger.warn("Failed to add reference document {}", doc.url);
-            }
-        }
-    }
-
-    private void addReference(CrawledDocument doc) throws URISyntaxException {
-        var url = new EdgeUrl(doc.url);
-
-        if (doc.httpStatus == 404) {
-            previouslyDeadUrls.add(url);
-            return;
-        }
-
-        if (doc.httpStatus != 200) {
-            return;
-        }
-
-        documents.put(url, doc);
-
-        String headers = doc.headers;
-        if (headers != null) {
-            String[] headersLines = headers.split("\n");
-
-            String lastmod = null;
-            String etag = null;
-
-            for (String line : headersLines) {
-                if (line.toLowerCase().startsWith("etag:")) {
-                    etag = line.substring(5).trim();
-                }
-                if (line.toLowerCase().startsWith("last-modified:")) {
-                    lastmod = line.substring(14).trim();
-                }
-            }
-
-            if (lastmod != null) {
-                lastModified.put(url, lastmod);
-            }
-            if (etag != null) {
-                etags.put(url, etag);
-            }
-        }
-    }
-
-    public boolean isPreviouslyDead(EdgeUrl url) {
-        return previouslyDeadUrls.contains(url);
-    }
-    public int size() {
-        return documents.size();
-    }
-
-    public String getEtag(EdgeUrl url) {
-        return etags.get(url);
-    }
-
-    public String getLastModified(EdgeUrl url) {
-        return lastModified.get(url);
-    }
-
-    public Map<EdgeUrl, CrawledDocument> allDocuments() {
-        return documents;
-    }
-
-    public Map<EdgeUrl, CrawledDocument> sample(int sampleSize) {
-        return documents.entrySet().stream().limit(sampleSize).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
-    }
-
-    public void evict() {
-        documents.clear();
-        etags.clear();
-        lastModified.clear();
-    }
-
-    public CrawledDocument getDoc(EdgeUrl top) {
-        return documents.get(top);
-    }
-
-    // This bit of manual housekeeping is needed to keep the memory footprint low
-    public void dispose(EdgeUrl url) {
-        documents.remove(url);
-        etags.remove(url);
-        lastModified.remove(url);
-    }
+
+    private final Iterator<SerializableCrawlData> data;
+    private final HashFunction hashFunction = Hashing.murmur3_128();
+
+    public CrawlDataReference(Iterator<SerializableCrawlData> data) {
+        this.data = data;
+    }
+
+    public CrawlDataReference() {
+        this(Collections.emptyIterator());
+    }
+
+    @Nullable
+    public CrawledDocument nextDocument() {
+        while (data.hasNext()) {
+            if (data.next() instanceof CrawledDocument doc) {
+                return doc;
+            }
+        }
+        return null;
+    }
+
+    public boolean isContentSame(CrawledDocument one, CrawledDocument other) {
+        assert one.documentBody != null;
+        assert other.documentBody != null;
+
+        final long contentHashOne = contentHash(one.documentBody);
+        final long contentHashOther = contentHash(other.documentBody);
+
+        return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
+    }
+
+    private long contentHash(BigString documentBody) {
+        String content = documentBody.decode();
+        EasyLSH hash = new EasyLSH();
+        int next = 0;
+
+        boolean isInTag = false;
+        for (int i = 0; i < content.length(); i++) {
+            char c = content.charAt(i);
+            if (c == '<') {
+                isInTag = true;
+            } else if (c == '>') {
+                isInTag = false;
+            } else if (!isInTag) {
+                next = (next << 8) | (byte) c;
+                hash.addHashUnordered(hashFunction.hashInt(next).asInt());
+            }
+        }
+
+        return hash.get();
+    }
 }
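The rewritten CrawlDataReference no longer materializes the previous crawl into URL-keyed maps of documents, etags and last-modified values; it streams the old crawl data through an iterator instead. Its two operations are nextDocument(), which returns the next previously crawled document or null when the data is exhausted, and isContentSame(), which fingerprints each body with a rolling shingle of recent characters taken outside HTML tags, hashed with murmur3 and folded into an EasyLSH signature, and treats two documents as unchanged when their signatures differ by fewer than 4 bits. A hedged usage sketch (the helper is illustrative; note the reference is single-pass, matching how the recrawl loop walks old and new data in the same order):

    // Sketch: walk the previous crawl and check a refetched page against its old copy.
    static boolean unchangedSinceLastCrawl(CrawlDataReference oldData, CrawledDocument refetched) {
        CrawledDocument old;
        while ((old = oldData.nextDocument()) != null) {
            if (old.url.equals(refetched.url)) {
                return oldData.isContentSame(old, refetched);   // LSH distance below 4 bits
            }
        }
        return false;   // URL was not part of the previous crawl
    }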
@@ -97,10 +97,10 @@ public class CrawlerRetreiver {
     }

     public int fetch() {
-        return fetch(Collections.emptyIterator());
+        return fetch(new CrawlDataReference());
     }

-    public int fetch(Iterator<SerializableCrawlData> oldCrawlData) {
+    public int fetch(CrawlDataReference oldCrawlData) {
         final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek());

         if (probeResult instanceof DomainProber.ProbeResultOk) {
@@ -141,7 +141,7 @@ public class CrawlerRetreiver {
         throw new IllegalStateException("Unknown probe result: " + probeResult);
     };

-    private int crawlDomain(Iterator<SerializableCrawlData> oldCrawlData) {
+    private int crawlDomain(CrawlDataReference oldCrawlData) {
         String ip = findIp(domain);

         assert !crawlFrontier.isEmpty();
@@ -207,14 +207,18 @@ public class CrawlerRetreiver {
         return fetchedCount;
     }

-    private int recrawl(Iterator<SerializableCrawlData> oldCrawlData,
+    private int recrawl(CrawlDataReference oldCrawlData,
                         SimpleRobotRules robotsRules,
                         long crawlDelay) {
         int recrawled = 0;
         int retained = 0;

-        while (oldCrawlData.hasNext()) {
-            if (!(oldCrawlData.next() instanceof CrawledDocument doc)) continue;
+        for (;;) {
+            CrawledDocument doc = oldCrawlData.nextDocument();
+
+            if (doc == null) {
+                break;
+            }

             // This Shouldn't Happen (TM)
             var urlMaybe = EdgeUrl.parse(doc.url);
@@ -265,6 +269,9 @@ public class CrawlerRetreiver {
             if (Objects.equals(fetchedDocOpt.get().recrawlState, retainedTag)) {
                 retained ++;
             }
+            else if (oldCrawlData.isContentSame(doc, fetchedDocOpt.get())) {
+                retained ++;
+            }

             recrawled ++;
         }
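With the reference available inside the recrawl loop, a document now counts as retained either when the refetch carries the retained tag or when its body is near-identical to the stored copy according to isContentSame. The decision in isolation, as a hedged sketch (the helper name is hypothetical; needs java.util.Objects):

    // Sketch: was this refetch effectively unchanged?
    static boolean wasRetained(CrawlDataReference oldCrawlData,
                               CrawledDocument oldDoc,
                               CrawledDocument fetchedDoc,
                               String retainedTag) {
        if (Objects.equals(fetchedDoc.recrawlState, retainedTag)) {
            return true;                                        // marked retained by the crawler
        }
        return oldCrawlData.isContentSame(oldDoc, fetchedDoc);  // body unchanged per the LSH check
    }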
@@ -70,7 +70,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");

-        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>(), null), out::add)
+        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
                 .withNoDelay()
                 .fetch();

@@ -83,7 +83,7 @@ public class CrawlerMockFetcherTest {

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");

-        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>(), null), out::add)
+        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add)
                 .withNoDelay()
                 .fetch();

@@ -98,7 +98,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");

-        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>(), null), out::add)
+        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add)
                 .withNoDelay()
                 .fetch();

@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;

 import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
@@ -109,7 +110,7 @@ class CrawlerRetreiverTest {

         var specs = CrawlingSpecification
                 .builder()
-                .id("whatever")
+                .id("123456")
                 .crawlDepth(12)
                 .domain("www.marginalia.nu")
                 .urls(List.of("https://www.marginalia.nu/some-dead-link"))
@@ -117,7 +118,7 @@ class CrawlerRetreiverTest {


         Path out = Files.createTempDirectory("crawling-process");
-        var writer = new CrawledDomainWriter(out, "test", "123456");
+        var writer = new CrawledDomainWriter(out, "www.marginalia.nu", "123456");
         Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();

         new CrawlerRetreiver(httpFetcher, specs, d -> {
@@ -130,18 +131,16 @@ class CrawlerRetreiverTest {
         writer.close();

         var reader = new CrawledDomainReader();
-        var iter = reader.createIterator(CrawlerOutputFile.getOutputFile(out, "123456", "test"));
+        var iter = reader.createIterator(out, specs);

         CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
         domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());

-        var newSpec = specs.withOldData(domain);
-
-        new CrawlerRetreiver(httpFetcher, newSpec, d -> {
+        new CrawlerRetreiver(httpFetcher, specs, d -> {
             if (d instanceof CrawledDocument doc) {
                 System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
             }
-        }).fetch(iter);
+        }).fetch(new CrawlDataReference(iter));

     }
 }
@@ -31,9 +31,9 @@ public class CrawlJobSpecWriterTest {
     @Test
     public void testReadWrite() throws IOException {
         try (CrawlJobSpecWriter writer = new CrawlJobSpecWriter(tempFile)) {
-            writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"), null));
-            writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d"), null));
-            writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"), null));
+            writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c")));
+            writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d")));
+            writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b")));
         }

         List<CrawlingSpecification> outputs = new ArrayList<>();