mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(crawler) Reduce long retention of CrawlDataReference objects and their associated SerializableCrawlDataStreams
This commit is contained in:
parent
f0d74aa3bb
commit
c8b0a32c0f
@ -20,7 +20,6 @@ import nu.marginalia.crawl.warc.WarcArchiverFactory;
|
|||||||
import nu.marginalia.crawl.warc.WarcArchiverIf;
|
import nu.marginalia.crawl.warc.WarcArchiverIf;
|
||||||
import nu.marginalia.db.DomainBlacklist;
|
import nu.marginalia.db.DomainBlacklist;
|
||||||
import nu.marginalia.io.CrawlerOutputFile;
|
import nu.marginalia.io.CrawlerOutputFile;
|
||||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
|
||||||
import nu.marginalia.model.EdgeDomain;
|
import nu.marginalia.model.EdgeDomain;
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
import nu.marginalia.process.ProcessConfiguration;
|
import nu.marginalia.process.ProcessConfiguration;
|
||||||
@ -417,13 +416,13 @@ public class CrawlerMain extends ProcessMainClass {
|
|||||||
try {
|
try {
|
||||||
Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
|
Path slopPath = CrawlerOutputFile.getSlopPath(outputDir, id, domain);
|
||||||
if (Files.exists(slopPath)) {
|
if (Files.exists(slopPath)) {
|
||||||
return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
|
return new CrawlDataReference(slopPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
|
Path parquetPath = CrawlerOutputFile.getParquetPath(outputDir, id, domain);
|
||||||
if (Files.exists(parquetPath)) {
|
if (Files.exists(parquetPath)) {
|
||||||
slopPath = migrateParquetData(parquetPath, domain, outputDir);
|
slopPath = migrateParquetData(parquetPath, domain, outputDir);
|
||||||
return new CrawlDataReference(SerializableCrawlDataStream.openDataStream(slopPath));
|
return new CrawlDataReference(slopPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
|
@ -4,6 +4,7 @@ import nu.marginalia.ContentTypes;
|
|||||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||||
import nu.marginalia.lsh.EasyLSH;
|
import nu.marginalia.lsh.EasyLSH;
|
||||||
import nu.marginalia.model.crawldata.CrawledDocument;
|
import nu.marginalia.model.crawldata.CrawledDocument;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -11,51 +12,73 @@ import javax.annotation.Nullable;
|
|||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
/** A reference to a domain that has been crawled before. */
|
/** A reference to a domain that has been crawled before. */
|
||||||
public class CrawlDataReference implements AutoCloseable {
|
public class CrawlDataReference implements AutoCloseable, Iterable<CrawledDocument> {
|
||||||
|
|
||||||
|
private boolean closed = false;
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
private final Path path;
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
private SerializableCrawlDataStream data = null;
|
||||||
|
|
||||||
private final SerializableCrawlDataStream data;
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
|
private static final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
|
||||||
|
|
||||||
public CrawlDataReference(SerializableCrawlDataStream data) {
|
public CrawlDataReference(@Nullable Path path) {
|
||||||
this.data = data;
|
this.path = path;
|
||||||
}
|
}
|
||||||
|
|
||||||
public CrawlDataReference() {
|
public CrawlDataReference() {
|
||||||
this(SerializableCrawlDataStream.empty());
|
this(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Delete the associated data from disk, if it exists */
|
/** Delete the associated data from disk, if it exists */
|
||||||
public void delete() throws IOException {
|
public void delete() throws IOException {
|
||||||
Path filePath = data.path();
|
if (path != null) {
|
||||||
|
Files.deleteIfExists(path);
|
||||||
if (filePath != null) {
|
|
||||||
Files.deleteIfExists(filePath);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Get the next document from the crawl data,
|
public @NotNull Iterator<CrawledDocument> iterator() {
|
||||||
* returning null when there are no more documents
|
|
||||||
* available
|
requireStream();
|
||||||
*/
|
// Guaranteed by requireStream, but helps java
|
||||||
@Nullable
|
Objects.requireNonNull(data);
|
||||||
public CrawledDocument nextDocument() {
|
|
||||||
|
return data.map(next -> {
|
||||||
|
if (next instanceof CrawledDocument doc && ContentTypes.isAccepted(doc.contentType)) {
|
||||||
|
return Optional.of(doc);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/** After calling this method, data is guaranteed to be non-null */
|
||||||
|
private void requireStream() {
|
||||||
|
if (closed) {
|
||||||
|
throw new IllegalStateException("Use after close()");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (data == null) {
|
||||||
try {
|
try {
|
||||||
while (data.hasNext()) {
|
if (path != null) {
|
||||||
if (data.next() instanceof CrawledDocument doc) {
|
data = SerializableCrawlDataStream.openDataStream(path);
|
||||||
if (!ContentTypes.isAccepted(doc.contentType))
|
return;
|
||||||
continue;
|
|
||||||
|
|
||||||
return doc;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
catch (Exception ex) {
|
||||||
catch (IOException ex) {
|
logger.error("Failed to open stream", ex);
|
||||||
logger.error("Failed to read next document", ex);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
data = SerializableCrawlDataStream.empty();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isContentBodySame(byte[] one, byte[] other) {
|
public static boolean isContentBodySame(byte[] one, byte[] other) {
|
||||||
@ -98,7 +121,12 @@ public class CrawlDataReference implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws Exception {
|
public void close() throws IOException {
|
||||||
|
if (!closed) {
|
||||||
|
if (data != null) {
|
||||||
data.close();
|
data.close();
|
||||||
}
|
}
|
||||||
|
closed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
@ -89,47 +89,23 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
public int crawlDomain(DomainLinks domainLinks, CrawlDataReference oldCrawlData) {
|
||||||
try {
|
try (oldCrawlData) {
|
||||||
// Do an initial domain probe to determine the root URL
|
// Do an initial domain probe to determine the root URL
|
||||||
EdgeUrl rootUrl;
|
|
||||||
|
|
||||||
var probeResult = probeRootUrl();
|
var probeResult = probeRootUrl();
|
||||||
switch (probeResult) {
|
|
||||||
|
return switch (probeResult) {
|
||||||
case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
|
case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
|
||||||
rootUrl = probedUrl; // Good track
|
|
||||||
}
|
|
||||||
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
|
||||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
|
|
||||||
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sleep after the initial probe, we don't have access to the robots.txt yet
|
// Sleep after the initial probe, we don't have access to the robots.txt yet
|
||||||
// so we don't know the crawl delay
|
// so we don't know the crawl delay
|
||||||
TimeUnit.SECONDS.sleep(1);
|
TimeUnit.SECONDS.sleep(1);
|
||||||
|
|
||||||
return crawlDomain(oldCrawlData, rootUrl, domainLinks);
|
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(probedUrl.domain, warcRecorder);
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.error("Error crawling domain {}", domain, ex);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private int crawlDomain(CrawlDataReference oldCrawlData,
|
|
||||||
EdgeUrl rootUrl,
|
|
||||||
DomainLinks domainLinks) throws InterruptedException {
|
|
||||||
|
|
||||||
final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder);
|
|
||||||
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
|
final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay());
|
||||||
|
|
||||||
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
delayTimer.waitFetchDelay(0); // initial delay after robots.txt
|
||||||
|
|
||||||
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
|
DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(probedUrl, delayTimer);
|
||||||
domainStateDb.save(summaryRecord);
|
domainStateDb.save(summaryRecord);
|
||||||
|
|
||||||
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
// Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
|
||||||
@ -138,10 +114,36 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
crawlFrontier.increaseDepth(1.5, 2500);
|
crawlFrontier.increaseDepth(1.5, 2500);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
oldCrawlData.close(); // proactively close the crawl data reference here to not hold onto expensive resources
|
||||||
|
|
||||||
|
yield crawlDomain(probedUrl, robotsRules, delayTimer, domainLinks);
|
||||||
|
}
|
||||||
|
case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
|
||||||
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
|
||||||
|
yield 1;
|
||||||
|
}
|
||||||
|
case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
|
||||||
|
domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
|
||||||
|
yield 1;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.error("Error crawling domain {}", domain, ex);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int crawlDomain(EdgeUrl rootUrl,
|
||||||
|
SimpleRobotRules robotsRules,
|
||||||
|
CrawlDelayTimer delayTimer,
|
||||||
|
DomainLinks domainLinks) {
|
||||||
|
|
||||||
|
|
||||||
// Add external links to the crawl frontier
|
// Add external links to the crawl frontier
|
||||||
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
crawlFrontier.addAllToQueue(domainLinks.getUrls(rootUrl.proto));
|
||||||
|
|
||||||
|
|
||||||
// Fetch sitemaps
|
// Fetch sitemaps
|
||||||
for (var sitemap : robotsRules.getSitemaps()) {
|
for (var sitemap : robotsRules.getSitemaps()) {
|
||||||
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
crawlFrontier.addAllToQueue(fetcher.fetchSitemapUrls(sitemap, delayTimer));
|
||||||
|
@ -40,18 +40,12 @@ public class CrawlerRevisitor {
|
|||||||
int errors = 0;
|
int errors = 0;
|
||||||
int skipped = 0;
|
int skipped = 0;
|
||||||
|
|
||||||
for (;;) {
|
for (CrawledDocument doc : oldCrawlData) {
|
||||||
if (errors > 20) {
|
if (errors > 20) {
|
||||||
// If we've had too many errors, we'll stop trying to recrawl
|
// If we've had too many errors, we'll stop trying to recrawl
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
CrawledDocument doc = oldCrawlData.nextDocument();
|
|
||||||
|
|
||||||
if (doc == null)
|
|
||||||
break;
|
|
||||||
|
|
||||||
// This Shouldn't Happen (TM)
|
|
||||||
var urlMaybe = EdgeUrl.parse(doc.url);
|
var urlMaybe = EdgeUrl.parse(doc.url);
|
||||||
if (urlMaybe.isEmpty())
|
if (urlMaybe.isEmpty())
|
||||||
continue;
|
continue;
|
||||||
|
@ -108,8 +108,10 @@ public record SlopCrawlDataRecord(String domain,
|
|||||||
public static void convertFromParquet(Path parquetInput, Path slopOutput) throws IOException {
|
public static void convertFromParquet(Path parquetInput, Path slopOutput) throws IOException {
|
||||||
Path tempDir = Files.createTempDirectory(slopOutput.getParent(), "conversion");
|
Path tempDir = Files.createTempDirectory(slopOutput.getParent(), "conversion");
|
||||||
|
|
||||||
try (var writer = new Writer(tempDir)) {
|
try (var writer = new Writer(tempDir);
|
||||||
CrawledDocumentParquetRecordFileReader.stream(parquetInput).forEach(
|
var stream = CrawledDocumentParquetRecordFileReader.stream(parquetInput))
|
||||||
|
{
|
||||||
|
stream.forEach(
|
||||||
parquetRecord -> {
|
parquetRecord -> {
|
||||||
try {
|
try {
|
||||||
writer.write(new SlopCrawlDataRecord(parquetRecord));
|
writer.write(new SlopCrawlDataRecord(parquetRecord));
|
||||||
|
@ -375,7 +375,7 @@ class CrawlerRetreiverTest {
|
|||||||
doCrawl(tempFileWarc1, specs);
|
doCrawl(tempFileWarc1, specs);
|
||||||
convertToParquet(tempFileWarc1, tempFileParquet1);
|
convertToParquet(tempFileWarc1, tempFileParquet1);
|
||||||
doCrawlWithReferenceStream(specs,
|
doCrawlWithReferenceStream(specs,
|
||||||
SerializableCrawlDataStream.openDataStream(tempFileParquet1)
|
new CrawlDataReference(tempFileParquet1)
|
||||||
);
|
);
|
||||||
convertToParquet(tempFileWarc2, tempFileParquet2);
|
convertToParquet(tempFileWarc2, tempFileParquet2);
|
||||||
|
|
||||||
@ -447,11 +447,9 @@ class CrawlerRetreiverTest {
|
|||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
var stream = SerializableCrawlDataStream.openDataStream(tempFileParquet1);
|
|
||||||
|
|
||||||
System.out.println("---");
|
System.out.println("---");
|
||||||
|
|
||||||
doCrawlWithReferenceStream(specs, stream);
|
doCrawlWithReferenceStream(specs, new CrawlDataReference(tempFileParquet1));
|
||||||
|
|
||||||
var revisitCrawlFrontier = new DomainCrawlFrontier(
|
var revisitCrawlFrontier = new DomainCrawlFrontier(
|
||||||
new EdgeDomain("www.marginalia.nu"),
|
new EdgeDomain("www.marginalia.nu"),
|
||||||
@ -508,12 +506,11 @@ class CrawlerRetreiverTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
|
private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, CrawlDataReference reference) {
|
||||||
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
|
try (var recorder = new WarcRecorder(tempFileWarc2, new Cookies());
|
||||||
var db = new DomainStateDb(tempFileDb)
|
var db = new DomainStateDb(tempFileDb)
|
||||||
) {
|
) {
|
||||||
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
|
new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), reference);
|
||||||
new CrawlDataReference(stream));
|
|
||||||
}
|
}
|
||||||
catch (IOException | SQLException ex) {
|
catch (IOException | SQLException ex) {
|
||||||
Assertions.fail(ex);
|
Assertions.fail(ex);
|
||||||
|
@ -234,7 +234,7 @@ dependencyResolutionManagement {
|
|||||||
library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
|
library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
|
||||||
library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
|
library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
|
||||||
|
|
||||||
library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-5-SNAPSHOT')
|
library('slop', 'nu.marginalia', 'slop').version('0.0.10-SNAPSHOT')
|
||||||
library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
|
library('jooby-netty','io.jooby','jooby-netty').version(joobyVersion)
|
||||||
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
|
library('jooby-jte','io.jooby','jooby-jte').version(joobyVersion)
|
||||||
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
|
library('jooby-apt','io.jooby','jooby-apt').version(joobyVersion)
|
||||||
|
Loading…
Reference in New Issue
Block a user