mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00

commit f91d92cccb: (crawler) WIP
parent 08ca6399ec

@@ -6,6 +6,6 @@ import nu.marginalia.db.storage.model.FileStorageId;
 /** A request to start a crawl */
 @AllArgsConstructor
 public class CrawlRequest {
-    FileStorageId specStorage;
-    FileStorageId crawlStorage;
+    public FileStorageId specStorage;
+    public FileStorageId crawlStorage;
 }

@@ -11,6 +11,8 @@ import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Optional;
 
 /** Manages file storage for processes and services

@@ -63,6 +65,49 @@ public class FileStorageService {
         return null;
     }
 
+    public void relateFileStorages(FileStorageId source, FileStorageId target) {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     INSERT INTO FILE_STORAGE_RELATION(SOURCE_ID, TARGET_ID) VALUES (?, ?)
+                     """)) {
+            stmt.setLong(1, source.id());
+            stmt.setLong(2, target.id());
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public List<FileStorage> getSourceFromStorage(FileStorage storage) throws SQLException {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     SELECT SOURCE_ID FROM FILE_STORAGE_RELATION WHERE TARGET_ID = ?
+                     """)) {
+            stmt.setLong(1, storage.id().id());
+            var rs = stmt.executeQuery();
+            List<FileStorage> ret = new ArrayList<>();
+            while (rs.next()) {
+                ret.add(getStorage(new FileStorageId(rs.getLong(1))));
+            }
+            return ret;
+        }
+    }
+
+    public List<FileStorage> getTargetFromStorage(FileStorage storage) throws SQLException {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     SELECT TARGET_ID FROM FILE_STORAGE_RELATION WHERE SOURCE_ID = ?
+                     """)) {
+            stmt.setLong(1, storage.id().id());
+            var rs = stmt.executeQuery();
+            List<FileStorage> ret = new ArrayList<>();
+            while (rs.next()) {
+                ret.add(getStorage(new FileStorageId(rs.getLong(1))));
+            }
+            return ret;
+        }
+    }
+
     /** @return the storage base with the given type, or null if it does not exist */
     public FileStorageBase getStorageBase(FileStorageBaseType type) throws SQLException {
         try (var conn = dataSource.getConnection();

@@ -153,13 +198,7 @@ public class FileStorageService {
             var rs = query.executeQuery();
 
             if (rs.next()) {
-                return new FileStorage(
-                        new FileStorageId(rs.getLong("ID")),
-                        base,
-                        type,
-                        tempDir.toString(),
-                        description
-                );
+                return getStorage(new FileStorageId(rs.getLong("ID")));
             }
 
         }

@@ -1,6 +1,9 @@
 package nu.marginalia.db.storage.model;
 
 public record FileStorageId(long id) {
+    public static FileStorageId parse(String str) {
+        return new FileStorageId(Long.parseLong(str));
+    }
     public static FileStorageId of(int storageId) {
         return new FileStorageId(storageId);
     }

@@ -23,6 +23,14 @@ CREATE TABLE IF NOT EXISTS FILE_STORAGE (
 CHARACTER SET utf8mb4
 COLLATE utf8mb4_bin;
 
+CREATE TABLE IF NOT EXISTS FILE_STORAGE_RELATION (
+    SOURCE_ID BIGINT NOT NULL,
+    TARGET_ID BIGINT NOT NULL,
+    CONSTRAINT CONS UNIQUE (SOURCE_ID, TARGET_ID),
+    FOREIGN KEY (SOURCE_ID) REFERENCES FILE_STORAGE(ID) ON DELETE CASCADE,
+    FOREIGN KEY (TARGET_ID) REFERENCES FILE_STORAGE(ID) ON DELETE CASCADE
+);
+
 CREATE VIEW FILE_STORAGE_VIEW
 AS SELECT
     CONCAT(BASE.PATH, '/', STORAGE.PATH) AS PATH,

@@ -64,7 +64,6 @@ public class CrawledDomainReader {
             return Optional.of(read(path));
         }
         catch (Exception ex) {
-            logger.warn("Failed to read domain " + path, ex);
             return Optional.empty();
         }
     }

@@ -14,12 +14,15 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.nio.file.StandardOpenOption;
 
 public class CrawledDomainWriter implements AutoCloseable {
     private final Path outputDir;
     private final Gson gson = GsonFactory.get();
     private static final Logger logger = LoggerFactory.getLogger(CrawledDomainWriter.class);
     private final Writer writer;
+    private final Path tmpFile;
     private final Path outputFile;
 
     public CrawledDomainWriter(Path outputDir, String name, String id) throws IOException {

@@ -29,8 +32,10 @@ public class CrawledDomainWriter implements AutoCloseable {
             throw new IllegalArgumentException("Output dir " + outputDir + " does not exist");
         }
 
+        tmpFile = getOutputFile(id, name + "_tmp");
         outputFile = getOutputFile(id, name);
-        writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(outputFile))));
+        writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile,
+                StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING))));
     }
 
     public Path getOutputFile() {

@@ -46,32 +51,12 @@ public class CrawledDomainWriter implements AutoCloseable {
     }
 
     private Path getOutputFile(String id, String name) throws IOException {
-        String first = id.substring(0, 2);
-        String second = id.substring(2, 4);
-
-        Path destDir = outputDir.resolve(first).resolve(second);
-        if (!Files.exists(destDir)) {
-            Files.createDirectories(destDir);
-        }
-        return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
-    }
-
-    private String filesystemSafeName(String name) {
-        StringBuilder nameSaneBuilder = new StringBuilder();
-
-        name.chars()
-                .map(Character::toLowerCase)
-                .map(c -> (c & ~0x7F) == 0 ? c : 'X')
-                .map(c -> (Character.isDigit(c) || Character.isAlphabetic(c) || c == '.') ? c : 'X')
-                .limit(128)
-                .forEach(c -> nameSaneBuilder.append((char) c));
-
-        return nameSaneBuilder.toString();
-
+        return CrawlerOutputFile.createOutputPath(outputDir, id, name);
     }
 
     @Override
     public void close() throws IOException {
+        Files.move(tmpFile, outputFile, StandardCopyOption.REPLACE_EXISTING);
         writer.close();
     }
 }

@@ -0,0 +1,53 @@
+package nu.marginalia.crawling.io;
+
+import nu.marginalia.crawling.model.spec.CrawlingSpecification;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+public class CrawlerOutputFile {
+
+    public static Path getOutputFile(Path base, CrawlingSpecification spec) {
+        return getOutputFile(base, spec.id, spec.domain);
+    }
+
+
+    /** Return the Path to a file for the given id and name */
+    public static Path getOutputFile(Path base, String id, String name) {
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = base.resolve(first).resolve(second);
+        return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
+    }
+
+    /** Return the Path to a file for the given id and name, creating the prerequisite
+     * directory structure as necessary. */
+    public static Path createOutputPath(Path base, String id, String name) throws IOException {
+        String first = id.substring(0, 2);
+        String second = id.substring(2, 4);
+
+        Path destDir = base.resolve(first).resolve(second);
+        if (!Files.exists(destDir)) {
+            Files.createDirectories(destDir);
+        }
+        return destDir.resolve(id + "-" + filesystemSafeName(name) + ".zstd");
+    }
+
+
+    private static String filesystemSafeName(String name) {
+        StringBuilder nameSaneBuilder = new StringBuilder();
+
+        name.chars()
+                .map(Character::toLowerCase)
+                .map(c -> (c & ~0x7F) == 0 ? c : 'X')
+                .map(c -> (Character.isDigit(c) || Character.isAlphabetic(c) || c == '.') ? c : 'X')
+                .limit(128)
+                .forEach(c -> nameSaneBuilder.append((char) c));
+
+        return nameSaneBuilder.toString();
+
+    }
+
+}

@@ -27,6 +27,8 @@ public class CrawledDocument implements SerializableCrawlData {
     public String canonicalUrl;
     public String redirectUrl;
 
+    public String recrawlState;
+
     public static final String SERIAL_IDENTIFIER = "// DOCUMENT";
     @Override
     public String getSerialIdentifier() {

@@ -3,10 +3,12 @@ package nu.marginalia.crawling.model.spec;
 import lombok.AllArgsConstructor;
 import lombok.Builder;
 import lombok.NoArgsConstructor;
+import lombok.With;
+import nu.marginalia.crawling.model.CrawledDomain;
 
 import java.util.List;
 
-@AllArgsConstructor @NoArgsConstructor @Builder
+@AllArgsConstructor @NoArgsConstructor @Builder @With
 public class CrawlingSpecification {
     public String id;
 

@@ -16,6 +18,8 @@ public class CrawlingSpecification {
     public String domain;
     public List<String> urls;
 
+    public CrawledDomain oldData;
+
     @Override
     public String toString() {
         return String.format(getClass().getSimpleName() + "[" + id + "/" + domain + ": " + crawlDepth + "[ " + urls.size() + "]");

@@ -138,7 +138,7 @@ public class ConverterMain {
 
         // Advance the progress bar to the current position if this is a resumption
         processedDomains.set(processLog.countFinishedJobs());
-        heartbeat.setProgress(processedDomains.incrementAndGet() / (double) totalDomains);
+        heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
 
         for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id)))
         {

@@ -113,7 +113,8 @@ public class ConvertingIntegrationTest {
                     BigString.encode(readClassPathFile(p.toString())),
                     Double.toString(Math.random()),
                     "https://memex.marginalia.nu/" + file,
-                    null
+                    null,
+                    ""
             );
             docs.add(doc);
         }

@@ -27,9 +27,12 @@ dependencies {
     implementation project(':code:common:service')
     implementation project(':code:libraries:big-string')
     implementation project(':code:api:index-api')
+    implementation project(':code:api:process-mqapi')
    implementation project(':code:common:service-discovery')
    implementation project(':code:common:service-client')
+    implementation project(':code:common:message-queue')
    implementation project(':code:libraries:language-processing')
+    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:process-models:converting-model')
 
@@ -0,0 +1,72 @@
+package nu.marginalia.crawl;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.Semaphore;
+
+public class CrawlLimiter {
+    public static final int maxPoolSize = Integer.getInteger("crawler.pool-size", 512);
+
+    // We'll round up to this size when we're crawling a new domain to prevent
+    // too many concurrent connections
+    public static final int minCrawlDataSizeKb = 128; // 100 Kb
+
+    // The largest size on disk where we'll permit a refresh crawl
+    // (these files easily grow into the gigabytes, we don't want that in RAM)
+    public static final int maxRefreshableCrawlDataSizeKBytes = 1024*128; // 128 Mb
+
+    // This limits how many concurrent crawl tasks we can have running at once
+    // based on their size on disk. The on-disk size is compressed, and the
+    // in-ram size is partially compressed (i.e. only the document body); so
+    // maybe a fair estimate is something like 2-4x this figure for RAM usage
+    //
+    public static final int maxConcurrentCrawlTaskSizeKb = 512*1024; // 512 Mb
+
+    static {
+        // Sanity check; if this is false we'll get a deadlock on taskSemRAM
+        assert maxConcurrentCrawlTaskSizeKb >= maxRefreshableCrawlDataSizeKBytes
+                : "maxConcurrentCrawlTaskSizeKb must be larger than maxRefreshableCrawlDataSizeKBytes";
+    }
+
+    public record CrawlTaskLimits(Path refreshPath, boolean isRefreshable, int taskSize) {}
+
+    // We use two semaphores to keep track of the number of concurrent crawls;
+    // first a RAM sempahore to limit the amount of RAM used by refresh crawls.
+    // then a count semaphore to limit the number of concurrent threads (this keeps the connection count manageable)
+    private final Semaphore taskSemRAM = new Semaphore(maxConcurrentCrawlTaskSizeKb);
+    private final Semaphore taskSemCount = new Semaphore(maxPoolSize);
+
+
+    public CrawlTaskLimits getTaskLimits(Path fileName) {
+        long size;
+
+        try {
+            size = Math.max(minCrawlDataSizeKb, Files.size(fileName) / 1024);
+        } catch (IOException ex) {
+            // If we can't read the file, we'll assume it's small since we won't be able to read it later for the refresh either
+            return new CrawlTaskLimits(null,false, minCrawlDataSizeKb);
+        }
+
+        // We'll only permit refresh crawls if the file is small enough
+        boolean isRefreshable = size < maxRefreshableCrawlDataSizeKBytes;
+
+        // We'll truncate this down to maxRefreshableCrawlDataSizeKBytes to ensure
+        // it's possible to acquire the RAM semaphore
+        int effectiveSize = (int) Math.min(maxRefreshableCrawlDataSizeKBytes, size);
+
+        return new CrawlTaskLimits(fileName, isRefreshable, effectiveSize);
+    }
+
+
+    public void acquire(CrawlTaskLimits properties) throws InterruptedException {
+        // It's very important that we acquire the RAM semaphore first to avoid a deadlock
+        taskSemRAM.acquire(properties.taskSize);
+        taskSemCount.acquire(1);
+    }
+
+    public void release(CrawlTaskLimits properties) {
+        taskSemCount.release(1);
+        taskSemRAM.release(properties.taskSize);
+    }
+}

@@ -1,13 +1,23 @@
 package nu.marginalia.crawl;
 
-import nu.marginalia.ProcessConfiguration;
+import com.google.gson.Gson;
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import com.google.inject.Injector;
 import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
+import nu.marginalia.crawling.io.CrawledDomainReader;
+import nu.marginalia.crawling.io.CrawlerOutputFile;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.db.storage.FileStorageService;
+import nu.marginalia.mq.MessageQueueFactory;
+import nu.marginalia.mq.MqMessage;
+import nu.marginalia.mq.inbox.MqInboxResponse;
+import nu.marginalia.mq.inbox.MqSingleShotInbox;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.service.module.DatabaseModule;
-import plan.CrawlPlanLoader;
 import plan.CrawlPlan;
 import nu.marginalia.crawling.io.CrawledDomainWriter;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;

@@ -19,49 +29,63 @@ import okhttp3.internal.Util;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.HashSet;
+import java.util.Optional;
 import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.*;
 import java.util.concurrent.atomic.AtomicInteger;
 
+import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
+
 public class CrawlerMain implements AutoCloseable {
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
-    private final CrawlPlan plan;
-    private final Path crawlDataDir;
-
-    private final WorkLog workLog;
+    private Path crawlDataDir;
+    private WorkLog workLog;
+
+    private final ProcessHeartbeat heartbeat;
     private final ConnectionPool connectionPool = new ConnectionPool(5, 10, TimeUnit.SECONDS);
 
     private final Dispatcher dispatcher = new Dispatcher(new ThreadPoolExecutor(0, Integer.MAX_VALUE, 5, TimeUnit.SECONDS,
             new SynchronousQueue<>(), Util.threadFactory("OkHttp Dispatcher", true)));
 
     private final UserAgent userAgent;
+    private final MessageQueueFactory messageQueueFactory;
+    private final FileStorageService fileStorageService;
+    private final Gson gson;
     private final ThreadPoolExecutor pool;
-    final int poolSize = Integer.getInteger("crawler.pool-size", 512);
-    final int poolQueueSize = 32;
+
+    public final CrawlLimiter crawlLimiter = new CrawlLimiter();
     private final Set<String> processedIds = new HashSet<>();
 
-    AbortMonitor abortMonitor = AbortMonitor.getInstance();
-    Semaphore taskSem = new Semaphore(poolSize);
+    final AbortMonitor abortMonitor = AbortMonitor.getInstance();
 
-    private static ProcessHeartbeat heartbeat;
+    volatile int totalTasks;
+    final AtomicInteger tasksDone = new AtomicInteger(0);
 
-    public CrawlerMain(CrawlPlan plan) throws Exception {
-        this.plan = plan;
-        this.userAgent = WmsaHome.getUserAgent();
+    @Inject
+    public CrawlerMain(UserAgent userAgent,
+                       ProcessHeartbeat heartbeat,
+                       MessageQueueFactory messageQueueFactory,
+                       FileStorageService fileStorageService,
+                       Gson gson) {
+        this.heartbeat = heartbeat;
+        this.userAgent = userAgent;
+        this.messageQueueFactory = messageQueueFactory;
+        this.fileStorageService = fileStorageService;
+        this.gson = gson;
 
-        // Ensure that the user agent is set for Java's HTTP requests
-
-        BlockingQueue<Runnable> queue = new LinkedBlockingQueue<>(poolQueueSize);
-        pool = new ThreadPoolExecutor(poolSize/128, poolSize, 5, TimeUnit.MINUTES, queue); // maybe need to set -Xss for JVM to deal with this?
-
-        workLog = plan.createCrawlWorkLog();
-        crawlDataDir = plan.crawl.getDir();
+        // maybe need to set -Xss for JVM to deal with this?
+        pool = new ThreadPoolExecutor(
+                CrawlLimiter.maxPoolSize /128,
+                CrawlLimiter.maxPoolSize,
+                5, TimeUnit.MINUTES,
+                new LinkedBlockingQueue<>(32)
+        );
     }
 
     public static void main(String... args) throws Exception {

@@ -77,46 +101,65 @@ public class CrawlerMain implements AutoCloseable {
         System.setProperty("sun.net.client.defaultConnectTimeout", "30000");
         System.setProperty("sun.net.client.defaultReadTimeout", "30000");
 
-        if (args.length != 1) {
-            System.err.println("Arguments: crawl-plan.yaml");
-            System.exit(0);
-        }
-        var plan = new CrawlPlanLoader().load(Path.of(args[0]));
+        Injector injector = Guice.createInjector(
+                new CrawlerModule(),
+                new DatabaseModule()
+        );
+        var crawler = injector.getInstance(CrawlerMain.class);
 
-        heartbeat = new ProcessHeartbeat(new ProcessConfiguration("crawler", 0, UUID.randomUUID()),
-                new DatabaseModule().provideConnection());
+        var instructions = crawler.fetchInstructions();
+        try {
+            crawler.run(instructions.getPlan());
+            instructions.ok();
+        }
+        catch (Exception ex) {
+            System.err.println("Crawler failed");
+            ex.printStackTrace();
+            instructions.err();
+        }
 
-        try (var crawler = new CrawlerMain(plan)) {
-            heartbeat.start();
-            crawler.run();
-        }
-        finally {
-            heartbeat.shutDown();
-        }
+        TimeUnit.SECONDS.sleep(5);
 
         System.exit(0);
     }
 
-    public void run() throws InterruptedException {
-        // First a validation run to ensure the file is all good to parse
-        logger.info("Validating JSON");
-        int countTotal = 0;
-        int countProcessed = 0;
+    public void run(CrawlPlan plan) throws InterruptedException, IOException {
 
-        for (var unused : plan.crawlingSpecificationIterable()) {
-            countTotal++;
-        }
+        heartbeat.start();
+        try {
+            // First a validation run to ensure the file is all good to parse
+            logger.info("Validating JSON");
 
-        logger.info("Let's go");
+            workLog = plan.createCrawlWorkLog();
+            crawlDataDir = plan.crawl.getDir();
 
-        for (var spec : plan.crawlingSpecificationIterable()) {
-            heartbeat.setProgress(countProcessed / (double) countTotal);
-            startCrawlTask(spec);
+            int countTotal = 0;
+            for (var unused : plan.crawlingSpecificationIterable()) {
+                countTotal++;
+            }
+            totalTasks = countTotal;
+
+            logger.info("Let's go");
+
+            for (var spec : plan.crawlingSpecificationIterable()) {
+                startCrawlTask(plan, spec);
+            }
+
+            pool.shutdown();
+            do {
+                System.out.println("Waiting for pool to terminate... " + pool.getActiveCount() + " remaining");
+            } while (!pool.awaitTermination(60, TimeUnit.SECONDS));
+        }
+        finally {
+            heartbeat.shutDown();
         }
     }
 
-    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
+    CrawledDomainReader reader = new CrawledDomainReader();
+
+    private void startCrawlTask(CrawlPlan plan, CrawlingSpecification crawlingSpecification) {
 
         if (!processedIds.add(crawlingSpecification.id)) {
 
@@ -132,28 +175,41 @@ public class CrawlerMain implements AutoCloseable {
             return;
         }
 
+        var limits = crawlLimiter.getTaskLimits(CrawlerOutputFile.getOutputFile(crawlDataDir, crawlingSpecification));
+
         try {
-            taskSem.acquire();
+            crawlLimiter.acquire(limits);
         } catch (InterruptedException e) {
             throw new RuntimeException(e);
         }
 
         pool.execute(() -> {
             try {
-                fetchDomain(crawlingSpecification);
+                fetchDomain(crawlingSpecification, limits);
+                heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
             }
             finally {
-                taskSem.release();
+                crawlLimiter.release(limits);
             }
         });
     }
 
-    private void fetchDomain(CrawlingSpecification specification) {
+    private void fetchDomain(CrawlingSpecification specification, CrawlLimiter.CrawlTaskLimits limits) {
         if (workLog.isJobFinished(specification.id))
             return;
 
         HttpFetcher fetcher = new HttpFetcherImpl(userAgent.uaString(), dispatcher, connectionPool);
 
+        // Read the previous crawl's data for this domain, if it exists and has a reasonable size
+        Optional<CrawledDomain> domain;
+        if (limits.isRefreshable()) {
+            domain = reader.readOptionally(limits.refreshPath());
+            if (domain.isPresent()) {
+                specification = specification.withOldData(domain.get());
+            }
+        }
+
         try (CrawledDomainWriter writer = new CrawledDomainWriter(crawlDataDir, specification.domain, specification.id)) {
             var retreiver = new CrawlerRetreiver(fetcher, specification, writer::accept);
 

@@ -167,6 +223,65 @@ public class CrawlerMain implements AutoCloseable {
         }
     }
 
+    private static class CrawlRequest {
+        private final CrawlPlan plan;
+        private final MqMessage message;
+        private final MqSingleShotInbox inbox;
+
+        CrawlRequest(CrawlPlan plan, MqMessage message, MqSingleShotInbox inbox) {
+            this.plan = plan;
+            this.message = message;
+            this.inbox = inbox;
+        }
+
+        public CrawlPlan getPlan() {
+            return plan;
+        }
+
+        public void ok() {
+            inbox.sendResponse(message, MqInboxResponse.ok());
+        }
+        public void err() {
+            inbox.sendResponse(message, MqInboxResponse.err());
+        }
+
+    }
+
+    private CrawlRequest fetchInstructions() throws Exception {
+
+        var inbox = messageQueueFactory.createSingleShotInbox(CRAWLER_INBOX, UUID.randomUUID());
+
+        var msgOpt = getMessage(inbox, nu.marginalia.mqapi.crawling.CrawlRequest.class.getSimpleName());
+        var msg = msgOpt.orElseThrow(() -> new RuntimeException("No message received"));
+
+        var request = gson.fromJson(msg.payload(), nu.marginalia.mqapi.crawling.CrawlRequest.class);
+
+        var specData = fileStorageService.getStorage(request.specStorage);
+        var crawlData = fileStorageService.getStorage(request.crawlStorage);
+
+        var plan = new CrawlPlan(specData.asPath().resolve("crawler.spec").toString(),
+                new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"),
+                null);
+
+        return new CrawlRequest(plan, msg, inbox);
+    }
+
+    private Optional<MqMessage> getMessage(MqSingleShotInbox inbox, String expectedFunction) throws SQLException, InterruptedException {
+        var opt = inbox.waitForMessage(30, TimeUnit.SECONDS);
+        if (opt.isPresent()) {
+            if (!opt.get().function().equals(expectedFunction)) {
+                throw new RuntimeException("Unexpected function: " + opt.get().function());
+            }
+            return opt;
+        }
+        else {
+            var stolenMessage = inbox.stealMessage(msg -> msg.function().equals(expectedFunction));
+            stolenMessage.ifPresent(mqMessage -> logger.info("Stole message {}", mqMessage));
+            return stolenMessage;
+        }
+    }
+
+
     public void close() throws Exception {
         logger.info("Awaiting termination");
         pool.shutdown();

@@ -176,8 +291,6 @@ public class CrawlerMain implements AutoCloseable {
 
         workLog.close();
         dispatcher.executorService().shutdownNow();
-
-
     }
 
 }

@@ -0,0 +1,24 @@
+package nu.marginalia.crawl;
+
+import com.google.gson.Gson;
+import com.google.inject.AbstractModule;
+import lombok.SneakyThrows;
+import nu.marginalia.ProcessConfiguration;
+import nu.marginalia.UserAgent;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.model.gson.GsonFactory;
+
+import java.util.UUID;
+
+public class CrawlerModule extends AbstractModule {
+    @SneakyThrows
+    public void configure() {
+        bind(Gson.class).toInstance(createGson());
+        bind(UserAgent.class).toInstance(WmsaHome.getUserAgent());
+        bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration("crawler", 0, UUID.randomUUID()));
+    }
+
+    private Gson createGson() {
+        return GsonFactory.get();
+    }
+}

@@ -0,0 +1,123 @@
+package nu.marginalia.crawl.retreival;
+
+import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.model.EdgeUrl;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.net.URISyntaxException;
+import java.util.*;
+import java.util.stream.Collectors;
+
+/** A reference to a domain that has been crawled before. */
+public class CrawlDataReference {
+    private final Logger logger = LoggerFactory.getLogger(CrawlDataReference.class);
+    final Map<EdgeUrl, CrawledDocument> documents;
+    final Map<EdgeUrl, String> etags;
+    final Map<EdgeUrl, String> lastModified;
+    final Set<EdgeUrl> previouslyDeadUrls = new HashSet<>();
+
+    CrawlDataReference(CrawledDomain referenceDomain) {
+
+        if (referenceDomain == null || referenceDomain.doc == null) {
+            documents = Collections.emptyMap();
+            etags = Collections.emptyMap();
+            lastModified = Collections.emptyMap();
+            return;
+        }
+
+        documents = new HashMap<>(referenceDomain.doc.size());
+        etags = new HashMap<>(referenceDomain.doc.size());
+        lastModified = new HashMap<>(referenceDomain.doc.size());
+
+        for (var doc : referenceDomain.doc) {
+            try {
+                addReference(doc);
+            } catch (URISyntaxException ex) {
+                logger.warn("Failed to add reference document {}", doc.url);
+            }
+        }
+    }
+
+    private void addReference(CrawledDocument doc) throws URISyntaxException {
+        var url = new EdgeUrl(doc.url);
+
+        if (doc.httpStatus == 404) {
+            previouslyDeadUrls.add(url);
+            return;
+        }
+
+        if (doc.httpStatus != 200) {
+            return;
+        }
+
+
+        documents.put(url, doc);
+
+        String headers = doc.headers;
+        if (headers != null) {
+            String[] headersLines = headers.split("\n");
+
+            String lastmod = null;
+            String etag = null;
+
+            for (String line : headersLines) {
+                if (line.toLowerCase().startsWith("etag:")) {
+                    etag = line.substring(5).trim();
+                }
+                if (line.toLowerCase().startsWith("last-modified:")) {
+                    lastmod = line.substring(14).trim();
+                }
+            }
+
+            if (lastmod != null) {
+                lastModified.put(url, lastmod);
+            }
+            if (etag != null) {
+                etags.put(url, etag);
+            }
+        }
+    }
+
+    public boolean isPreviouslyDead(EdgeUrl url) {
+        return previouslyDeadUrls.contains(url);
+    }
+    public int size() {
+        return documents.size();
+    }
+
+    public String getEtag(EdgeUrl url) {
+        return etags.get(url);
+    }
+
+    public String getLastModified(EdgeUrl url) {
+        return lastModified.get(url);
+    }
+
+    public Map<EdgeUrl, CrawledDocument> allDocuments() {
+        return documents;
+    }
+
+
+    public Map<EdgeUrl, CrawledDocument> sample(int sampleSize) {
+        return documents.entrySet().stream().limit(sampleSize).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
+    }
+
+    public void evict() {
+        documents.clear();
+        etags.clear();
+        lastModified.clear();
+    }
+
+    public CrawledDocument getDoc(EdgeUrl top) {
+        return documents.get(top);
+    }
+
+    // This bit of manual housekeeping is needed to keep the memory footprint low
+    public void dispose(EdgeUrl url) {
+        documents.remove(url);
+        etags.remove(url);
+        lastModified.remove(url);
+    }
+}

@@ -10,6 +10,7 @@ import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.crawling.model.*;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
+import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import org.jsoup.Jsoup;

@@ -57,6 +58,7 @@ public class CrawlerRetreiver {
     private final SitemapRetriever sitemapRetriever;
     private final DomainCrawlFrontier crawlFrontier;
 
+    private final CrawlDataReference oldCrawlData;
 
     int errorCount = 0;
 

@@ -64,6 +66,7 @@ public class CrawlerRetreiver {
                             CrawlingSpecification specs,
                             Consumer<SerializableCrawlData> writer) {
         this.fetcher = fetcher;
+        this.oldCrawlData = new CrawlDataReference(specs.oldData);
 
         id = specs.id;
         domain = specs.domain;

@@ -73,9 +76,9 @@ public class CrawlerRetreiver {
         this.crawlFrontier = new DomainCrawlFrontier(new EdgeDomain(domain), specs.urls, specs.crawlDepth);
         sitemapRetriever = fetcher.createSitemapRetriever();
 
+        // We must always crawl the index page first, this is assumed when fingerprinting the server
         var fst = crawlFrontier.peek();
         if (fst != null) {
-
             // Ensure the index page is always crawled
             var root = fst.withPathAndParam("/", null);
 

@@ -141,6 +144,29 @@ public class CrawlerRetreiver {
 
         var robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain);
 
+        CrawlDataComparison comparison = compareWithOldData(robotsRules);
+        logger.info("Comparison result for {} : {}", domain, comparison);
+
+        // If we have reference data, we will always grow the crawl depth a bit
+        if (oldCrawlData.size() > 0) {
+            crawlFrontier.increaseDepth(1.5);
+        }
+
+        // When the reference data doesn't appear to have changed, we'll forego
+        // re-fetching it and just use the old data
+        if (comparison == CrawlDataComparison.NO_CHANGES) {
+            oldCrawlData.allDocuments().forEach((url, doc) -> {
+                if (crawlFrontier.addVisited(url)) {
+                    doc.recrawlState = "RETAINED";
+                    crawledDomainWriter.accept(doc);
+                }
+            });
+
+            // We don't need to hold onto this in RAM anymore
+            oldCrawlData.evict();
+        }
+
+
         downloadSitemaps(robotsRules);
         sniffRootDocument();
 
@@ -161,18 +187,31 @@ public class CrawlerRetreiver {
                 continue;
             }
 
+            // Don't re-fetch links that were previously found dead as it's very unlikely that a
+            // 404:ing link will suddenly start working at a later point
+            if (oldCrawlData.isPreviouslyDead(top))
+                continue;
+
+            // Check the link filter if the endpoint should be fetched based on site-type
             if (!crawlFrontier.filterLink(top))
                 continue;
 
+            // Check vs blocklist
             if (urlBlocklist.isUrlBlocked(top))
                 continue;
 
             if (!isAllowedProtocol(top.proto))
                 continue;
 
+            // Check if the URL is too long to insert into the DB
             if (top.toString().length() > 255)
                 continue;
 
             if (!crawlFrontier.addVisited(top))
                 continue;
 
-            if (fetchDocument(top, crawlDelay)) {
+            if (fetchDocument(top, crawlDelay).isPresent()) {
                 fetchedCount++;
             }
         }

@@ -184,6 +223,76 @@ public class CrawlerRetreiver {
         return fetchedCount;
     }
 
+    private CrawlDataComparison compareWithOldData(SimpleRobotRules robotsRules) {
+
+        int numGoodDocuments = oldCrawlData.size();
+
+        if (numGoodDocuments == 0)
+            return CrawlDataComparison.NO_OLD_DATA;
+
+        if (numGoodDocuments < 10)
+            return CrawlDataComparison.SMALL_SAMPLE;
+
+        // We fetch a sample of the data to assess how much it has changed
+        int sampleSize = (int) Math.min(20, 0.25 * numGoodDocuments);
+        Map<EdgeUrl, CrawledDocument> referenceUrls = oldCrawlData.sample(sampleSize);
+
+        int differences = 0;
+
+        long crawlDelay = robotsRules.getCrawlDelay();
+        for (var url : referenceUrls.keySet()) {
+
+            var docMaybe = fetchDocument(url, crawlDelay);
+            if (docMaybe.isEmpty()) {
+                differences++;
+                continue;
+            }
+
+            var newDoc = docMaybe.get();
+            var referenceDoc = referenceUrls.get(url);
+
+            // This looks like a bug but it is not, we want to compare references
+            // to detect if the page has bounced off etag or last-modified headers
+            // to avoid having to do a full content comparison
+            if (newDoc == referenceDoc)
+                continue;
+
+            if (newDoc.httpStatus != referenceDoc.httpStatus) {
+                differences++;
+                continue;
+            }
+
+            if (newDoc.documentBody == null) {
+                differences++;
+                continue;
+            }
+
+            long referenceLsh = hashDoc(referenceDoc);
+            long newLsh = hashDoc(newDoc);
+
+            if (EasyLSH.hammingDistance(referenceLsh, newLsh) > 5) {
+                differences++;
+            }
+        }
+        if (differences > sampleSize/4) {
+            return CrawlDataComparison.CHANGES_FOUND;
+        }
+        else {
+            return CrawlDataComparison.NO_CHANGES;
+        }
+    }
+
+    private static final HashFunction hasher = Hashing.murmur3_128(0);
+    private long hashDoc(CrawledDocument doc) {
+        var hash = new EasyLSH();
+        long val = 0;
+        for (var b : doc.documentBody.decode().getBytes()) {
+            val = val << 8 | (b & 0xFF);
+            hash.addUnordered(hasher.hashLong(val).asLong());
+        }
+        return hash.get();
+    }
+
+
     private void downloadSitemaps(SimpleRobotRules robotsRules) {
         List<String> sitemaps = robotsRules.getSitemaps();

@@ -235,7 +344,7 @@ public class CrawlerRetreiver {
         try {
             logger.debug("Configuring link filter");
 
-            var url = crawlFrontier.peek();
+            var url = crawlFrontier.peek().withPathAndParam("/", null);
 
             var maybeSample = fetchUrl(url).filter(sample -> sample.httpStatus == 200);
             if (maybeSample.isEmpty())

@@ -273,7 +382,7 @@ public class CrawlerRetreiver {
         }
     }
 
-    private boolean fetchDocument(EdgeUrl top, long crawlDelay) {
+    private Optional<CrawledDocument> fetchDocument(EdgeUrl top, long crawlDelay) {
         logger.debug("Fetching {}", top);
 
         long startTime = System.currentTimeMillis();

@@ -282,9 +391,14 @@ public class CrawlerRetreiver {
         if (doc.isPresent()) {
             var d = doc.get();
             crawledDomainWriter.accept(d);
+            oldCrawlData.dispose(top);
 
             if (d.url != null) {
-                EdgeUrl.parse(d.url).ifPresent(crawlFrontier::addVisited);
+                // We may have redirected to a different path
+                EdgeUrl.parse(d.url).ifPresent(url -> {
+                    crawlFrontier.addVisited(url);
+                    oldCrawlData.dispose(url);
+                });
             }
 
             if ("ERROR".equals(d.crawlerStatus) && d.httpStatus != 404) {

@@ -296,7 +410,7 @@ public class CrawlerRetreiver {
         long crawledTime = System.currentTimeMillis() - startTime;
         delay(crawlDelay, crawledTime);
 
-        return doc.isPresent();
+        return doc;
     }
 
     private boolean isAllowedProtocol(String proto) {

@@ -333,7 +447,20 @@ public class CrawlerRetreiver {
     private CrawledDocument fetchContent(EdgeUrl top) {
         for (int i = 0; i < 2; i++) {
             try {
-                return fetcher.fetchContent(top);
+                var doc = fetcher.fetchContent(top, oldCrawlData.getEtag(top), oldCrawlData.getLastModified(top));
+
+                doc.recrawlState = "NEW";
+
+                if (doc.httpStatus == 304) {
+                    var referenceData = oldCrawlData.getDoc(top);
+                    if (referenceData != null) {
+                        referenceData.recrawlState = "304/UNCHANGED";
+                        return referenceData;
+                    }
+                }
+
+
+                return doc;
             }
             catch (RateLimitException ex) {
                 slowDown = true;

@@ -443,4 +570,12 @@ public class CrawlerRetreiver {
                 .build();
     }
 
+
+    enum CrawlDataComparison {
+        NO_OLD_DATA,
+        SMALL_SAMPLE,
+        CHANGES_FOUND,
+        NO_CHANGES
+    };
+
 }

@ -17,7 +17,7 @@ public class DomainCrawlFrontier {
|
|||||||
|
|
||||||
private Predicate<EdgeUrl> linkFilter = url -> true;
|
private Predicate<EdgeUrl> linkFilter = url -> true;
|
||||||
|
|
||||||
final int depth;
|
private int depth;
|
||||||
|
|
||||||
public DomainCrawlFrontier(EdgeDomain thisDomain, Collection<String> urls, int depth) {
|
public DomainCrawlFrontier(EdgeDomain thisDomain, Collection<String> urls, int depth) {
|
||||||
this.thisDomain = thisDomain;
|
this.thisDomain = thisDomain;
|
||||||
@ -32,6 +32,9 @@ public class DomainCrawlFrontier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void increaseDepth(double depthIncreaseFactor) {
|
||||||
|
depth = (int)(depth * depthIncreaseFactor);
|
||||||
|
}
|
||||||
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
|
public void setLinkFilter(Predicate<EdgeUrl> linkFilter) {
|
||||||
this.linkFilter = linkFilter;
|
this.linkFilter = linkFilter;
|
||||||
}
|
}
|
||||||
@ -80,6 +83,9 @@ public class DomainCrawlFrontier {
|
|||||||
if (queue.size() + visited.size() >= depth + 100)
|
if (queue.size() + visited.size() >= depth + 100)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
if (visited.contains(url.toString()))
|
||||||
|
return;
|
||||||
|
|
||||||
if (known.add(url.toString())) {
|
if (known.add(url.toString())) {
|
||||||
queue.addLast(url);
|
queue.addLast(url);
|
||||||
}
|
}
|
||||||
|
@ -18,7 +18,7 @@ public interface HttpFetcher {
|
|||||||
|
|
||||||
FetchResult probeDomain(EdgeUrl url);
|
FetchResult probeDomain(EdgeUrl url);
|
||||||
|
|
||||||
CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException;
|
CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException;
|
||||||
|
|
||||||
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
|
SimpleRobotRules fetchRobotRules(EdgeDomain domain);
|
||||||
|
|
||||||
|
@ -125,29 +125,20 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Request createHeadRequest(EdgeUrl url) {
|
|
||||||
return new Request.Builder().head().addHeader("User-agent", userAgent)
|
|
||||||
.url(url.toString())
|
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Request createGetRequest(EdgeUrl url) {
|
|
||||||
return new Request.Builder().get().addHeader("User-agent", userAgent)
|
|
||||||
.url(url.toString())
|
|
||||||
.addHeader("Accept-Encoding", "gzip")
|
|
||||||
.build();
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
public CrawledDocument fetchContent(EdgeUrl url) throws RateLimitException {
|
public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastMod) throws RateLimitException {
|
||||||
|
|
||||||
if (contentTypeLogic.isUrlLikeBinary(url)) {
|
if (contentTypeLogic.isUrlLikeBinary(url)) {
|
||||||
logger.debug("Probing suspected binary {}", url);
|
logger.debug("Probing suspected binary {}", url);
|
||||||
|
|
||||||
var head = createHeadRequest(url);
|
var headBuilder = new Request.Builder().head()
|
||||||
|
.addHeader("User-agent", userAgent)
|
||||||
|
.url(url.toString())
|
||||||
|
.addHeader("Accept-Encoding", "gzip");
|
||||||
|
|
||||||
|
var head = headBuilder.build();
|
||||||
var call = client.newCall(head);
|
var call = client.newCall(head);
|
||||||
|
|
||||||
try (var rsp = call.execute()) {
|
try (var rsp = call.execute()) {
|
||||||
@ -165,7 +156,15 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var get = createGetRequest(url);
|
var getBuilder = new Request.Builder().get();
|
||||||
|
getBuilder.addHeader("User-agent", userAgent)
|
||||||
|
.url(url.toString())
|
||||||
|
.addHeader("Accept-Encoding", "gzip");
|
||||||
|
|
||||||
|
if (etag != null) getBuilder.addHeader("If-None-Match", etag);
|
||||||
|
if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod);
|
||||||
|
|
||||||
|
var get = getBuilder.build();
|
||||||
var call = client.newCall(get);
|
var call = client.newCall(get);
|
||||||
|
|
||||||
try (var rsp = call.execute()) {
|
try (var rsp = call.execute()) {
|
||||||
@ -315,7 +314,7 @@ public class HttpFetcherImpl implements HttpFetcher {
|
|||||||
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
|
private Optional<SimpleRobotRules> fetchRobotsForProto(String proto, EdgeDomain domain) {
|
||||||
try {
|
try {
|
||||||
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
var url = new EdgeUrl(proto, domain, null, "/robots.txt", null);
|
||||||
return Optional.of(parseRobotsTxt(fetchContent(url)));
|
return Optional.of(parseRobotsTxt(fetchContent(url, null, null)));
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
|
@@ -29,14 +29,14 @@ class HttpFetcherTest {
     @Test
     void fetchUTF8() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"));
+        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu"), null, null);
         System.out.println(str.contentType);
     }
 
     @Test
     void fetchText() throws URISyntaxException, RateLimitException {
         var fetcher = new HttpFetcherImpl("nu.marginalia.edge-crawler");
-        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"));
+        var str = fetcher.fetchContent(new EdgeUrl("https://www.marginalia.nu/robots.txt"), null, null);
         System.out.println(str);
     }
 }
@@ -33,7 +33,6 @@ public class CrawlerMockFetcherTest {
 
     Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
     HttpFetcher fetcherMock = new MockFetcher();
-    SitemapRetriever sitemapRetriever = new SitemapRetriever();
 
     @AfterEach
     public void tearDown() {
@@ -74,7 +73,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/c/startrek"), "mock-crawl-data/lemmy/c_startrek.html");
         registerUrlClasspathData(new EdgeUrl("https://startrek.website/post/108995"), "mock-crawl-data/lemmy/108995.html");
 
-        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>()), out::add)
+        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "startrek.website", new ArrayList<>(), null), out::add)
                 .withNoDelay()
                 .fetch();
 
@@ -87,7 +86,7 @@ public class CrawlerMockFetcherTest {
 
         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
 
-        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>()), out::add)
+        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 10, "en.wikipedia.org", new ArrayList<>(), null), out::add)
                 .withNoDelay()
                 .fetch();
 
@@ -102,7 +101,7 @@ public class CrawlerMockFetcherTest {
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/telegram-channel-to-idle-on/3501"), "mock-crawl-data/discourse/telegram.html");
         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/t/combined-mode-but-grid/4489"), "mock-crawl-data/discourse/grid.html");
 
-        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>()), out::add)
+        new CrawlerRetreiver(fetcherMock, new CrawlingSpecification("1", 100, "community.tt-rss.org", new ArrayList<>(), null), out::add)
                 .withNoDelay()
                 .fetch();
 
@@ -127,7 +126,7 @@ public class CrawlerMockFetcherTest {
         }
 
         @Override
-        public CrawledDocument fetchContent(EdgeUrl url) {
+        public CrawledDocument fetchContent(EdgeUrl url, String etag, String lastModified) {
            logger.info("Fetching {}", url);
            if (mockData.containsKey(url)) {
                return mockData.get(url);
@@ -6,12 +6,15 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.crawling.model.SerializableCrawlData;
 import org.junit.jupiter.api.*;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.stream.Collectors;
 
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -95,4 +98,36 @@ class CrawlerRetreiverTest {
         );
     }
 
+    @Test
+    public void testRecrawl() {
+
+        var specs = CrawlingSpecification
+                .builder()
+                .id("whatever")
+                .crawlDepth(12)
+                .domain("www.marginalia.nu")
+                .urls(List.of("https://www.marginalia.nu/some-dead-link"))
+                .build();
+
+
+        Map<Class<? extends SerializableCrawlData>, List<SerializableCrawlData>> data = new HashMap<>();
+
+        new CrawlerRetreiver(httpFetcher, specs, d -> {
+            data.computeIfAbsent(d.getClass(), k->new ArrayList<>()).add(d);
+            if (d instanceof CrawledDocument doc) {
+                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
+            }
+        }).fetch();
+
+        CrawledDomain domain = (CrawledDomain) data.get(CrawledDomain.class).get(0);
+        domain.doc = data.get(CrawledDocument.class).stream().map(CrawledDocument.class::cast).collect(Collectors.toList());
+
+        var newSpec = specs.withOldData(domain);
+
+        new CrawlerRetreiver(httpFetcher, newSpec, d -> {
+            if (d instanceof CrawledDocument doc) {
+                System.out.println(doc.url + ": " + doc.recrawlState + "\t" + doc.httpStatus);
+            }
+        }).fetch();
+    }
 }
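The new testRecrawl only prints each document's recrawl state and HTTP status. A rough follow-up that tallies the second pass instead of relying on console output could look like the fragment below; it reuses the test's own data map and the CrawledDocument fields printed above, but the concrete recrawlState values are not shown in this change, so the grouping keys are kept generic:

    // Continuation of the test sketch above: summarize the collected documents.
    var docs = data.get(CrawledDocument.class).stream()
            .map(CrawledDocument.class::cast)
            .toList();

    // String.valueOf guards against documents where recrawlState is unset.
    var byRecrawlState = docs.stream()
            .collect(Collectors.groupingBy(doc -> String.valueOf(doc.recrawlState), Collectors.counting()));
    var byHttpStatus = docs.stream()
            .collect(Collectors.groupingBy(doc -> String.valueOf(doc.httpStatus), Collectors.counting()));

    System.out.println("By recrawl state: " + byRecrawlState);
    System.out.println("By HTTP status:   " + byHttpStatus);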
@@ -82,6 +82,8 @@ public class ControlService extends Service {
         Spark.post("/public/fsms/:fsm/start", controlActorService::startFsm, redirectToProcesses);
         Spark.post("/public/fsms/:fsm/stop", controlActorService::stopFsm, redirectToProcesses);
 
+        Spark.post("/public/storage/:fid/crawl", controlActorService::triggerCrawling, redirectToProcesses);
+        Spark.post("/public/storage/:fid/recrawl", controlActorService::triggerRecrawling, redirectToProcesses);
         Spark.post("/public/storage/:fid/process", controlActorService::triggerProcessing, redirectToProcesses);
         Spark.post("/public/storage/:fid/load", controlActorService::loadProcessedData, redirectToProcesses);
 
@@ -4,6 +4,8 @@ import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import lombok.SneakyThrows;
+import nu.marginalia.control.actor.task.CrawlActor;
+import nu.marginalia.control.actor.task.RecrawlActor;
 import nu.marginalia.control.model.Actor;
 import nu.marginalia.control.actor.monitor.*;
 import nu.marginalia.control.actor.monitor.ConverterMonitorActor;
@@ -22,6 +24,7 @@ import java.util.Map;
 import java.util.UUID;
 import java.util.stream.Collectors;
 
+/** This class is responsible for starting and stopping the various actors in the controller service */
 @Singleton
 public class ControlActors {
     private final ServiceEventLog eventLog;
@@ -35,7 +38,10 @@ public class ControlActors {
                          GsonFactory gsonFactory,
                          BaseServiceParams baseServiceParams,
                          ReconvertAndLoadActor reconvertAndLoadActor,
+                         CrawlActor crawlActor,
+                         RecrawlActor recrawlActor,
                          ConverterMonitorActor converterMonitorFSM,
+                         CrawlerMonitorActor crawlerMonitorActor,
                          LoaderMonitorActor loaderMonitor,
                          MessageQueueMonitorActor messageQueueMonitor,
                          ProcessLivenessMonitorActor processMonitorFSM,
@@ -45,9 +51,12 @@ public class ControlActors {
         this.eventLog = baseServiceParams.eventLog;
         this.gson = gsonFactory.get();
 
+        register(Actor.CRAWL, crawlActor);
+        register(Actor.RECRAWL, recrawlActor);
         register(Actor.RECONVERT_LOAD, reconvertAndLoadActor);
         register(Actor.CONVERTER_MONITOR, converterMonitorFSM);
         register(Actor.LOADER_MONITOR, loaderMonitor);
+        register(Actor.CRAWLER_MONITOR, crawlerMonitorActor);
         register(Actor.MESSAGE_QUEUE_MONITOR, messageQueueMonitor);
         register(Actor.PROCESS_LIVENESS_MONITOR, processMonitorFSM);
         register(Actor.FILE_STORAGE_MONITOR, fileStorageMonitorActor);
@@ -100,9 +109,6 @@ public class ControlActors {
                 Map.Entry::getKey, e -> e.getValue().getState())
         );
     }
-    public MachineState getActorStates(Actor actor) {
-        return stateMachines.get(actor).getState();
-    }
 
     public AbstractStateGraph getActorDefinition(Actor actor) {
         return actorDefinitions.get(actor);
@@ -64,17 +64,28 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph {
             description = """
                     Monitors the inbox of the process for messages.
                     If a message is found, transition to RUN.
+                    The state takes an optional Integer parameter errorAttempts
+                    that is passed to run. errorAttempts is set to zero after
+                    a few seconds of silence.
                     """
     )
-    public void monitor() throws SQLException, InterruptedException {
+    public void monitor(Integer errorAttempts) throws SQLException, InterruptedException {
 
+        if (errorAttempts == null) {
+            errorAttempts = 0;
+        }
         for (;;) {
             var messages = persistence.eavesdrop(inboxName, 1);
 
             if (messages.isEmpty() && !processService.isRunning(processId)) {
                 TimeUnit.SECONDS.sleep(5);
+
+                if (errorAttempts > 0) { // Reset the error counter if there is silence in the inbox
+                    transition(MONITOR, 0);
+                }
+                // else continue
             } else {
-                transition(RUN);
+                transition(RUN, errorAttempts);
             }
         }
     }
@@ -87,7 +98,7 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph {
                     If the process fails, retransition to RUN up to MAX_ATTEMPTS times.
                     After MAX_ATTEMPTS at restarting the process, transition to ERROR.
                     If the process is cancelled, transition to ABORTED.
-                    If the process is successful, transition to MONITOR.
+                    If the process is successful, transition to MONITOR(errorAttempts).
                     """
     )
     public void run(Integer attempts) throws Exception {
@@ -108,7 +119,7 @@ public class AbstractProcessSpawnerActor extends AbstractStateGraph {
             transition(ABORTED);
         }
 
-        transition(MONITOR);
+        transition(MONITOR, attempts);
     }
 
     @TerminalState(name = ABORTED, description = "The process was manually aborted")
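The errorAttempts plumbing above is easier to follow as a plain loop. Below is a self-contained sketch of the same control flow, with the state machine, message queue and process service replaced by stand-ins; all names and the simulated schedule are invented for illustration, only the reset-on-silence and pass-the-counter-to-RUN behavior mirrors the actor:

    public class SpawnerLoopSketch {
        // Stand-ins for the real collaborators (hypothetical behavior)
        static boolean inboxHasMessage(int tick) { return tick == 2 || tick == 3; }
        static boolean processIsRunning()        { return false; }

        // Stand-in for the RUN state: it may retry a failing process and hands the
        // attempt count back to MONITOR, cf. transition(MONITOR, attempts) above.
        static int runProcess(int errorAttempts) {
            System.out.println("RUN entered with errorAttempts=" + errorAttempts);
            return errorAttempts + 1; // pretend one failed attempt happened
        }

        public static void main(String[] args) {
            Integer errorAttempts = null;                 // MONITOR's optional parameter
            if (errorAttempts == null) errorAttempts = 0;

            for (int tick = 0; tick < 6; tick++) {
                if (!inboxHasMessage(tick) && !processIsRunning()) {
                    // Quiet inbox: reset the counter, mirroring transition(MONITOR, 0)
                    if (errorAttempts > 0) {
                        System.out.println("tick " + tick + ": inbox quiet, resetting errorAttempts");
                        errorAttempts = 0;
                    }
                } else {
                    // Message waiting: enter RUN with the current counter,
                    // mirroring transition(RUN, errorAttempts)
                    errorAttempts = runProcess(errorAttempts);
                }
            }
        }
    }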
@@ -0,0 +1,25 @@
+package nu.marginalia.control.actor.monitor;
+
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import nu.marginalia.control.svc.ProcessService;
+import nu.marginalia.mq.persistence.MqPersistence;
+import nu.marginalia.mqapi.ProcessInboxNames;
+import nu.marginalia.mqsm.StateFactory;
+
+@Singleton
+public class CrawlerMonitorActor extends AbstractProcessSpawnerActor {
+
+    @Inject
+    public CrawlerMonitorActor(StateFactory stateFactory,
+                               MqPersistence persistence,
+                               ProcessService processService) {
+        super(stateFactory,
+                persistence,
+                processService,
+                ProcessInboxNames.CRAWLER_INBOX,
+                ProcessService.ProcessId.CRAWLER);
+    }
+
+
+}
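CrawlerMonitorActor is deliberately thin: all of the monitor/run/error machinery lives in AbstractProcessSpawnerActor, and the subclass only picks which inbox to watch and which process to spawn. A monitor for some other process would follow the same shape; the example below is purely hypothetical, and EXAMPLE_INBOX / ProcessId.EXAMPLE do not exist in the codebase:

    @Singleton
    public class ExampleMonitorActor extends AbstractProcessSpawnerActor {

        @Inject
        public ExampleMonitorActor(StateFactory stateFactory,
                                   MqPersistence persistence,
                                   ProcessService processService) {
            super(stateFactory,
                    persistence,
                    processService,
                    ProcessInboxNames.EXAMPLE_INBOX,      // hypothetical inbox constant
                    ProcessService.ProcessId.EXAMPLE);    // hypothetical process id
        }
    }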
@@ -0,0 +1,171 @@
+package nu.marginalia.control.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import lombok.AllArgsConstructor;
+import lombok.NoArgsConstructor;
+import lombok.With;
+import nu.marginalia.control.svc.ProcessOutboxFactory;
+import nu.marginalia.control.svc.ProcessService;
+import nu.marginalia.db.storage.FileStorageService;
+import nu.marginalia.db.storage.model.FileStorageBaseType;
+import nu.marginalia.db.storage.model.FileStorageId;
+import nu.marginalia.db.storage.model.FileStorageType;
+import nu.marginalia.index.client.IndexClient;
+import nu.marginalia.index.client.IndexMqEndpoints;
+import nu.marginalia.mq.MqMessage;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.converting.ConvertRequest;
+import nu.marginalia.mqapi.crawling.CrawlRequest;
+import nu.marginalia.mqapi.loading.LoadRequest;
+import nu.marginalia.mqsm.StateFactory;
+import nu.marginalia.mqsm.graph.AbstractStateGraph;
+import nu.marginalia.mqsm.graph.GraphState;
+import nu.marginalia.mqsm.graph.ResumeBehavior;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Files;
+import java.nio.file.StandardCopyOption;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+@Singleton
+public class CrawlActor extends AbstractStateGraph {
+
+    // STATES
+
+    public static final String INITIAL = "INITIAL";
+    public static final String CRAWL = "CRAWL";
+    public static final String CRAWL_WAIT = "CRAWL-WAIT";
+    public static final String END = "END";
+    private final ProcessService processService;
+    private final MqOutbox mqCrawlerOutbox;
+    private final FileStorageService storageService;
+    private final Gson gson;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+
+    @AllArgsConstructor @With @NoArgsConstructor
+    public static class Message {
+        public FileStorageId crawlSpecId = null;
+        public FileStorageId crawlStorageId = null;
+        public long crawlerMsgId = 0L;
+    };
+
+    @Inject
+    public CrawlActor(StateFactory stateFactory,
+                      ProcessService processService,
+                      ProcessOutboxFactory processOutboxFactory,
+                      FileStorageService storageService,
+                      Gson gson
+                      )
+    {
+        super(stateFactory);
+        this.processService = processService;
+        this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox();
+        this.storageService = storageService;
+        this.gson = gson;
+    }
+
+    @GraphState(name = INITIAL,
+                next = CRAWL,
+                description = """
+                        Validate the input and transition to CRAWL
+                        """)
+    public Message init(FileStorageId crawlStorageId) throws Exception {
+        if (null == crawlStorageId) {
+            error("This Actor requires a FileStorageId to be passed in as a parameter to INITIAL");
+        }
+
+        var storage = storageService.getStorage(crawlStorageId);
+
+        if (storage == null) error("Bad storage id");
+        if (storage.type() != FileStorageType.CRAWL_SPEC) error("Bad storage type " + storage.type());
+
+        return new Message().withCrawlSpecId(crawlStorageId);
+    }
+
+    @GraphState(name = CRAWL,
+                next = CRAWL_WAIT,
+                resume = ResumeBehavior.ERROR,
+                description = """
+                        Allocate a storage area for the crawled data,
+                        then send a crawl request to the crawler and transition to CRAWL_WAIT.
+                        """
+    )
+    public Message crawl(Message message) throws Exception {
+        // Create processed data area
+
+        var toCrawl = storageService.getStorage(message.crawlSpecId);
+
+        var base = storageService.getStorageBase(FileStorageBaseType.SLOW);
+        var dataArea = storageService.allocateTemporaryStorage(
+                base,
+                FileStorageType.CRAWL_DATA,
+                "crawl-data",
+                toCrawl.description());
+
+        storageService.relateFileStorages(toCrawl.id(), dataArea.id());
+
+        // Pre-send convert request
+        var request = new CrawlRequest(message.crawlSpecId, dataArea.id());
+        long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request));
+
+        return message
+                .withCrawlStorageId(dataArea.id())
+                .withCrawlerMsgId(id);
+    }
+
+    @GraphState(
+            name = CRAWL_WAIT,
+            next = END,
+            resume = ResumeBehavior.RETRY,
+            description = """
+                    Wait for the crawler to finish retreiving the data.
+                    """
+    )
+    public Message crawlerWait(Message message) throws Exception {
+        var rsp = waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, message.crawlerMsgId);
+
+        if (rsp.state() != MqMessageState.OK)
+            error("Crawler failed");
+
+        return message;
+    }
+
+
+    public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception {
+        if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) {
+            error("Process " + processId + " did not launch");
+        }
+        for (;;) {
+            try {
+                return outbox.waitResponse(id, 1, TimeUnit.SECONDS);
+            }
+            catch (TimeoutException ex) {
+                // Maybe the process died, wait a moment for it to restart
+                if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) {
+                    error("Process " + processId + " died and did not re-launch");
+                }
+            }
+        }
+    }
+
+    public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
+
+        // Wait for process to start
+        long deadline = System.currentTimeMillis() + unit.toMillis(duration);
+        while (System.currentTimeMillis() < deadline) {
+            if (processService.isRunning(processId))
+                return true;
+
+            TimeUnit.SECONDS.sleep(1);
+        }
+
+        return false;
+    }
+
+}
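From the control service's point of view the whole CrawlActor flow is driven by a single start call; the actor then validates the spec, allocates and links a CRAWL_DATA area, and parks in CRAWL_WAIT until the crawler answers on the message queue. A rough sketch of the call site, using the project's ControlActors and FileStorageId the same way the new triggerCrawling() endpoint further down does (the id is a placeholder):

    // Kick off a full crawl for an existing CRAWL_SPEC storage area.
    // "17" is a made-up id; in the control service it comes from the request path.
    controlActors.start(Actor.CRAWL, FileStorageId.parse("17"));

    // Lifecycle after that call, per the states above:
    //   INITIAL     verifies the id refers to a CRAWL_SPEC storage
    //   CRAWL       allocates a CRAWL_DATA area, relates it to the spec, and sends
    //               a CrawlRequest for the spec and the new data area to the crawler inbox
    //   CRAWL_WAIT  waits for the crawler's reply, re-waiting for the process if it
    //               dies, and errors out if the reply is not OK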
@@ -118,6 +118,8 @@ public class ReconvertAndLoadActor extends AbstractStateGraph {
         var processedArea = storageService.allocateTemporaryStorage(base, FileStorageType.PROCESSED_DATA, "processed-data",
                 "Processed Data; " + toProcess.description());
 
+        storageService.relateFileStorages(toProcess.id(), processedArea.id());
+
         // Pre-send convert request
         var request = new ConvertRequest(message.crawlStorageId, processedArea.id());
         long id = mqConverterOutbox.sendAsync(ConvertRequest.class.getSimpleName(), gson.toJson(request));
@@ -0,0 +1,185 @@
+package nu.marginalia.control.actor.task;
+
+import com.google.gson.Gson;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
+import lombok.AllArgsConstructor;
+import lombok.NoArgsConstructor;
+import lombok.With;
+import nu.marginalia.control.svc.ProcessOutboxFactory;
+import nu.marginalia.control.svc.ProcessService;
+import nu.marginalia.db.storage.FileStorageService;
+import nu.marginalia.db.storage.model.FileStorage;
+import nu.marginalia.db.storage.model.FileStorageId;
+import nu.marginalia.db.storage.model.FileStorageType;
+import nu.marginalia.index.client.IndexClient;
+import nu.marginalia.mq.MqMessage;
+import nu.marginalia.mq.MqMessageState;
+import nu.marginalia.mq.outbox.MqOutbox;
+import nu.marginalia.mqapi.crawling.CrawlRequest;
+import nu.marginalia.mqsm.StateFactory;
+import nu.marginalia.mqsm.graph.AbstractStateGraph;
+import nu.marginalia.mqsm.graph.GraphState;
+import nu.marginalia.mqsm.graph.ResumeBehavior;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.file.Files;
+import java.sql.SQLException;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+@Singleton
+public class RecrawlActor extends AbstractStateGraph {
+
+    // STATES
+
+    public static final String INITIAL = "INITIAL";
+    public static final String CRAWL = "CRAWL";
+    public static final String CRAWL_WAIT = "CRAWL-WAIT";
+    public static final String END = "END";
+    private final ProcessService processService;
+    private final MqOutbox mqCrawlerOutbox;
+    private final FileStorageService storageService;
+    private final Gson gson;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
+
+    @AllArgsConstructor @With @NoArgsConstructor
+    public static class RecrawlMessage {
+        public FileStorageId crawlSpecId = null;
+        public FileStorageId crawlStorageId = null;
+        public long crawlerMsgId = 0L;
+    };
+
+    public static RecrawlMessage recrawlFromCrawlData(FileStorageId crawlData) {
+        return new RecrawlMessage(null, crawlData, 0L);
+    }
+    public static RecrawlMessage recrawlFromCrawlDataAndCralSpec(FileStorageId crawlData, FileStorageId crawlSpec) {
+        return new RecrawlMessage(crawlSpec, crawlData, 0L);
+    }
+
+    @Inject
+    public RecrawlActor(StateFactory stateFactory,
+                        ProcessService processService,
+                        ProcessOutboxFactory processOutboxFactory,
+                        FileStorageService storageService,
+                        Gson gson
+                        )
+    {
+        super(stateFactory);
+        this.processService = processService;
+        this.mqCrawlerOutbox = processOutboxFactory.createCrawlerOutbox();
+        this.storageService = storageService;
+        this.gson = gson;
+    }
+
+    @GraphState(name = INITIAL,
+                next = CRAWL,
+                description = """
+                        Validate the input and transition to CRAWL
+                        """)
+    public RecrawlMessage init(RecrawlMessage recrawlMessage) throws Exception {
+        if (null == recrawlMessage) {
+            error("This Actor requires a message as an argument");
+        }
+
+
+        var crawlStorage = storageService.getStorage(recrawlMessage.crawlStorageId);
+        FileStorage specStorage;
+
+        if (recrawlMessage.crawlSpecId != null) {
+            specStorage = storageService.getStorage(recrawlMessage.crawlSpecId);
+        }
+        else {
+            specStorage = getSpec(crawlStorage).orElse(null);
+        }
+
+        if (specStorage == null) error("Bad storage id");
+        if (specStorage.type() != FileStorageType.CRAWL_SPEC) error("Bad storage type " + specStorage.type());
+        if (crawlStorage == null) error("Bad storage id");
+        if (crawlStorage.type() != FileStorageType.CRAWL_DATA) error("Bad storage type " + specStorage.type());
+
+        Files.deleteIfExists(crawlStorage.asPath().resolve("crawler.log"));
+
+        return recrawlMessage
+                .withCrawlSpecId(specStorage.id());
+    }
+
+    private Optional<FileStorage> getSpec(FileStorage crawlStorage) throws SQLException {
+        return storageService.getSourceFromStorage(crawlStorage)
+                .stream()
+                .filter(storage -> storage.type().equals(FileStorageType.CRAWL_SPEC))
+                .findFirst();
+    }
+
+    @GraphState(name = CRAWL,
+                next = CRAWL_WAIT,
+                resume = ResumeBehavior.ERROR,
+                description = """
+                        Send a crawl request to the crawler and transition to CRAWL_WAIT.
+                        """
+    )
+    public RecrawlMessage crawl(RecrawlMessage recrawlMessage) throws Exception {
+        // Create processed data area
+
+        var toCrawl = storageService.getStorage(recrawlMessage.crawlSpecId);
+
+        // Pre-send crawl request
+        var request = new CrawlRequest(recrawlMessage.crawlSpecId, recrawlMessage.crawlStorageId);
+        long id = mqCrawlerOutbox.sendAsync(CrawlRequest.class.getSimpleName(), gson.toJson(request));
+
+        return recrawlMessage.withCrawlerMsgId(id);
+    }
+
+    @GraphState(
+            name = CRAWL_WAIT,
+            next = END,
+            resume = ResumeBehavior.RETRY,
+            description = """
+                    Wait for the crawler to finish retreiving the data.
+                    """
+    )
+    public RecrawlMessage crawlerWait(RecrawlMessage recrawlMessage) throws Exception {
+        var rsp = waitResponse(mqCrawlerOutbox, ProcessService.ProcessId.CRAWLER, recrawlMessage.crawlerMsgId);
+
+        if (rsp.state() != MqMessageState.OK)
+            error("Crawler failed");
+
+        return recrawlMessage;
+    }
+
+
+    public MqMessage waitResponse(MqOutbox outbox, ProcessService.ProcessId processId, long id) throws Exception {
+        if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) {
+            error("Process " + processId + " did not launch");
+        }
+        for (;;) {
+            try {
+                return outbox.waitResponse(id, 1, TimeUnit.SECONDS);
+            }
+            catch (TimeoutException ex) {
+                // Maybe the process died, wait a moment for it to restart
+                if (!waitForProcess(processId, TimeUnit.SECONDS, 30)) {
+                    error("Process " + processId + " died and did not re-launch");
+                }
+            }
+        }
+    }
+
+    public boolean waitForProcess(ProcessService.ProcessId processId, TimeUnit unit, int duration) throws InterruptedException {
+
+        // Wait for process to start
+        long deadline = System.currentTimeMillis() + unit.toMillis(duration);
+        while (System.currentTimeMillis() < deadline) {
+            if (processService.isRunning(processId))
+                return true;
+
+            TimeUnit.SECONDS.sleep(1);
+        }
+
+        return false;
+    }
+
+}
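The recrawl variant is started with a message rather than a bare id, so the caller can point it at an existing CRAWL_DATA area and let the actor resolve the matching CRAWL_SPEC through the relation recorded by relateFileStorages. A sketch of the two ways to start it, mirroring the triggerRecrawling() endpoint below; the ids are placeholders:

    // Recrawl an existing crawl-data area; the crawl spec is looked up via
    // storageService.getSourceFromStorage(), as in getSpec() above.
    controlActors.start(Actor.RECRAWL,
            RecrawlActor.recrawlFromCrawlData(FileStorageId.parse("23")));

    // Or name both storage areas explicitly:
    controlActors.start(Actor.RECRAWL,
            RecrawlActor.recrawlFromCrawlDataAndCralSpec(
                    FileStorageId.parse("23"),    // CRAWL_DATA
                    FileStorageId.parse("17")));  // CRAWL_SPEC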
@@ -1,9 +1,12 @@
 package nu.marginalia.control.model;
 
 public enum Actor {
+    CRAWL,
+    RECRAWL,
     RECONVERT_LOAD,
     CONVERTER_MONITOR,
     LOADER_MONITOR,
+    CRAWLER_MONITOR,
     MESSAGE_QUEUE_MONITOR,
     PROCESS_LIVENESS_MONITOR,
     FILE_STORAGE_MONITOR
@@ -4,6 +4,13 @@ import nu.marginalia.db.storage.model.FileStorage;
 import nu.marginalia.db.storage.model.FileStorageType;
 
 public record FileStorageWithActions(FileStorage storage) {
+    public boolean isCrawlable() {
+        return storage.type() == FileStorageType.CRAWL_SPEC;
+    }
+    public boolean isRecrawlable() {
+        return storage.type() == FileStorageType.CRAWL_DATA;
+    }
+
     public boolean isLoadable() {
         return storage.type() == FileStorageType.PROCESSED_DATA;
     }
@@ -4,6 +4,7 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.control.actor.ControlActors;
 import nu.marginalia.control.actor.task.ReconvertAndLoadActor;
+import nu.marginalia.control.actor.task.RecrawlActor;
 import nu.marginalia.control.model.Actor;
 import nu.marginalia.control.model.ActorRunState;
 import nu.marginalia.control.model.ActorStateGraph;
@@ -43,16 +44,33 @@ public class ControlActorService {
         return "";
     }
 
+    public Object triggerCrawling(Request request, Response response) throws Exception {
+        controlActors.start(
+                Actor.CRAWL,
+                FileStorageId.parse(request.params("fid"))
+        );
+        return "";
+    }
+
+    public Object triggerRecrawling(Request request, Response response) throws Exception {
+        controlActors.start(
+                Actor.RECRAWL,
+                RecrawlActor.recrawlFromCrawlData(
+                        FileStorageId.parse(request.params("fid"))
+                )
+        );
+        return "";
+    }
     public Object triggerProcessing(Request request, Response response) throws Exception {
         controlActors.start(
                 Actor.RECONVERT_LOAD,
-                FileStorageId.of(Integer.parseInt(request.params("fid")))
+                FileStorageId.parse(request.params("fid"))
         );
         return "";
     }
 
     public Object loadProcessedData(Request request, Response response) throws Exception {
-        var fid = FileStorageId.of(Integer.parseInt(request.params("fid")));
+        var fid = FileStorageId.parse(request.params("fid"));
 
         // Start the FSM from the intermediate state that triggers the load
         controlActors.startFrom(
@@ -24,4 +24,8 @@ public class ProcessOutboxFactory {
     public MqOutbox createLoaderOutbox() {
         return new MqOutbox(persistence, ProcessInboxNames.LOADER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid());
     }
+
+    public MqOutbox createCrawlerOutbox() {
+        return new MqOutbox(persistence, ProcessInboxNames.CRAWLER_INBOX, params.configuration.serviceName(), params.configuration.instanceUuid());
+    }
 }
@@ -34,6 +34,11 @@
     {{#each storage}}
     <tr>
         <td>
+            {{#if isCrawlable}}
+            <form method="post" action="/storage/{{storage.id}}/crawl">
+                <button type="submit">Crawl</button>
+            </form>
+            {{/if}}
             {{#if isLoadable}}
             <form method="post" action="/storage/{{storage.id}}/load">
                 <button type="submit">Load</button>
@@ -44,6 +49,11 @@
                 <button type="submit">Process</button>
             </form>
             {{/if}}
+            {{#if isRecrawlable}}
+            <form method="post" action="/storage/{{storage.id}}/recrawl">
+                <button type="submit">Recrawl</button>
+            </form>
+            {{/if}}
             {{#if isDeletable}}
             <form method="post" action="/storage/{{storage.id}}/delete" onsubmit="return confirm('Confirm deletion of {{storage.path}}')">
                 <button type="submit">Delete</button>
@@ -31,9 +31,9 @@ public class CrawlJobSpecWriterTest {
     @Test
     public void testReadWrite() throws IOException {
         try (CrawlJobSpecWriter writer = new CrawlJobSpecWriter(tempFile)) {
-            writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c")));
-            writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d")));
-            writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b")));
+            writer.accept(new CrawlingSpecification("first",1, "test1", List.of("a", "b", "c"), null));
+            writer.accept(new CrawlingSpecification("second",1, "test2", List.of("a", "b", "c", "d"), null));
+            writer.accept(new CrawlingSpecification("third",1, "test3", List.of("a", "b"), null));
        }
 
        List<CrawlingSpecification> outputs = new ArrayList<>();
run/env/service.env (vendored)
@@ -1,3 +1,4 @@
 WMSA_HOME=run/
 CONTROL_SERVICE_OPTS="-DdistPath=/dist"
 CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15"
+CRAWLER_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15"