Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(crawler) Improved feed discovery, new domain state db per crawlset
Feed discovery is improved by probing a few likely endpoints when no feed link tag is provided. To store the feed URLs, a sqlite database is added to each crawlset that stores a simple summary of the crawl job, including any feed URLs that have been discovered. Solves issue #135.
parent 4bb71b8439
commit 895cee7004
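For orientation before the diffs: each crawlset's output directory gains a domainstate.db SQLite file next to crawler.log, and its summary table records the crawl state and any discovered feed URL per domain. Below is a minimal sketch (not part of the patch; the crawlset path and class name are made up for illustration) of how a consumer could read a discovered feed URL back out using the new DomainStateDb class introduced in this commit:

    import nu.marginalia.crawl.DomainStateDb;

    import java.nio.file.Path;

    public class ReadFeedUrlSketch {
        public static void main(String[] args) throws Exception {
            // Hypothetical location of one crawlset's output directory
            Path crawlSetDir = Path.of("/data/crawl/set-00");

            // domainstate.db is created by CrawlerMain next to crawler.log (see diff below)
            try (var db = new DomainStateDb(crawlSetDir.resolve("domainstate.db"))) {
                db.get("www.marginalia.nu").ifPresent(summary ->
                        System.out.println(summary.state() + " " + summary.feedUrl()));
            }
        }
    }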
@@ -85,7 +85,7 @@ class BTreeWriterTest {
     public void testWriteEntrySize2() throws IOException {
         BTreeContext ctx = new BTreeContext(4, 2, BTreeBlockSize.BS_64);

-        var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
+        var tempFile = Files.createTempFile("tst", "dat");

         int[] data = generateItems32(64);

@@ -7,6 +7,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -46,6 +47,7 @@ public class CrawlingThenConvertingIntegrationTest {

     private Path fileName;
     private Path fileName2;
+    private Path dbTempFile;

     @BeforeAll
     public static void setUpAll() {
@@ -63,16 +65,18 @@ public class CrawlingThenConvertingIntegrationTest {
         httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
         this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
         this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
+        this.dbTempFile = Files.createTempFile("domains", "db");
     }

     @AfterEach
     public void tearDown() throws IOException {
         Files.deleteIfExists(fileName);
         Files.deleteIfExists(fileName2);
+        Files.deleteIfExists(dbTempFile);
     }

     @Test
-    public void testInvalidDomain() throws IOException {
+    public void testInvalidDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10);

@@ -88,7 +92,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void testRedirectingDomain() throws IOException {
+    public void testRedirectingDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10);

@@ -107,7 +111,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void testBlockedDomain() throws IOException {
+    public void testBlockedDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10);

@@ -124,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void crawlSunnyDay() throws IOException {
+    public void crawlSunnyDay() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10);

         CrawledDomain domain = crawl(specs);
@@ -157,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest {


     @Test
-    public void crawlContentTypes() throws IOException {
+    public void crawlContentTypes() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10,
                 List.of(
                         "https://www.marginalia.nu/sanic.png",
@@ -195,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest {


     @Test
-    public void crawlRobotsTxt() throws IOException {
+    public void crawlRobotsTxt() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
                 List.of("https://search.marginalia.nu/search?q=hello+world")
         );
@@ -235,15 +239,17 @@ public class CrawlingThenConvertingIntegrationTest {
             return null; // unreachable
         }
     }
-    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws Exception {
         return crawl(specs, domain -> true);
     }

-    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
+        try (var recorder = new WarcRecorder(fileName);
+             var db = new DomainStateDb(dbTempFile))
+        {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
         }

         CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain(),
@@ -46,6 +46,8 @@ dependencies {

    implementation libs.notnull
    implementation libs.guava
+    implementation libs.sqlite
+
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
@@ -241,6 +241,7 @@ public class CrawlerMain extends ProcessMainClass {

         // Set up the work log and the warc archiver so we can keep track of what we've done
         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
+             DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
              AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl)
         ) {
@@ -258,6 +259,7 @@ public class CrawlerMain extends ProcessMainClass {
                         anchorTagsSource,
                         outputDir,
                         warcArchiver,
+                        domainStateDb,
                         workLog);

                 if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
@@ -299,11 +301,12 @@ public class CrawlerMain extends ProcessMainClass {
         heartbeat.start();

         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log"));
+             DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
              AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
         ) {
             var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
-            var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
+            var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
             task.run();
         }
         catch (Exception ex) {
|
|||||||
private final AnchorTagsSource anchorTagsSource;
|
private final AnchorTagsSource anchorTagsSource;
|
||||||
private final Path outputDir;
|
private final Path outputDir;
|
||||||
private final WarcArchiverIf warcArchiver;
|
private final WarcArchiverIf warcArchiver;
|
||||||
|
private final DomainStateDb domainStateDb;
|
||||||
private final WorkLog workLog;
|
private final WorkLog workLog;
|
||||||
|
|
||||||
CrawlTask(CrawlSpecRecord specification,
|
CrawlTask(CrawlSpecRecord specification,
|
||||||
AnchorTagsSource anchorTagsSource,
|
AnchorTagsSource anchorTagsSource,
|
||||||
Path outputDir,
|
Path outputDir,
|
||||||
WarcArchiverIf warcArchiver,
|
WarcArchiverIf warcArchiver,
|
||||||
|
DomainStateDb domainStateDb,
|
||||||
WorkLog workLog)
|
WorkLog workLog)
|
||||||
{
|
{
|
||||||
this.specification = specification;
|
this.specification = specification;
|
||||||
this.anchorTagsSource = anchorTagsSource;
|
this.anchorTagsSource = anchorTagsSource;
|
||||||
this.outputDir = outputDir;
|
this.outputDir = outputDir;
|
||||||
this.warcArchiver = warcArchiver;
|
this.warcArchiver = warcArchiver;
|
||||||
|
this.domainStateDb = domainStateDb;
|
||||||
this.workLog = workLog;
|
this.workLog = workLog;
|
||||||
|
|
||||||
this.domain = specification.domain();
|
this.domain = specification.domain();
|
||||||
@@ -359,7 +365,7 @@ public class CrawlerMain extends ProcessMainClass {
             }

             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
+                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
                  CrawlDataReference reference = getReference();
                 )
             {
@@ -0,0 +1,127 @@
+package nu.marginalia.crawl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.time.Instant;
+import java.util.Optional;
+
+/** Supplemental sqlite database for storing the summary of a crawl.
+ * One database exists per crawl data set.
+ * */
+public class DomainStateDb implements AutoCloseable {
+
+    private static final Logger logger = LoggerFactory.getLogger(DomainStateDb.class);
+
+    private final Connection connection;
+
+    public record SummaryRecord(
+            String domainName,
+            Instant lastUpdated,
+            String state,
+            @Nullable String stateDesc,
+            @Nullable String feedUrl
+    )
+    {
+        public static SummaryRecord forSuccess(String domainName) {
+            return new SummaryRecord(domainName, Instant.now(), "OK", null, null);
+        }
+
+        public static SummaryRecord forSuccess(String domainName, String feedUrl) {
+            return new SummaryRecord(domainName, Instant.now(), "OK", null, feedUrl);
+        }
+
+        public static SummaryRecord forError(String domainName, String state, String stateDesc) {
+            return new SummaryRecord(domainName, Instant.now(), state, stateDesc, null);
+        }
+
+        public boolean equals(Object other) {
+            if (other == this) {
+                return true;
+            }
+            if (!(other instanceof SummaryRecord(String name, Instant updated, String state1, String desc, String url))) {
+                return false;
+            }
+            return domainName.equals(name) &&
+                    lastUpdated.toEpochMilli() == updated.toEpochMilli() &&
+                    state.equals(state1) &&
+                    (stateDesc == null ? desc == null : stateDesc.equals(desc)) &&
+                    (feedUrl == null ? url == null : feedUrl.equals(url));
+        }
+
+        public int hashCode() {
+            return domainName.hashCode() + Long.hashCode(lastUpdated.toEpochMilli());
+        }
+
+    }
+
+    public DomainStateDb(Path filename) throws SQLException {
+        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
+        connection = DriverManager.getConnection(sqliteDbString);
+
+        try (var stmt = connection.createStatement()) {
+            stmt.executeUpdate("""
+                    CREATE TABLE IF NOT EXISTS summary (
+                        domain TEXT PRIMARY KEY,
+                        lastUpdatedEpochMs LONG NOT NULL,
+                        state TEXT NOT NULL,
+                        stateDesc TEXT,
+                        feedUrl TEXT
+                    )
+                    """);
+
+            stmt.execute("PRAGMA journal_mode=WAL");
+        }
+    }
+
+    @Override
+    public void close() throws SQLException {
+        connection.close();
+    }
+
+
+    public void save(SummaryRecord record) {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
+                VALUES (?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, record.domainName());
+            stmt.setLong(2, record.lastUpdated().toEpochMilli());
+            stmt.setString(3, record.state());
+            stmt.setString(4, record.stateDesc());
+            stmt.setString(5, record.feedUrl());
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            logger.error("Failed to insert summary record", e);
+        }
+    }
+
+    public Optional<SummaryRecord> get(String domainName) {
+        try (var stmt = connection.prepareStatement("""
+                SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
+                FROM summary
+                WHERE domain = ?
+                """)) {
+            stmt.setString(1, domainName);
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return Optional.of(new SummaryRecord(
+                        rs.getString("domain"),
+                        Instant.ofEpochMilli(rs.getLong("lastUpdatedEpochMs")),
+                        rs.getString("state"),
+                        rs.getString("stateDesc"),
+                        rs.getString("feedUrl")
+                ));
+            }
+        } catch (SQLException e) {
+            logger.error("Failed to get summary record", e);
+        }
+
+        return Optional.empty();
+    }
+}
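A hedged usage sketch of the new class (essentially a condensed version of the DomainStateDbTest added further down); the temp-file name, the wrapper class name, and the example feed URL are illustrative only:

    import nu.marginalia.crawl.DomainStateDb;

    import java.nio.file.Files;

    public class DomainStateDbUsageSketch {
        public static void main(String[] args) throws Exception {
            var dbFile = Files.createTempFile("domainstate", ".db");

            try (var db = new DomainStateDb(dbFile)) {
                // Store a successful crawl summary together with a discovered feed URL
                db.save(DomainStateDb.SummaryRecord.forSuccess("www.marginalia.nu", "https://www.marginalia.nu/atom.xml"));

                // Read it back; get() returns Optional.empty() for unknown domains
                System.out.println(db.get("www.marginalia.nu").orElseThrow().feedUrl());
            }
        }
    }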
@@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -16,7 +17,9 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -46,6 +49,7 @@ public class CrawlerRetreiver implements AutoCloseable {

     private final DomainProber domainProber;
     private final DomainCrawlFrontier crawlFrontier;
+    private final DomainStateDb domainStateDb;
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;

@@ -55,8 +59,10 @@ public class CrawlerRetreiver implements AutoCloseable {
     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
                             CrawlerMain.CrawlSpecRecord specs,
+                            DomainStateDb domainStateDb,
                             WarcRecorder warcRecorder)
     {
+        this.domainStateDb = domainStateDb;
         this.warcRecorder = warcRecorder;
         this.fetcher = fetcher;
         this.domainProber = domainProber;
@@ -90,8 +96,21 @@ public class CrawlerRetreiver implements AutoCloseable {
         try {
             // Do an initial domain probe to determine the root URL
             EdgeUrl rootUrl;
-            if (probeRootUrl() instanceof HttpFetcher.DomainProbeResult.Ok ok) rootUrl = ok.probedUrl();
-            else return 1;
+
+            var probeResult = probeRootUrl();
+            switch (probeResult) {
+                case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
+                    rootUrl = probedUrl; // Good track
+                }
+                case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
+                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
+                    return 1;
+                }
+                case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
+                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
+                    return 1;
+                }
+            }

             // Sleep after the initial probe, we don't have access to the robots.txt yet
             // so we don't know the crawl delay
@@ -114,7 +133,8 @@ public class CrawlerRetreiver implements AutoCloseable {

             delayTimer.waitFetchDelay(0); // initial delay after robots.txt

-            sniffRootDocument(rootUrl, delayTimer);
+            DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
+            domainStateDb.save(summaryRecord);

             // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
             if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
@@ -196,7 +216,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         return domainProbeResult;
     }

-    private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+        Optional<String> feedLink = Optional.empty();
+
         try {
             var url = rootUrl.withPathAndParam("/", null);

@@ -204,11 +226,11 @@ public class CrawlerRetreiver implements AutoCloseable {
             timer.waitFetchDelay(0);

             if (!(result instanceof HttpFetchResult.ResultOk ok))
-                return;
+                return DomainStateDb.SummaryRecord.forSuccess(domain);

             var optDoc = ok.parseDocument();
             if (optDoc.isEmpty())
-                return;
+                return DomainStateDb.SummaryRecord.forSuccess(domain);

             // Sniff the software based on the sample document
             var doc = optDoc.get();
@@ -216,7 +238,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             crawlFrontier.enqueueLinksFromDocument(url, doc);

             EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
-            Optional<EdgeUrl> sitemapUrl = Optional.empty();

             for (var link : doc.getElementsByTag("link")) {
                 String rel = link.attr("rel");
@@ -232,23 +253,33 @@ public class CrawlerRetreiver implements AutoCloseable {

                 // Grab the RSS/Atom as a sitemap if it exists
                 if (rel.equalsIgnoreCase("alternate")
-                    && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
+                    && (type.equalsIgnoreCase("application/atom+xml")
+                        || type.equalsIgnoreCase("application/atomsvc+xml")
+                        || type.equalsIgnoreCase("application/rss+xml")
+                    )) {
                     String href = link.attr("href");

-                    sitemapUrl = linkParser.parseLink(url, href)
-                            .filter(crawlFrontier::isSameDomain);
+                    feedLink = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .map(EdgeUrl::toString);
                 }
             }

-            // Download the sitemap if available exists
-            if (sitemapUrl.isPresent()) {
-                sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
+            if (feedLink.isEmpty()) {
+                feedLink = guessFeedUrl(timer);
+            }
+
+            // Download the sitemap if available
+            if (feedLink.isPresent()) {
+                sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
                 timer.waitFetchDelay(0);
             }

             // Grab the favicon if it exists
             fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
             timer.waitFetchDelay(0);

         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
|
|||||||
finally {
|
finally {
|
||||||
crawlFrontier.addVisited(rootUrl);
|
crawlFrontier.addVisited(rootUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (feedLink.isPresent()) {
|
||||||
|
return DomainStateDb.SummaryRecord.forSuccess(domain, feedLink.get());
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return DomainStateDb.SummaryRecord.forSuccess(domain);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private final List<String> likelyFeedEndpoints = List.of(
|
||||||
|
"/rss.xml",
|
||||||
|
"/atom.xml",
|
||||||
|
"/feed.xml",
|
||||||
|
"/index.xml",
|
||||||
|
"/feed",
|
||||||
|
"/rss",
|
||||||
|
"/atom",
|
||||||
|
"/feeds",
|
||||||
|
"/blog/feed",
|
||||||
|
"/blog/rss"
|
||||||
|
);
|
||||||
|
|
||||||
|
private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
|
||||||
|
var oldDomainStateRecord = domainStateDb.get(domain);
|
||||||
|
|
||||||
|
// If we are already aware of an old feed URL, then we can just revalidate it
|
||||||
|
if (oldDomainStateRecord.isPresent()) {
|
||||||
|
var oldRecord = oldDomainStateRecord.get();
|
||||||
|
if (oldRecord.feedUrl() != null && validateFeedUrl(oldRecord.feedUrl(), timer)) {
|
||||||
|
return Optional.of(oldRecord.feedUrl());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String endpoint : likelyFeedEndpoints) {
|
||||||
|
String url = "https://" + domain + "/" + endpoint;
|
||||||
|
if (validateFeedUrl(url, timer)) {
|
||||||
|
return Optional.of(url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean validateFeedUrl(String url, CrawlDelayTimer timer) throws InterruptedException {
|
||||||
|
var parsedOpt = EdgeUrl.parse(url);
|
||||||
|
if (parsedOpt.isEmpty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||||
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
|
if (!(result instanceof HttpFetchResult.ResultOk ok)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Extract the beginning of the
|
||||||
|
Optional<String> bodyOpt = DocumentBodyExtractor.asString(ok).getBody();
|
||||||
|
if (bodyOpt.isEmpty())
|
||||||
|
return false;
|
||||||
|
String body = bodyOpt.get();
|
||||||
|
body = body.substring(0, Math.min(128, body.length())).toLowerCase();
|
||||||
|
|
||||||
|
if (body.contains("<atom"))
|
||||||
|
return true;
|
||||||
|
if (body.contains("<rss"))
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
|
public HttpFetchResult fetchContentWithReference(EdgeUrl top,
|
||||||
|
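To make the validation rule above easy to see in isolation, here is a standalone sketch (not part of the patch; class and method names are made up) of the sniffing heuristic validateFeedUrl applies to a fetched body: the first 128 characters, lower-cased, must contain an opening <rss or <atom tag.

    public class FeedSniffSketch {
        // Mirrors the check in validateFeedUrl: look for <rss or <atom near the start of the body
        static boolean looksLikeFeed(String body) {
            String head = body.substring(0, Math.min(128, body.length())).toLowerCase();
            return head.contains("<atom") || head.contains("<rss");
        }

        public static void main(String[] args) {
            System.out.println(looksLikeFeed("<?xml version=\"1.0\"?><rss version=\"2.0\">"));  // true
            System.out.println(looksLikeFeed("<!DOCTYPE html><html><head><title>x</title>"));   // false
        }
    }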
@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Optional;
 import java.util.Set;

 public class SitemapFetcher {
@@ -24,26 +24,27 @@ public class SitemapFetcher {
     }

     public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
-        List<String> sitemaps = robotsRules.getSitemaps();
+        List<String> urls = robotsRules.getSitemaps();

-        List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
-        if (!sitemaps.isEmpty()) {
-            for (var url : sitemaps) {
-                EdgeUrl.parse(url).ifPresent(urls::add);
-            }
-        }
-        else {
-            urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
+        if (urls.isEmpty()) {
+            urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
         }

         downloadSitemaps(urls);
     }

-    public void downloadSitemaps(List<EdgeUrl> urls) {
+    public void downloadSitemaps(List<String> urls) {

         Set<String> checkedSitemaps = new HashSet<>();

-        for (var url : urls) {
+        for (var rawUrl : urls) {
+            Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
+            if (parsedUrl.isEmpty()) {
+                continue;
+            }
+
+            EdgeUrl url = parsedUrl.get();
+
             // Let's not download sitemaps from other domains for now
             if (!crawlFrontier.isSameDomain(url)) {
                 continue;
@@ -18,6 +18,7 @@ public class ContentTypeLogic {
             "application/xhtml",
             "application/xml",
             "application/atom+xml",
+            "application/atomsvc+xml",
             "application/rss+xml",
             "application/x-rss+xml",
             "application/rdf+xml",
@@ -23,6 +23,10 @@ public sealed interface DocumentBodyResult<T> {
             return mapper.apply(contentType, body);
         }

+        public Optional<T> getBody() {
+            return Optional.of(body);
+        }
+
         @Override
         public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
             consumer.accept(contentType, body);
@@ -41,6 +45,11 @@ public sealed interface DocumentBodyResult<T> {
             return (DocumentBodyResult<T2>) this;
         }

+        @Override
+        public Optional<T> getBody() {
+            return Optional.empty();
+        }
+
         @Override
         public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
         }
@@ -49,6 +58,7 @@ public sealed interface DocumentBodyResult<T> {
     <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
     <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
     <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
+    Optional<T> getBody();

     void ifPresent(ExConsumer<T,Exception> consumer) throws Exception;

@@ -0,0 +1,66 @@
+package nu.marginalia.crawl;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.time.Instant;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DomainStateDbTest {
+
+    Path tempFile;
+    @BeforeEach
+    void setUp() throws IOException {
+        tempFile = Files.createTempFile(getClass().getSimpleName(), ".db");
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        Files.deleteIfExists(tempFile);
+    }
+
+    @Test
+    public void testSunnyDay() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            var allFields = new DomainStateDb.SummaryRecord(
+                    "all.marginalia.nu",
+                    Instant.now(),
+                    "OK",
+                    "Bad address",
+                    "https://www.marginalia.nu/atom.xml"
+            );
+
+            var minFields = new DomainStateDb.SummaryRecord(
+                    "min.marginalia.nu",
+                    Instant.now(),
+                    "OK",
+                    null,
+                    null
+            );
+
+            db.save(allFields);
+            db.save(minFields);
+
+            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+
+            var updatedAllFields = new DomainStateDb.SummaryRecord(
+                    "all.marginalia.nu",
+                    Instant.now(),
+                    "BAD",
+                    null,
+                    null
+            );
+
+            db.save(updatedAllFields);
+            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+        }
+    }
+
+}
@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;

 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -18,6 +19,7 @@ import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.test.CommonTestData;
 import okhttp3.Headers;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;
 import org.slf4j.Logger;
@@ -25,6 +27,9 @@ import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -36,9 +41,14 @@ public class CrawlerMockFetcherTest {

     Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
     HttpFetcher fetcherMock = new MockFetcher();
+    private Path dbTempFile;

+    @BeforeEach
+    public void setUp() throws IOException {
+        dbTempFile = Files.createTempFile("domains","db");
+    }
     @AfterEach
-    public void tearDown() {
+    public void tearDown() throws IOException {
+        Files.deleteIfExists(dbTempFile);
         mockData.clear();
     }

@@ -66,15 +76,17 @@ public class CrawlerMockFetcherTest {

     }

-    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException {
-        try (var recorder = new WarcRecorder()) {
-            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
+    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException, SQLException {
+        try (var recorder = new WarcRecorder();
+             var db = new DomainStateDb(dbTempFile)
+        ) {
+            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, db, recorder)
                     .crawlDomain();
         }
     }

     @Test
-    public void testLemmy() throws URISyntaxException, IOException {
+    public void testLemmy() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
@@ -85,7 +97,7 @@ public class CrawlerMockFetcherTest {
     }

     @Test
-    public void testMediawiki() throws URISyntaxException, IOException {
+    public void testMediawiki() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
@@ -94,7 +106,7 @@ public class CrawlerMockFetcherTest {
     }

     @Test
-    public void testDiscourse() throws URISyntaxException, IOException {
+    public void testDiscourse() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
@@ -4,6 +4,7 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -25,6 +26,7 @@ import java.io.RandomAccessFile;
 import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;

@@ -39,11 +41,13 @@ class CrawlerRetreiverTest {
     Path tempFileWarc2;
     Path tempFileParquet2;
     Path tempFileWarc3;
+    Path tempFileDb;
     @BeforeEach
     public void setUp() throws IOException {
         httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
         tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
         tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
+        tempFileDb = Files.createTempFile("crawling-process", ".db");

     }

@@ -505,22 +509,26 @@ class CrawlerRetreiverTest {
     }

     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
-        try (var recorder = new WarcRecorder(tempFileWarc2)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
+        try (var recorder = new WarcRecorder(tempFileWarc2);
+             var db = new DomainStateDb(tempFileDb)
+        ) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
                     new CrawlDataReference(stream));
         }
-        catch (IOException ex) {
+        catch (IOException | SQLException ex) {
             Assertions.fail(ex);
         }
     }

     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
+        try (var recorder = new WarcRecorder(tempFileWarc1);
+             var db = new DomainStateDb(tempFileDb)
+        ) {
+            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
             crawler.crawlDomain();
             return crawler.getCrawlFrontier();
-        } catch (IOException ex) {
+        } catch (IOException| SQLException ex) {
             Assertions.fail(ex);
             return null; // unreachable
         }
     }