(crawler) Improved feed discovery, new domain state db per crawlset

Feed discovery is improved by probing a few likely endpoints when no feed link tag is provided.  To store the feed URLs, an SQLite database is added to each crawlset, holding a simple summary of the crawl job, including any feed URLs that have been discovered.
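
For illustration, a minimal sketch of how a downstream consumer might read a discovered feed URL back out of the new per-crawlset database, using the DomainStateDb API added in this commit (the crawlset directory path and the example class are hypothetical):

import nu.marginalia.crawl.DomainStateDb;

import java.nio.file.Path;
import java.sql.SQLException;

class FeedUrlLookupExample {
    public static void main(String[] args) throws SQLException {
        // Hypothetical crawlset directory; each crawlset gets its own domainstate.db
        Path crawlsetDir = Path.of("/data/crawlset-001");

        try (var db = new DomainStateDb(crawlsetDir.resolve("domainstate.db"))) {
            db.get("www.marginalia.nu")                      // Optional<SummaryRecord> for the domain
              .map(DomainStateDb.SummaryRecord::feedUrl)     // feedUrl may be null -> empty Optional
              .ifPresent(feedUrl -> System.out.println("Discovered feed: " + feedUrl));
        }
    }
}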

Solves issue #135
Viktor Lofgren 2024-12-26 15:05:52 +01:00
parent 4bb71b8439
commit 895cee7004
12 changed files with 390 additions and 52 deletions

View File

@@ -85,7 +85,7 @@ class BTreeWriterTest {
     public void testWriteEntrySize2() throws IOException {
         BTreeContext ctx = new BTreeContext(4, 2, BTreeBlockSize.BS_64);
-        var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
+        var tempFile = Files.createTempFile("tst", "dat");
         int[] data = generateItems32(64);

View File

@@ -7,6 +7,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -46,6 +47,7 @@ public class CrawlingThenConvertingIntegrationTest {
     private Path fileName;
     private Path fileName2;
+    private Path dbTempFile;

     @BeforeAll
     public static void setUpAll() {
@@ -63,16 +65,18 @@ public class CrawlingThenConvertingIntegrationTest {
         httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
         this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
         this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
+        this.dbTempFile = Files.createTempFile("domains", "db");
     }

     @AfterEach
     public void tearDown() throws IOException {
         Files.deleteIfExists(fileName);
         Files.deleteIfExists(fileName2);
+        Files.deleteIfExists(dbTempFile);
     }

     @Test
-    public void testInvalidDomain() throws IOException {
+    public void testInvalidDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10);
@@ -88,7 +92,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void testRedirectingDomain() throws IOException {
+    public void testRedirectingDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10);
@@ -107,7 +111,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void testBlockedDomain() throws IOException {
+    public void testBlockedDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10);
@@ -124,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void crawlSunnyDay() throws IOException {
+    public void crawlSunnyDay() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10);

         CrawledDomain domain = crawl(specs);
@@ -157,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
-    public void crawlContentTypes() throws IOException {
+    public void crawlContentTypes() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10,
                 List.of(
                         "https://www.marginalia.nu/sanic.png",
@@ -195,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
-    public void crawlRobotsTxt() throws IOException {
+    public void crawlRobotsTxt() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
                 List.of("https://search.marginalia.nu/search?q=hello+world")
         );
@@ -235,15 +239,17 @@ public class CrawlingThenConvertingIntegrationTest {
             return null; // unreachable
         }
     }

-    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws Exception {
         return crawl(specs, domain -> true);
     }

-    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
+        try (var recorder = new WarcRecorder(fileName);
+             var db = new DomainStateDb(dbTempFile))
+        {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
         }

         CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain(),

View File

@@ -46,6 +46,8 @@ dependencies {
     implementation libs.notnull
     implementation libs.guava
+    implementation libs.sqlite
     implementation dependencies.create(libs.guice.get()) {
         exclude group: 'com.google.guava'
     }

View File

@@ -241,6 +241,7 @@ public class CrawlerMain extends ProcessMainClass {
         // Set up the work log and the warc archiver so we can keep track of what we've done
         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
+             DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
              AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl)
         ) {
@@ -258,6 +259,7 @@ public class CrawlerMain extends ProcessMainClass {
                             anchorTagsSource,
                             outputDir,
                             warcArchiver,
+                            domainStateDb,
                             workLog);

                     if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
@@ -299,11 +301,12 @@ public class CrawlerMain extends ProcessMainClass {
             heartbeat.start();

             try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log"));
+                 DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
                  WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
                  AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
             ) {
                 var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
-                var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
+                var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
                 task.run();
             }
             catch (Exception ex) {
@@ -324,18 +327,21 @@ public class CrawlerMain extends ProcessMainClass {
         private final AnchorTagsSource anchorTagsSource;
         private final Path outputDir;
         private final WarcArchiverIf warcArchiver;
+        private final DomainStateDb domainStateDb;
         private final WorkLog workLog;

         CrawlTask(CrawlSpecRecord specification,
                   AnchorTagsSource anchorTagsSource,
                   Path outputDir,
                   WarcArchiverIf warcArchiver,
+                  DomainStateDb domainStateDb,
                   WorkLog workLog)
         {
             this.specification = specification;
             this.anchorTagsSource = anchorTagsSource;
             this.outputDir = outputDir;
             this.warcArchiver = warcArchiver;
+            this.domainStateDb = domainStateDb;
             this.workLog = workLog;

             this.domain = specification.domain();
@@ -359,7 +365,7 @@ public class CrawlerMain extends ProcessMainClass {
             }

             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
+                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
                  CrawlDataReference reference = getReference();
             )
             {

View File

@@ -0,0 +1,127 @@
package nu.marginalia.crawl;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.time.Instant;
import java.util.Optional;

/** Supplemental sqlite database for storing the summary of a crawl.
 * One database exists per crawl data set.
 * */
public class DomainStateDb implements AutoCloseable {

    private static final Logger logger = LoggerFactory.getLogger(DomainStateDb.class);

    private final Connection connection;

    public record SummaryRecord(
            String domainName,
            Instant lastUpdated,
            String state,
            @Nullable String stateDesc,
            @Nullable String feedUrl
    )
    {
        public static SummaryRecord forSuccess(String domainName) {
            return new SummaryRecord(domainName, Instant.now(), "OK", null, null);
        }

        public static SummaryRecord forSuccess(String domainName, String feedUrl) {
            return new SummaryRecord(domainName, Instant.now(), "OK", null, feedUrl);
        }

        public static SummaryRecord forError(String domainName, String state, String stateDesc) {
            return new SummaryRecord(domainName, Instant.now(), state, stateDesc, null);
        }

        public boolean equals(Object other) {
            if (other == this) {
                return true;
            }
            if (!(other instanceof SummaryRecord(String name, Instant updated, String state1, String desc, String url))) {
                return false;
            }
            return domainName.equals(name) &&
                    lastUpdated.toEpochMilli() == updated.toEpochMilli() &&
                    state.equals(state1) &&
                    (stateDesc == null ? desc == null : stateDesc.equals(desc)) &&
                    (feedUrl == null ? url == null : feedUrl.equals(url));
        }

        public int hashCode() {
            return domainName.hashCode() + Long.hashCode(lastUpdated.toEpochMilli());
        }
    }

    public DomainStateDb(Path filename) throws SQLException {
        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
        connection = DriverManager.getConnection(sqliteDbString);

        try (var stmt = connection.createStatement()) {
            stmt.executeUpdate("""
                    CREATE TABLE IF NOT EXISTS summary (
                        domain TEXT PRIMARY KEY,
                        lastUpdatedEpochMs LONG NOT NULL,
                        state TEXT NOT NULL,
                        stateDesc TEXT,
                        feedUrl TEXT
                    )
                    """);
            stmt.execute("PRAGMA journal_mode=WAL");
        }
    }

    @Override
    public void close() throws SQLException {
        connection.close();
    }

    public void save(SummaryRecord record) {
        try (var stmt = connection.prepareStatement("""
                INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
                VALUES (?, ?, ?, ?, ?)
                """)) {
            stmt.setString(1, record.domainName());
            stmt.setLong(2, record.lastUpdated().toEpochMilli());
            stmt.setString(3, record.state());
            stmt.setString(4, record.stateDesc());
            stmt.setString(5, record.feedUrl());
            stmt.executeUpdate();
        } catch (SQLException e) {
            logger.error("Failed to insert summary record", e);
        }
    }

    public Optional<SummaryRecord> get(String domainName) {
        try (var stmt = connection.prepareStatement("""
                SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
                FROM summary
                WHERE domain = ?
                """)) {
            stmt.setString(1, domainName);
            var rs = stmt.executeQuery();
            if (rs.next()) {
                return Optional.of(new SummaryRecord(
                        rs.getString("domain"),
                        Instant.ofEpochMilli(rs.getLong("lastUpdatedEpochMs")),
                        rs.getString("state"),
                        rs.getString("stateDesc"),
                        rs.getString("feedUrl")
                ));
            }
        } catch (SQLException e) {
            logger.error("Failed to get summary record", e);
        }

        return Optional.empty();
    }
}

View File

@@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -16,7 +17,9 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -46,6 +49,7 @@ public class CrawlerRetreiver implements AutoCloseable {
     private final DomainProber domainProber;
     private final DomainCrawlFrontier crawlFrontier;
+    private final DomainStateDb domainStateDb;
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;
@@ -55,8 +59,10 @@ public class CrawlerRetreiver implements AutoCloseable {
     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
                             CrawlerMain.CrawlSpecRecord specs,
+                            DomainStateDb domainStateDb,
                             WarcRecorder warcRecorder)
     {
+        this.domainStateDb = domainStateDb;
         this.warcRecorder = warcRecorder;
         this.fetcher = fetcher;
         this.domainProber = domainProber;
@@ -90,8 +96,21 @@ public class CrawlerRetreiver implements AutoCloseable {
         try {
             // Do an initial domain probe to determine the root URL
             EdgeUrl rootUrl;

-            if (probeRootUrl() instanceof HttpFetcher.DomainProbeResult.Ok ok) rootUrl = ok.probedUrl();
-            else return 1;
+            var probeResult = probeRootUrl();
+
+            switch (probeResult) {
+                case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
+                    rootUrl = probedUrl; // Good track
+                }
+                case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
+                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
+                    return 1;
+                }
+                case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
+                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
+                    return 1;
+                }
+            }

             // Sleep after the initial probe, we don't have access to the robots.txt yet
             // so we don't know the crawl delay
@@ -114,7 +133,8 @@ public class CrawlerRetreiver implements AutoCloseable {
             delayTimer.waitFetchDelay(0); // initial delay after robots.txt

-            sniffRootDocument(rootUrl, delayTimer);
+            DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
+            domainStateDb.save(summaryRecord);

             // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
             if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
@@ -196,7 +216,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         return domainProbeResult;
     }

-    private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+        Optional<String> feedLink = Optional.empty();
+
         try {
             var url = rootUrl.withPathAndParam("/", null);
@@ -204,11 +226,11 @@ public class CrawlerRetreiver implements AutoCloseable {
             timer.waitFetchDelay(0);

             if (!(result instanceof HttpFetchResult.ResultOk ok))
-                return;
+                return DomainStateDb.SummaryRecord.forSuccess(domain);

             var optDoc = ok.parseDocument();
             if (optDoc.isEmpty())
-                return;
+                return DomainStateDb.SummaryRecord.forSuccess(domain);

             // Sniff the software based on the sample document
             var doc = optDoc.get();
@@ -216,7 +238,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             crawlFrontier.enqueueLinksFromDocument(url, doc);

             EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
-            Optional<EdgeUrl> sitemapUrl = Optional.empty();

             for (var link : doc.getElementsByTag("link")) {
                 String rel = link.attr("rel");
@@ -232,23 +253,33 @@ public class CrawlerRetreiver implements AutoCloseable {
                 // Grab the RSS/Atom as a sitemap if it exists
                 if (rel.equalsIgnoreCase("alternate")
-                        && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
+                        && (type.equalsIgnoreCase("application/atom+xml")
+                        || type.equalsIgnoreCase("application/atomsvc+xml")
+                        || type.equalsIgnoreCase("application/rss+xml")
+                )) {
                     String href = link.attr("href");

-                    sitemapUrl = linkParser.parseLink(url, href)
-                            .filter(crawlFrontier::isSameDomain);
+                    feedLink = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .map(EdgeUrl::toString);
                 }
             }

-            // Download the sitemap if available exists
-            if (sitemapUrl.isPresent()) {
-                sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
+            if (feedLink.isEmpty()) {
+                feedLink = guessFeedUrl(timer);
+            }
+
+            // Download the sitemap if available
+            if (feedLink.isPresent()) {
+                sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
                 timer.waitFetchDelay(0);
             }

             // Grab the favicon if it exists
             fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
             timer.waitFetchDelay(0);
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
@@ -256,6 +287,74 @@ public class CrawlerRetreiver implements AutoCloseable {
         finally {
             crawlFrontier.addVisited(rootUrl);
         }
+
+        if (feedLink.isPresent()) {
+            return DomainStateDb.SummaryRecord.forSuccess(domain, feedLink.get());
+        }
+        else {
+            return DomainStateDb.SummaryRecord.forSuccess(domain);
+        }
+    }
+
+    private final List<String> likelyFeedEndpoints = List.of(
+            "/rss.xml",
+            "/atom.xml",
+            "/feed.xml",
+            "/index.xml",
+            "/feed",
+            "/rss",
+            "/atom",
+            "/feeds",
+            "/blog/feed",
+            "/blog/rss"
+    );
+
+    private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
+        var oldDomainStateRecord = domainStateDb.get(domain);
+
+        // If we are already aware of an old feed URL, then we can just revalidate it
+        if (oldDomainStateRecord.isPresent()) {
+            var oldRecord = oldDomainStateRecord.get();
+            if (oldRecord.feedUrl() != null && validateFeedUrl(oldRecord.feedUrl(), timer)) {
+                return Optional.of(oldRecord.feedUrl());
+            }
+        }
+
+        for (String endpoint : likelyFeedEndpoints) {
+            String url = "https://" + domain + "/" + endpoint;
+            if (validateFeedUrl(url, timer)) {
+                return Optional.of(url);
+            }
+        }
+
+        return Optional.empty();
+    }
+
+    private boolean validateFeedUrl(String url, CrawlDelayTimer timer) throws InterruptedException {
+        var parsedOpt = EdgeUrl.parse(url);
+        if (parsedOpt.isEmpty())
+            return false;
+
+        HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        timer.waitFetchDelay(0);
+
+        if (!(result instanceof HttpFetchResult.ResultOk ok)) {
+            return false;
+        }
+
+        // Extract the beginning of the
+        Optional<String> bodyOpt = DocumentBodyExtractor.asString(ok).getBody();
+        if (bodyOpt.isEmpty())
+            return false;
+
+        String body = bodyOpt.get();
+        body = body.substring(0, Math.min(128, body.length())).toLowerCase();
+
+        if (body.contains("<atom"))
+            return true;
+        if (body.contains("<rss"))
+            return true;
+
+        return false;
     }

     public HttpFetchResult fetchContentWithReference(EdgeUrl top,

View File

@@ -7,9 +7,9 @@ import nu.marginalia.model.EdgeUrl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Optional;
 import java.util.Set;

 public class SitemapFetcher {
@@ -24,26 +24,27 @@ public class SitemapFetcher {
     }

     public void downloadSitemaps(SimpleRobotRules robotsRules, EdgeUrl rootUrl) {
-        List<String> sitemaps = robotsRules.getSitemaps();
-        List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
-        if (!sitemaps.isEmpty()) {
-            for (var url : sitemaps) {
-                EdgeUrl.parse(url).ifPresent(urls::add);
-            }
-        }
-        else {
-            urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
+        List<String> urls = robotsRules.getSitemaps();
+        if (urls.isEmpty()) {
+            urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
         }

         downloadSitemaps(urls);
     }

-    public void downloadSitemaps(List<EdgeUrl> urls) {
+    public void downloadSitemaps(List<String> urls) {
         Set<String> checkedSitemaps = new HashSet<>();

-        for (var url : urls) {
+        for (var rawUrl : urls) {
+            Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
+            if (parsedUrl.isEmpty()) {
+                continue;
+            }
+
+            EdgeUrl url = parsedUrl.get();
+
             // Let's not download sitemaps from other domains for now
             if (!crawlFrontier.isSameDomain(url)) {
                 continue;

View File

@@ -18,6 +18,7 @@ public class ContentTypeLogic {
             "application/xhtml",
             "application/xml",
             "application/atom+xml",
+            "application/atomsvc+xml",
             "application/rss+xml",
             "application/x-rss+xml",
             "application/rdf+xml",

View File

@@ -23,6 +23,10 @@ public sealed interface DocumentBodyResult<T> {
             return mapper.apply(contentType, body);
         }

+        public Optional<T> getBody() {
+            return Optional.of(body);
+        }
+
         @Override
         public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
             consumer.accept(contentType, body);
@@ -41,6 +45,11 @@ public sealed interface DocumentBodyResult<T> {
             return (DocumentBodyResult<T2>) this;
         }

+        @Override
+        public Optional<T> getBody() {
+            return Optional.empty();
+        }
+
         @Override
         public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
         }
@@ -49,6 +58,7 @@ public sealed interface DocumentBodyResult<T> {
     <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
     <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
     <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
+    Optional<T> getBody();

     void ifPresent(ExConsumer<T,Exception> consumer) throws Exception;

View File

@@ -0,0 +1,66 @@
package nu.marginalia.crawl;

import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.time.Instant;

import static org.junit.jupiter.api.Assertions.assertEquals;

class DomainStateDbTest {
    Path tempFile;

    @BeforeEach
    void setUp() throws IOException {
        tempFile = Files.createTempFile(getClass().getSimpleName(), ".db");
    }

    @AfterEach
    void tearDown() throws IOException {
        Files.deleteIfExists(tempFile);
    }

    @Test
    public void testSunnyDay() throws SQLException {
        try (var db = new DomainStateDb(tempFile)) {
            var allFields = new DomainStateDb.SummaryRecord(
                    "all.marginalia.nu",
                    Instant.now(),
                    "OK",
                    "Bad address",
                    "https://www.marginalia.nu/atom.xml"
            );

            var minFields = new DomainStateDb.SummaryRecord(
                    "min.marginalia.nu",
                    Instant.now(),
                    "OK",
                    null,
                    null
            );

            db.save(allFields);
            db.save(minFields);

            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());

            var updatedAllFields = new DomainStateDb.SummaryRecord(
                    "all.marginalia.nu",
                    Instant.now(),
                    "BAD",
                    null,
                    null
            );

            db.save(updatedAllFields);
            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
        }
    }
}

View File

@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -18,6 +19,7 @@ import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.test.CommonTestData;
 import okhttp3.Headers;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;
 import org.slf4j.Logger;
@@ -25,6 +27,9 @@ import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -36,9 +41,14 @@ public class CrawlerMockFetcherTest {

     Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
     HttpFetcher fetcherMock = new MockFetcher();
+    private Path dbTempFile;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        dbTempFile = Files.createTempFile("domains","db");
+    }

     @AfterEach
-    public void tearDown() {
+    public void tearDown() throws IOException {
+        Files.deleteIfExists(dbTempFile);
         mockData.clear();
     }
@@ -66,15 +76,17 @@ public class CrawlerMockFetcherTest {
     }

-    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException {
-        try (var recorder = new WarcRecorder()) {
-            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
+    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException, SQLException {
+        try (var recorder = new WarcRecorder();
+             var db = new DomainStateDb(dbTempFile)
+        ) {
+            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, db, recorder)
                     .crawlDomain();
         }
     }

     @Test
-    public void testLemmy() throws URISyntaxException, IOException {
+    public void testLemmy() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
@@ -85,7 +97,7 @@ public class CrawlerMockFetcherTest {
     }

     @Test
-    public void testMediawiki() throws URISyntaxException, IOException {
+    public void testMediawiki() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
@@ -94,7 +106,7 @@ public class CrawlerMockFetcherTest {
     }

     @Test
-    public void testDiscourse() throws URISyntaxException, IOException {
+    public void testDiscourse() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");

View File

@@ -4,6 +4,7 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -25,6 +26,7 @@ import java.io.RandomAccessFile;
 import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;
@@ -39,11 +41,13 @@ class CrawlerRetreiverTest {
     Path tempFileWarc2;
     Path tempFileParquet2;
     Path tempFileWarc3;
+    Path tempFileDb;

     @BeforeEach
     public void setUp() throws IOException {
         httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
         tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
         tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
+        tempFileDb = Files.createTempFile("crawling-process", ".db");
     }
@@ -505,22 +509,26 @@ class CrawlerRetreiverTest {
     }

     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
-        try (var recorder = new WarcRecorder(tempFileWarc2)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
+        try (var recorder = new WarcRecorder(tempFileWarc2);
+             var db = new DomainStateDb(tempFileDb)
+        ) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
                     new CrawlDataReference(stream));
         }
-        catch (IOException ex) {
+        catch (IOException | SQLException ex) {
            Assertions.fail(ex);
        }
     }

     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
+        try (var recorder = new WarcRecorder(tempFileWarc1);
+             var db = new DomainStateDb(tempFileDb)
+        ) {
+            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
             crawler.crawlDomain();
             return crawler.getCrawlFrontier();
-        } catch (IOException ex) {
+        } catch (IOException| SQLException ex) {
             Assertions.fail(ex);
             return null; // unreachable
         }