diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java
index dcb802b7..2cad98fc 100644
--- a/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java
+++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java
@@ -85,7 +85,7 @@ class BTreeWriterTest {
     public void testWriteEntrySize2() throws IOException {
         BTreeContext ctx = new BTreeContext(4, 2, BTreeBlockSize.BS_64);

-        var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat");
+        var tempFile = Files.createTempFile("tst", "dat");

         int[] data = generateItems32(64);

diff --git a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index e32bdbf6..9f4fe2f9 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -7,6 +7,7 @@ import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -46,6 +47,7 @@ public class CrawlingThenConvertingIntegrationTest {

     private Path fileName;
     private Path fileName2;
+    private Path dbTempFile;

     @BeforeAll
     public static void setUpAll() {
@@ -63,16 +65,18 @@ public class CrawlingThenConvertingIntegrationTest {
         httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString());
         this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz");
         this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz");
+        this.dbTempFile = Files.createTempFile("domains", "db");
     }

     @AfterEach
     public void tearDown() throws IOException {
         Files.deleteIfExists(fileName);
         Files.deleteIfExists(fileName2);
+        Files.deleteIfExists(dbTempFile);
     }

     @Test
-    public void testInvalidDomain() throws IOException {
+    public void testInvalidDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10);
@@ -88,7 +92,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void testRedirectingDomain() throws IOException {
+    public void testRedirectingDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10);
@@ -107,7 +111,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void testBlockedDomain() throws IOException {
+    public void testBlockedDomain() throws Exception {
         // Attempt to fetch an invalid domain
         var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10);
@@ -124,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest {
     }

     @Test
-    public void crawlSunnyDay() throws IOException {
+    public void crawlSunnyDay() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10);

         CrawledDomain domain = crawl(specs);
@@ -157,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest {


     @Test
-    public void crawlContentTypes() throws IOException {
+    public void crawlContentTypes() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10,
                 List.of(
                         "https://www.marginalia.nu/sanic.png",
@@ -195,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest {

     @Test
-    public void crawlRobotsTxt() throws IOException {
+    public void crawlRobotsTxt() throws Exception {
         var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5,
                 List.of("https://search.marginalia.nu/search?q=hello+world")
         );
@@ -235,15 +239,17 @@ public class CrawlingThenConvertingIntegrationTest {
             return null; // unreachable
         }
     }
-    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws Exception {
         return crawl(specs, domain -> true);
     }

-    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws IOException {
+    private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate<EdgeDomain> domainBlacklist) throws Exception {
         List<SerializableCrawlData> data = new ArrayList<>();

-        try (var recorder = new WarcRecorder(fileName)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain();
+        try (var recorder = new WarcRecorder(fileName);
+             var db = new DomainStateDb(dbTempFile))
+        {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain();
         }

         CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain(),
diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle
index e955f86c..fa7579f3 100644
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -46,6 +46,8 @@ dependencies {
     implementation libs.notnull
     implementation libs.guava

+    implementation libs.sqlite
+
     implementation dependencies.create(libs.guice.get()) {
         exclude group: 'com.google.guava'
     }

diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
index e7fbe4f9..01204b24 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -241,6 +241,7 @@ public class CrawlerMain extends ProcessMainClass {

         // Set up the work log and the warc archiver so we can keep track of what we've done
         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log"));
+             DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
              AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl)
         ) {
@@ -258,6 +259,7 @@ public class CrawlerMain extends ProcessMainClass {
                         anchorTagsSource,
                         outputDir,
                         warcArchiver,
+                        domainStateDb,
                         workLog);

                 if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) {
@@ -299,11 +301,12 @@ public class CrawlerMain extends ProcessMainClass {
         heartbeat.start();

         try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log"));
+             DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db"));
              WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir);
              AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName)))
         ) {
             var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of());
-            var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog);
+            var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog);
             task.run();
         }
         catch (Exception ex) {
@@ -324,18 +327,21 @@ public class CrawlerMain extends ProcessMainClass {
         private final AnchorTagsSource anchorTagsSource;
         private final Path outputDir;
         private final WarcArchiverIf warcArchiver;
+        private final DomainStateDb domainStateDb;
         private final WorkLog workLog;

         CrawlTask(CrawlSpecRecord specification,
                   AnchorTagsSource anchorTagsSource,
                   Path outputDir,
                   WarcArchiverIf warcArchiver,
+                  DomainStateDb domainStateDb,
                   WorkLog workLog)
         {
             this.specification = specification;
             this.anchorTagsSource = anchorTagsSource;
             this.outputDir = outputDir;
             this.warcArchiver = warcArchiver;
+            this.domainStateDb = domainStateDb;
             this.workLog = workLog;

             this.domain = specification.domain();
@@ -359,7 +365,7 @@ public class CrawlerMain extends ProcessMainClass {
             }

             try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now
-                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder);
+                 var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder);
                  CrawlDataReference reference = getReference();
             ) {
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
new file mode 100644
index 00000000..0824c3fe
--- /dev/null
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java
@@ -0,0 +1,127 @@
+package nu.marginalia.crawl;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+import java.time.Instant;
+import java.util.Optional;
+
+/** Supplemental sqlite database for storing the summary of a crawl.
+ *  One database exists per crawl data set.
+ * */
+public class DomainStateDb implements AutoCloseable {
+
+    private static final Logger logger = LoggerFactory.getLogger(DomainStateDb.class);
+
+    private final Connection connection;
+
+    public record SummaryRecord(
+            String domainName,
+            Instant lastUpdated,
+            String state,
+            @Nullable String stateDesc,
+            @Nullable String feedUrl
+    )
+    {
+        public static SummaryRecord forSuccess(String domainName) {
+            return new SummaryRecord(domainName, Instant.now(), "OK", null, null);
+        }
+
+        public static SummaryRecord forSuccess(String domainName, String feedUrl) {
+            return new SummaryRecord(domainName, Instant.now(), "OK", null, feedUrl);
+        }
+
+        public static SummaryRecord forError(String domainName, String state, String stateDesc) {
+            return new SummaryRecord(domainName, Instant.now(), state, stateDesc, null);
+        }
+
+        public boolean equals(Object other) {
+            if (other == this) {
+                return true;
+            }
+            if (!(other instanceof SummaryRecord(String name, Instant updated, String state1, String desc, String url))) {
+                return false;
+            }
+            return domainName.equals(name) &&
+                    lastUpdated.toEpochMilli() == updated.toEpochMilli() &&
+                    state.equals(state1) &&
+                    (stateDesc == null ? desc == null : stateDesc.equals(desc)) &&
+                    (feedUrl == null ? url == null : feedUrl.equals(url));
+        }
+
+        public int hashCode() {
+            return domainName.hashCode() + Long.hashCode(lastUpdated.toEpochMilli());
+        }
+
+    }
+
+    public DomainStateDb(Path filename) throws SQLException {
+        String sqliteDbString = "jdbc:sqlite:" + filename.toString();
+        connection = DriverManager.getConnection(sqliteDbString);
+
+        try (var stmt = connection.createStatement()) {
+            stmt.executeUpdate("""
+                    CREATE TABLE IF NOT EXISTS summary (
+                        domain TEXT PRIMARY KEY,
+                        lastUpdatedEpochMs LONG NOT NULL,
+                        state TEXT NOT NULL,
+                        stateDesc TEXT,
+                        feedUrl TEXT
+                    )
+                    """);
+
+            stmt.execute("PRAGMA journal_mode=WAL");
+        }
+    }
+
+    @Override
+    public void close() throws SQLException {
+        connection.close();
+    }
+
+
+    public void save(SummaryRecord record) {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
+                VALUES (?, ?, ?, ?, ?)
+                """)) {
+            stmt.setString(1, record.domainName());
+            stmt.setLong(2, record.lastUpdated().toEpochMilli());
+            stmt.setString(3, record.state());
+            stmt.setString(4, record.stateDesc());
+            stmt.setString(5, record.feedUrl());
+            stmt.executeUpdate();
+        } catch (SQLException e) {
+            logger.error("Failed to insert summary record", e);
+        }
+    }
+
+    public Optional<SummaryRecord> get(String domainName) {
+        try (var stmt = connection.prepareStatement("""
+                SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl
+                FROM summary
+                WHERE domain = ?
+                """)) {
+            stmt.setString(1, domainName);
+            var rs = stmt.executeQuery();
+            if (rs.next()) {
+                return Optional.of(new SummaryRecord(
+                        rs.getString("domain"),
+                        Instant.ofEpochMilli(rs.getLong("lastUpdatedEpochMs")),
+                        rs.getString("state"),
+                        rs.getString("stateDesc"),
+                        rs.getString("feedUrl")
+                ));
+            }
+        } catch (SQLException e) {
+            logger.error("Failed to get summary record", e);
+        }
+
+        return Optional.empty();
+    }
+}
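For reviewers, a minimal usage sketch of the new DomainStateDb API as it appears in this patch; the path, domain names, and feed URL below are illustrative, not taken from the patch:

    import nu.marginalia.crawl.DomainStateDb;

    import java.nio.file.Files;

    class DomainStateDbDemo {
        public static void main(String[] args) throws Exception {
            var dbPath = Files.createTempFile("domainstate", ".db"); // illustrative location

            try (var db = new DomainStateDb(dbPath)) {
                // A crawl that succeeded and discovered a feed during root-document sniffing
                db.save(DomainStateDb.SummaryRecord.forSuccess("www.example.com", "https://www.example.com/feed.xml"));

                // A crawl that failed its initial probe
                db.save(DomainStateDb.SummaryRecord.forError("bad.example.com", "Error", "Connection refused"));

                // A later crawl can read the stored record back, e.g. to revalidate the old feed URL
                db.get("www.example.com")
                        .map(DomainStateDb.SummaryRecord::feedUrl)
                        .ifPresent(System.out::println);
            }
        }
    }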
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index ace2059b..adef8ea1 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -16,7 +17,9 @@ import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -46,6 +49,7 @@ public class CrawlerRetreiver implements AutoCloseable {

     private final DomainProber domainProber;
     private final DomainCrawlFrontier crawlFrontier;
+    private final DomainStateDb domainStateDb;
     private final WarcRecorder warcRecorder;
     private final CrawlerRevisitor crawlerRevisitor;

@@ -55,8 +59,10 @@ public class CrawlerRetreiver implements AutoCloseable {
     public CrawlerRetreiver(HttpFetcher fetcher,
                             DomainProber domainProber,
                             CrawlerMain.CrawlSpecRecord specs,
+                            DomainStateDb domainStateDb,
                             WarcRecorder warcRecorder)
     {
+        this.domainStateDb = domainStateDb;
         this.warcRecorder = warcRecorder;
         this.fetcher = fetcher;
         this.domainProber = domainProber;
@@ -90,8 +96,21 @@ public class CrawlerRetreiver implements AutoCloseable {
         try {
             // Do an initial domain probe to determine the root URL
             EdgeUrl rootUrl;
-            if (probeRootUrl() instanceof HttpFetcher.DomainProbeResult.Ok ok) rootUrl = ok.probedUrl();
-            else return 1;
+
+            var probeResult = probeRootUrl();
+            switch (probeResult) {
+                case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> {
+                    rootUrl = probedUrl; // Good track
+                }
+                case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> {
+                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString()));
+                    return 1;
+                }
+                case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> {
+                    domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc));
+                    return 1;
+                }
+            }

             // Sleep after the initial probe, we don't have access to the robots.txt yet
             // so we don't know the crawl delay
@@ -114,7 +133,8 @@ public class CrawlerRetreiver implements AutoCloseable {

             delayTimer.waitFetchDelay(0); // initial delay after robots.txt

-            sniffRootDocument(rootUrl, delayTimer);
+            DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer);
+            domainStateDb.save(summaryRecord);

             // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified
             if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) {
@@ -196,7 +216,9 @@ public class CrawlerRetreiver implements AutoCloseable {
         return domainProbeResult;
     }

-    private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+    private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) {
+        Optional<String> feedLink = Optional.empty();
+
         try {
             var url = rootUrl.withPathAndParam("/", null);

@@ -204,11 +226,11 @@ public class CrawlerRetreiver implements AutoCloseable {
             timer.waitFetchDelay(0);

             if (!(result instanceof HttpFetchResult.ResultOk ok))
-                return;
+                return DomainStateDb.SummaryRecord.forSuccess(domain);

             var optDoc = ok.parseDocument();
             if (optDoc.isEmpty())
-                return;
+                return DomainStateDb.SummaryRecord.forSuccess(domain);

             // Sniff the software based on the sample document
             var doc = optDoc.get();
@@ -216,7 +238,6 @@ public class CrawlerRetreiver implements AutoCloseable {
             crawlFrontier.enqueueLinksFromDocument(url, doc);

             EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null);
-            Optional<EdgeUrl> sitemapUrl = Optional.empty();

             for (var link : doc.getElementsByTag("link")) {
                 String rel = link.attr("rel");
@@ -232,23 +253,33 @@ public class CrawlerRetreiver implements AutoCloseable {

                 // Grab the RSS/Atom as a sitemap if it exists
                 if (rel.equalsIgnoreCase("alternate")
-                        && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) {
+                        && (type.equalsIgnoreCase("application/atom+xml")
+                        || type.equalsIgnoreCase("application/atomsvc+xml")
+                        || type.equalsIgnoreCase("application/rss+xml")
+                )) {
                     String href = link.attr("href");

-                    sitemapUrl = linkParser.parseLink(url, href)
-                            .filter(crawlFrontier::isSameDomain);
+                    feedLink = linkParser.parseLink(url, href)
+                            .filter(crawlFrontier::isSameDomain)
+                            .map(EdgeUrl::toString);
                 }
             }

-            // Download the sitemap if available exists
-            if (sitemapUrl.isPresent()) {
-                sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get()));
+
+            if (feedLink.isEmpty()) {
+                feedLink = guessFeedUrl(timer);
+            }
+
+            // Download the sitemap if available
+            if (feedLink.isPresent()) {
+                sitemapFetcher.downloadSitemaps(List.of(feedLink.get()));
                 timer.waitFetchDelay(0);
             }

             // Grab the favicon if it exists
             fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
             timer.waitFetchDelay(0);
+
         }
         catch (Exception ex) {
             logger.error("Error configuring link filter", ex);
@@ -256,6 +287,74 @@ public class CrawlerRetreiver implements AutoCloseable {
         finally {
             crawlFrontier.addVisited(rootUrl);
         }
+
+        if (feedLink.isPresent()) {
+            return DomainStateDb.SummaryRecord.forSuccess(domain, feedLink.get());
+        }
+        else {
+            return DomainStateDb.SummaryRecord.forSuccess(domain);
+        }
+    }
+
+    private final List<String> likelyFeedEndpoints = List.of(
+            "/rss.xml",
+            "/atom.xml",
+            "/feed.xml",
+            "/index.xml",
+            "/feed",
+            "/rss",
+            "/atom",
+            "/feeds",
+            "/blog/feed",
+            "/blog/rss"
+    );
+
+    private Optional<String> guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException {
+        var oldDomainStateRecord = domainStateDb.get(domain);
+
+        // If we are already aware of an old feed URL, then we can just revalidate it
+        if (oldDomainStateRecord.isPresent()) {
+            var oldRecord = oldDomainStateRecord.get();
+            if (oldRecord.feedUrl() != null && validateFeedUrl(oldRecord.feedUrl(), timer)) {
+                return Optional.of(oldRecord.feedUrl());
+            }
+        }
+
+        for (String endpoint : likelyFeedEndpoints) {
+            String url = "https://" + domain + endpoint;
+            if (validateFeedUrl(url, timer)) {
+                return Optional.of(url);
+            }
+        }
+
+        return Optional.empty();
+    }
+
+    private boolean validateFeedUrl(String url, CrawlDelayTimer timer) throws InterruptedException {
+        var parsedOpt = EdgeUrl.parse(url);
+        if (parsedOpt.isEmpty())
+            return false;
+
+        HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
+        timer.waitFetchDelay(0);
+
+        if (!(result instanceof HttpFetchResult.ResultOk ok)) {
+            return false;
+        }
+
+        // Extract the beginning of the response body and look for a feed signature
+        Optional<String> bodyOpt = DocumentBodyExtractor.asString(ok).getBody();
+        if (bodyOpt.isEmpty())
+            return false;
+
+        String body = bodyOpt.get();
+        body = body.substring(0, Math.min(128, body.length())).toLowerCase();
+
+        if (body.contains("<rss"))
+            return true;
+        if (body.contains("<atom"))
+            return true;
+
+        return false;
+    }
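For clarity, a self-contained sketch of what the endpoint probing and feed validation above amount to; the domain name and sample body are illustrative, and the real code issues rate-limited fetches via fetchWithRetry rather than printing candidates:

    import java.util.List;

    class FeedGuessSketch {
        static final List<String> LIKELY_FEED_ENDPOINTS = List.of(
                "/rss.xml", "/atom.xml", "/feed.xml", "/index.xml",
                "/feed", "/rss", "/atom", "/feeds", "/blog/feed", "/blog/rss");

        // Mirrors the signature check in validateFeedUrl: only the first 128
        // characters of the response body are inspected for a feed marker.
        static boolean looksLikeFeed(String body) {
            String head = body.substring(0, Math.min(128, body.length())).toLowerCase();
            return head.contains("<rss") || head.contains("<atom");
        }

        public static void main(String[] args) {
            String domain = "blog.example.com"; // illustrative
            for (String endpoint : LIKELY_FEED_ENDPOINTS) {
                System.out.println("candidate: https://" + domain + endpoint);
            }
            System.out.println(looksLikeFeed("<?xml version=\"1.0\"?><rss version=\"2.0\">")); // true
        }
    }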
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/SitemapFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/SitemapFetcher.java
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/SitemapFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/SitemapFetcher.java
-        List<String> sitemaps = robotsRules.getSitemaps();
+        List<String> urls = robotsRules.getSitemaps();

-        List<EdgeUrl> urls = new ArrayList<>(sitemaps.size());
-        if (!sitemaps.isEmpty()) {
-            for (var url : sitemaps) {
-                EdgeUrl.parse(url).ifPresent(urls::add);
-            }
-        }
-        else {
-            urls.add(rootUrl.withPathAndParam("/sitemap.xml", null));
+        if (urls.isEmpty()) {
+            urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString());
         }

         downloadSitemaps(urls);
     }

-    public void downloadSitemaps(List<EdgeUrl> urls) {
+    public void downloadSitemaps(List<String> urls) {
         Set<String> checkedSitemaps = new HashSet<>();

-        for (var url : urls) {
+        for (var rawUrl : urls) {
+            Optional<EdgeUrl> parsedUrl = EdgeUrl.parse(rawUrl);
+            if (parsedUrl.isEmpty()) {
+                continue;
+            }
+
+            EdgeUrl url = parsedUrl.get();
+
             // Let's not download sitemaps from other domains for now
             if (!crawlFrontier.isSameDomain(url)) {
                 continue;

diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
index c38bcb3b..8d33fe00 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -18,6 +18,7 @@ public class ContentTypeLogic {
             "application/xhtml",
             "application/xml",
             "application/atom+xml",
+            "application/atomsvc+xml",
             "application/rss+xml",
             "application/x-rss+xml",
             "application/rdf+xml",
diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java
index a29e7093..1248ecba 100644
--- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java
@@ -23,6 +23,10 @@ public sealed interface DocumentBodyResult<T> {
             return mapper.apply(contentType, body);
         }

+        public Optional<T> getBody() {
+            return Optional.of(body);
+        }
+
         @Override
         public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
             consumer.accept(contentType, body);
@@ -41,6 +45,11 @@ public sealed interface DocumentBodyResult<T> {
             return (DocumentBodyResult<T2>) this;
         }

+        @Override
+        public Optional<T> getBody() {
+            return Optional.empty();
+        }
+
         @Override
         public void ifPresent(ExConsumer<T, Exception> consumer) throws Exception {
         }
@@ -49,6 +58,7 @@ public sealed interface DocumentBodyResult<T> {
     <T2> Optional<T2> mapOpt(BiFunction<ContentType, T, T2> mapper);
     <T2> Optional<T2> flatMapOpt(BiFunction<ContentType, T, Optional<T2>> mapper);
     <T2> DocumentBodyResult<T2> flatMap(BiFunction<ContentType, T, DocumentBodyResult<T2>> mapper);
+    Optional<T> getBody();

     void ifPresent(ExConsumer<T, Exception> consumer) throws Exception;
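The new getBody() accessor lets call sites unwrap a successful extraction without pattern matching, as CrawlerRetreiver.validateFeedUrl does above. A fragment of that call pattern; here `ok` is assumed to be an HttpFetchResult.ResultOk from an earlier fetch:

    // Ok(contentType, body) yields Optional.of(body); Error yields Optional.empty()
    Optional<String> bodyOpt = DocumentBodyExtractor.asString(ok).getBody();
    if (bodyOpt.isEmpty())
        return false; // treat extraction failure the same as a non-feed response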
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
new file mode 100644
index 00000000..156f6f6d
--- /dev/null
+++ b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java
@@ -0,0 +1,66 @@
+package nu.marginalia.crawl;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.time.Instant;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class DomainStateDbTest {
+
+    Path tempFile;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        tempFile = Files.createTempFile(getClass().getSimpleName(), ".db");
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        Files.deleteIfExists(tempFile);
+    }
+
+    @Test
+    public void testSunnyDay() throws SQLException {
+        try (var db = new DomainStateDb(tempFile)) {
+            var allFields = new DomainStateDb.SummaryRecord(
+                    "all.marginalia.nu",
+                    Instant.now(),
+                    "OK",
+                    "Bad address",
+                    "https://www.marginalia.nu/atom.xml"
+            );
+
+            var minFields = new DomainStateDb.SummaryRecord(
+                    "min.marginalia.nu",
+                    Instant.now(),
+                    "OK",
+                    null,
+                    null
+            );
+
+            db.save(allFields);
+            db.save(minFields);
+
+            assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow());
+            assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow());
+
+            var updatedAllFields = new DomainStateDb.SummaryRecord(
+                    "all.marginalia.nu",
+                    Instant.now(),
+                    "BAD",
+                    null,
+                    null
+            );
+
+            db.save(updatedAllFields);
+            assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow());
+        }
+    }
+
+}
\ No newline at end of file

diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index ea9bcf60..aacc0e52 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival;

 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.ContentTags;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
@@ -18,6 +19,7 @@ import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.test.CommonTestData;
 import okhttp3.Headers;
 import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;
 import org.slf4j.Logger;
@@ -25,6 +27,9 @@ import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -36,9 +41,14 @@ public class CrawlerMockFetcherTest {

     Map<EdgeUrl, CrawledDocument> mockData = new HashMap<>();
     HttpFetcher fetcherMock = new MockFetcher();
-
+    private Path dbTempFile;

+    @BeforeEach
+    public void setUp() throws IOException {
+        dbTempFile = Files.createTempFile("domains","db");
+    }

     @AfterEach
-    public void tearDown() {
+    public void tearDown() throws IOException {
+        Files.deleteIfExists(dbTempFile);
         mockData.clear();
     }
@@ -66,15 +76,17 @@ public class CrawlerMockFetcherTest {

     }

-    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException {
-        try (var recorder = new WarcRecorder()) {
-            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder)
+    void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException, SQLException {
+        try (var recorder = new WarcRecorder();
+             var db = new DomainStateDb(dbTempFile)
+        ) {
+            new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, db, recorder)
                     .crawlDomain();
         }
     }

     @Test
-    public void testLemmy() throws URISyntaxException, IOException {
+    public void testLemmy() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html");
@@ -85,7 +97,7 @@ public class CrawlerMockFetcherTest {
     }

     @Test
-    public void testMediawiki() throws URISyntaxException, IOException {
+    public void testMediawiki() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html");
@@ -94,7 +106,7 @@ public class CrawlerMockFetcherTest {
     }

     @Test
-    public void testDiscourse() throws URISyntaxException, IOException {
+    public void testDiscourse() throws Exception {
         List<SerializableCrawlData> out = new ArrayList<>();

         registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html");
diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
index edd0de78..01cf8339 100644
--- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
+++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java
@@ -4,6 +4,7 @@ import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.crawl.CrawlerMain;
+import nu.marginalia.crawl.DomainStateDb;
 import nu.marginalia.crawl.fetcher.HttpFetcher;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
@@ -25,6 +26,7 @@ import java.io.RandomAccessFile;
 import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.sql.SQLException;
 import java.util.*;
 import java.util.stream.Collectors;

@@ -39,11 +41,13 @@ class CrawlerRetreiverTest {
     Path tempFileWarc2;
     Path tempFileParquet2;
     Path tempFileWarc3;
+    Path tempFileDb;

     @BeforeEach
     public void setUp() throws IOException {
         httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D");
         tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet");
         tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet");
+        tempFileDb = Files.createTempFile("crawling-process", ".db");
     }

@@ -505,22 +509,26 @@ class CrawlerRetreiverTest {
     }

     private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) {
-        try (var recorder = new WarcRecorder(tempFileWarc2)) {
-            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(),
+        try (var recorder = new WarcRecorder(tempFileWarc2);
+             var db = new DomainStateDb(tempFileDb)
+        ) {
+            new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(),
                     new CrawlDataReference(stream));
         }
-        catch (IOException ex) {
+        catch (IOException | SQLException ex) {
             Assertions.fail(ex);
         }
     }

     @NotNull
     private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) {
-        try (var recorder = new WarcRecorder(tempFileWarc1)) {
-            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder);
+        try (var recorder = new WarcRecorder(tempFileWarc1);
+             var db = new DomainStateDb(tempFileDb)
+        ) {
+            var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder);
             crawler.crawlDomain();
             return crawler.getCrawlFrontier();
-        } catch (IOException ex) {
+        } catch (IOException | SQLException ex) {
             Assertions.fail(ex);
             return null; // unreachable
         }