From 3bc99639a02803a8e2f0994f79cc1b1fd3b63fb3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 27 Dec 2024 15:10:15 +0100 Subject: [PATCH] (feed-fetcher) Make feed fetcher requests conditional Add `If-None-Match` and `If-Modified-Since` headers as appropriate to the feed fetcher's requests. On well-configured web servers, this should short-circuit the request and reduce the amount of bandwidth and processing that is necessary. A new table was added to the FeedDb to hold one etag per domain. If-Modified-Since semantics are based on the creation date for the feed database, which should serve as a cutoff date for the earliest update we can have received. This completes the changes for Issue #136. --- .../java/nu/marginalia/rss/db/FeedDb.java | 41 +++++- .../nu/marginalia/rss/db/FeedDbReader.java | 32 +++++ .../nu/marginalia/rss/db/FeedDbWriter.java | 17 +++ .../rss/svc/FeedFetcherService.java | 124 +++++++++++------- .../rss/svc/FeedFetcherServiceTest.java | 22 +++- 5 files changed, 185 insertions(+), 51 deletions(-) diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java index 0fb87e3c..530b81d6 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java @@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedDefinition; import nu.marginalia.rss.model.FeedItems; import nu.marginalia.service.module.ServiceConfiguration; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -127,6 +128,26 @@ public class FeedDb { return FeedItems.none(); } + + @Nullable + public String getEtag(EdgeDomain domain) { + if (!feedDbEnabled) { + throw new IllegalStateException("Feed database is disabled on this node"); + } + + // Capture the current reader to avoid concurrency issues + FeedDbReader reader = this.reader; + try { + if (reader != null) { + return reader.getEtag(domain); + } + } + catch (Exception e) { + logger.error("Error getting etag for " + domain, e); + } + return null; + } + public Optional getFeedAsJson(String domain) { if (!feedDbEnabled) { throw new IllegalStateException("Feed database is disabled on this node"); @@ -214,7 +235,7 @@ public class FeedDb { public Instant getFetchTime() { if (!Files.exists(readerDbPath)) { - return Instant.ofEpochMilli(0); + return Instant.EPOCH; } try { @@ -224,7 +245,23 @@ public class FeedDb { } catch (IOException ex) { logger.error("Failed to read the creatiom time of {}", readerDbPath); - return Instant.ofEpochMilli(0); + return Instant.EPOCH; } } + + public boolean hasData() { + if (!feedDbEnabled) { + throw new IllegalStateException("Feed database is disabled on this node"); + } + + // Capture the current reader to avoid concurrency issues + FeedDbReader reader = this.reader; + + if (reader != null) { + return reader.hasData(); + } + + return false; + } + } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java index 9bb02acf..af4c5aa0 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java @@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedItems; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nullable; import java.nio.file.Path; import java.sql.Connection; import java.sql.DriverManager; @@ -32,6 +33,7 @@ public class FeedDbReader implements AutoCloseable { try (var stmt = connection.createStatement()) { stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)"); stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)"); + stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)"); } } @@ -106,6 +108,22 @@ public class FeedDbReader implements AutoCloseable { return FeedItems.none(); } + @Nullable + public String getEtag(EdgeDomain domain) { + try (var stmt = connection.prepareStatement("SELECT etag FROM etags WHERE DOMAIN = ?")) { + stmt.setString(1, domain.toString()); + var rs = stmt.executeQuery(); + + if (rs.next()) { + return rs.getString(1); + } + } catch (SQLException e) { + logger.error("Error getting etag for " + domain, e); + } + + return null; + } + private FeedItems deserialize(String string) { return gson.fromJson(string, FeedItems.class); } @@ -141,4 +159,18 @@ public class FeedDbReader implements AutoCloseable { } + public boolean hasData() { + try (var stmt = connection.prepareStatement("SELECT 1 FROM feed LIMIT 1")) { + var rs = stmt.executeQuery(); + if (rs.next()) { + return rs.getBoolean(1); + } + else { + return false; + } + } + catch (SQLException ex) { + return false; + } + } } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java index bbd6354e..0ca561e2 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java @@ -20,6 +20,7 @@ public class FeedDbWriter implements AutoCloseable { private final Connection connection; private final PreparedStatement insertFeedStmt; private final PreparedStatement insertErrorStmt; + private final PreparedStatement insertEtagStmt; private final Path dbPath; private volatile boolean closed = false; @@ -34,10 +35,12 @@ public class FeedDbWriter implements AutoCloseable { try (var stmt = connection.createStatement()) { stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)"); stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)"); + stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)"); } insertFeedStmt = connection.prepareStatement("INSERT INTO feed (domain, feed) VALUES (?, ?)"); insertErrorStmt = connection.prepareStatement("INSERT INTO errors (domain, cnt) VALUES (?, ?)"); + insertEtagStmt = connection.prepareStatement("INSERT INTO etags (domain, etag) VALUES (?, ?)"); } public Path getDbPath() { @@ -56,6 +59,20 @@ public class FeedDbWriter implements AutoCloseable { } } + public synchronized void saveEtag(String domain, String etag) { + if (etag == null || etag.isBlank()) + return; + + try { + insertEtagStmt.setString(1, domain.toLowerCase()); + insertEtagStmt.setString(2, etag); + insertEtagStmt.executeUpdate(); + } + catch (SQLException e) { + logger.error("Error saving etag for " + domain, e); + } + } + public synchronized void setErrorCount(String domain, int count) { try { insertErrorStmt.setString(1, domain); diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java index 00f9663c..696b36ff 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java @@ -34,9 +34,7 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.nio.charset.StandardCharsets; import java.sql.SQLException; -import java.time.Duration; -import java.time.LocalDateTime; -import java.time.ZonedDateTime; +import java.time.*; import java.time.format.DateTimeFormatter; import java.util.*; import java.util.concurrent.Executors; @@ -61,7 +59,6 @@ public class FeedFetcherService { private final DomainLocks domainLocks = new DomainLocks(); private volatile boolean updating; - private boolean deterministic = false; @Inject public FeedFetcherService(FeedDb feedDb, @@ -93,11 +90,6 @@ public class FeedFetcherService { REFRESH }; - /** Disable random-based heuristics. This is meant for testing */ - public void setDeterministic() { - this.deterministic = true; - } - public void updateFeeds(UpdateMode updateMode) throws IOException { if (updating) // Prevent concurrent updates { @@ -137,37 +129,37 @@ public class FeedFetcherService { for (var feed : definitions) { executor.submitQuietly(() -> { try { - var oldData = feedDb.getFeed(new EdgeDomain(feed.domain())); + EdgeDomain domain = new EdgeDomain(feed.domain()); + var oldData = feedDb.getFeed(domain); - // If we have existing data, we might skip updating it with a probability that increases with time, - // this is to avoid hammering the feeds that are updated very rarely and save some time and resources - // on our end + @Nullable + String ifModifiedSinceDate = switch(updateMode) { + case REFRESH -> getIfModifiedSinceDate(feedDb); + case CLEAN -> null; + }; - /* Disable for now: - - if (!oldData.isEmpty()) { - Duration duration = feed.durationSinceUpdated(); - long daysSinceUpdate = duration.toDays(); - - - if (deterministic || (daysSinceUpdate > 2 && ThreadLocalRandom.current() - .nextInt(1, 1 + (int) Math.min(10, daysSinceUpdate) / 2) > 1)) { - // Skip updating this feed, just write the old data back instead - writer.saveFeed(oldData); - return; - } - } - */ + @Nullable + String ifNoneMatchTag = switch (updateMode) { + case REFRESH -> feedDb.getEtag(domain); + case CLEAN -> null; + }; FetchResult feedData; try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) { - feedData = fetchFeedData(feed, client); + feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag); } catch (Exception ex) { feedData = new FetchResult.TransientError(); } switch (feedData) { - case FetchResult.Success(String value) -> writer.saveFeed(parseFeed(value, feed)); + case FetchResult.Success(String value, String etag) -> { + writer.saveEtag(feed.domain(), etag); + writer.saveFeed(parseFeed(value, feed)); + } + case FetchResult.NotModified() -> { + writer.saveEtag(feed.domain(), ifNoneMatchTag); + writer.saveFeed(oldData); + } case FetchResult.TransientError() -> { int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0); writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount); @@ -214,36 +206,73 @@ public class FeedFetcherService { } } - private FetchResult fetchFeedData(FeedDefinition feed, HttpClient client) { + @Nullable + static String getIfModifiedSinceDate(FeedDb feedDb) { + + // If the db is fresh, we don't send If-Modified-Since + if (!feedDb.hasData()) + return null; + + Instant cutoffInstant = feedDb.getFetchTime(); + + // If we're unable to establish fetch time, we don't send If-Modified-Since + if (cutoffInstant == Instant.EPOCH) + return null; + + return cutoffInstant.atZone(ZoneId.of("GMT")).format(DateTimeFormatter.RFC_1123_DATE_TIME); + } + + private FetchResult fetchFeedData(FeedDefinition feed, + HttpClient client, + @Nullable String ifModifiedSinceDate, + @Nullable String ifNoneMatchTag) + { try { URI uri = new URI(feed.feedUrl()); - HttpRequest getRequest = HttpRequest.newBuilder() + HttpRequest.Builder requestBuilder = HttpRequest.newBuilder() .GET() .uri(uri) .header("User-Agent", WmsaHome.getUserAgent().uaIdentifier()) .header("Accept-Encoding", "gzip") .header("Accept", "text/*, */*;q=0.9") .timeout(Duration.ofSeconds(15)) - .build(); + ; + + if (ifModifiedSinceDate != null) { + requestBuilder.header("If-Modified-Since", ifModifiedSinceDate); + } + + if (ifNoneMatchTag != null) { + requestBuilder.header("If-None-Match", ifNoneMatchTag); + } + + HttpRequest getRequest = requestBuilder.build(); for (int i = 0; i < 3; i++) { - var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()); - if (429 == rs.statusCode()) { + HttpResponse rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray()); + + if (rs.statusCode() == 429) { // Too Many Requests int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2")); Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5))); - } else if (200 == rs.statusCode()) { - byte[] responseData = getResponseData(rs); - - String contentType = rs.headers().firstValue("Content-Type").orElse(""); - String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData); - - return new FetchResult.Success(bodyText); - } else if (404 == rs.statusCode()) { - return new FetchResult.PermanentError(); // never try again - } else { - return new FetchResult.TransientError(); // we try again in a few days + continue; } + + String newEtagValue = rs.headers().firstValue("ETag").orElse(""); + + return switch (rs.statusCode()) { + case 200 -> { + byte[] responseData = getResponseData(rs); + + String contentType = rs.headers().firstValue("Content-Type").orElse(""); + String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData); + + yield new FetchResult.Success(bodyText, newEtagValue); + } + case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics + case 404 -> new FetchResult.PermanentError(); // never try again + default -> new FetchResult.TransientError(); // we try again later + }; } } catch (Exception ex) { @@ -267,7 +296,8 @@ public class FeedFetcherService { } public sealed interface FetchResult { - record Success(String value) implements FetchResult {} + record Success(String value, String etag) implements FetchResult {} + record NotModified() implements FetchResult {} record TransientError() implements FetchResult {} record PermanentError() implements FetchResult {} } diff --git a/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java b/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java index 3ddd7f49..d5bf025e 100644 --- a/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java +++ b/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java @@ -96,7 +96,6 @@ class FeedFetcherServiceTest extends AbstractModule { feedDb.switchDb(writer); } - feedFetcherService.setDeterministic(); feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH); var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu")); @@ -104,6 +103,26 @@ class FeedFetcherServiceTest extends AbstractModule { Assertions.assertFalse(result.isEmpty()); } + @Tag("flaky") + @Test + public void testFetchRepeatedly() throws Exception { + try (var writer = feedDb.createWriter()) { + writer.saveFeed(new FeedItems("www.marginalia.nu", "https://www.marginalia.nu/log/index.xml", "", List.of())); + feedDb.switchDb(writer); + } + + feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH); + Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu"))); + feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH); + Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu"))); + feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH); + Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu"))); + + var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu")); + System.out.println(result); + Assertions.assertFalse(result.isEmpty()); + } + @Tag("flaky") @Test public void test404() throws Exception { @@ -112,7 +131,6 @@ class FeedFetcherServiceTest extends AbstractModule { feedDb.switchDb(writer); } - feedFetcherService.setDeterministic(); feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH); // We forget the feed on a 404 error