(feed-fetcher) Make feed fetcher requests conditional

Add `If-None-Match` and `If-Modified-Since` headers as appropriate to the feed fetcher's requests.  On well-configured web servers, this should short-circuit the request and reduce the amount of bandwidth and processing that is necessary.

A new table was added to the FeedDb to hold one etag per domain.

If-Modified-Since semantics are based on the creation date for the feed database, which should serve as a cutoff date for the earliest update we can have received.

This completes the changes for Issue #136.
This commit is contained in:
Viktor Lofgren 2024-12-27 15:10:15 +01:00
parent 927bc0b63c
commit 3bc99639a0
5 changed files with 185 additions and 51 deletions

View File

@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedDefinition;
import nu.marginalia.rss.model.FeedItems;
import nu.marginalia.service.module.ServiceConfiguration;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -127,6 +128,26 @@ public class FeedDb {
return FeedItems.none();
}
@Nullable
public String getEtag(EdgeDomain domain) {
if (!feedDbEnabled) {
throw new IllegalStateException("Feed database is disabled on this node");
}
// Capture the current reader to avoid concurrency issues
FeedDbReader reader = this.reader;
try {
if (reader != null) {
return reader.getEtag(domain);
}
}
catch (Exception e) {
logger.error("Error getting etag for " + domain, e);
}
return null;
}
public Optional<String> getFeedAsJson(String domain) {
if (!feedDbEnabled) {
throw new IllegalStateException("Feed database is disabled on this node");
@ -214,7 +235,7 @@ public class FeedDb {
public Instant getFetchTime() {
if (!Files.exists(readerDbPath)) {
return Instant.ofEpochMilli(0);
return Instant.EPOCH;
}
try {
@ -224,7 +245,23 @@ public class FeedDb {
}
catch (IOException ex) {
logger.error("Failed to read the creatiom time of {}", readerDbPath);
return Instant.ofEpochMilli(0);
return Instant.EPOCH;
}
}
public boolean hasData() {
if (!feedDbEnabled) {
throw new IllegalStateException("Feed database is disabled on this node");
}
// Capture the current reader to avoid concurrency issues
FeedDbReader reader = this.reader;
if (reader != null) {
return reader.hasData();
}
return false;
}
}

View File

@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedItems;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
@ -32,6 +33,7 @@ public class FeedDbReader implements AutoCloseable {
try (var stmt = connection.createStatement()) {
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)");
}
}
@ -106,6 +108,22 @@ public class FeedDbReader implements AutoCloseable {
return FeedItems.none();
}
@Nullable
public String getEtag(EdgeDomain domain) {
try (var stmt = connection.prepareStatement("SELECT etag FROM etags WHERE DOMAIN = ?")) {
stmt.setString(1, domain.toString());
var rs = stmt.executeQuery();
if (rs.next()) {
return rs.getString(1);
}
} catch (SQLException e) {
logger.error("Error getting etag for " + domain, e);
}
return null;
}
private FeedItems deserialize(String string) {
return gson.fromJson(string, FeedItems.class);
}
@ -141,4 +159,18 @@ public class FeedDbReader implements AutoCloseable {
}
public boolean hasData() {
try (var stmt = connection.prepareStatement("SELECT 1 FROM feed LIMIT 1")) {
var rs = stmt.executeQuery();
if (rs.next()) {
return rs.getBoolean(1);
}
else {
return false;
}
}
catch (SQLException ex) {
return false;
}
}
}

View File

@ -20,6 +20,7 @@ public class FeedDbWriter implements AutoCloseable {
private final Connection connection;
private final PreparedStatement insertFeedStmt;
private final PreparedStatement insertErrorStmt;
private final PreparedStatement insertEtagStmt;
private final Path dbPath;
private volatile boolean closed = false;
@ -34,10 +35,12 @@ public class FeedDbWriter implements AutoCloseable {
try (var stmt = connection.createStatement()) {
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)");
}
insertFeedStmt = connection.prepareStatement("INSERT INTO feed (domain, feed) VALUES (?, ?)");
insertErrorStmt = connection.prepareStatement("INSERT INTO errors (domain, cnt) VALUES (?, ?)");
insertEtagStmt = connection.prepareStatement("INSERT INTO etags (domain, etag) VALUES (?, ?)");
}
public Path getDbPath() {
@ -56,6 +59,20 @@ public class FeedDbWriter implements AutoCloseable {
}
}
public synchronized void saveEtag(String domain, String etag) {
if (etag == null || etag.isBlank())
return;
try {
insertEtagStmt.setString(1, domain.toLowerCase());
insertEtagStmt.setString(2, etag);
insertEtagStmt.executeUpdate();
}
catch (SQLException e) {
logger.error("Error saving etag for " + domain, e);
}
}
public synchronized void setErrorCount(String domain, int count) {
try {
insertErrorStmt.setString(1, domain);

View File

@ -34,9 +34,7 @@ import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.sql.SQLException;
import java.time.Duration;
import java.time.LocalDateTime;
import java.time.ZonedDateTime;
import java.time.*;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.Executors;
@ -61,7 +59,6 @@ public class FeedFetcherService {
private final DomainLocks domainLocks = new DomainLocks();
private volatile boolean updating;
private boolean deterministic = false;
@Inject
public FeedFetcherService(FeedDb feedDb,
@ -93,11 +90,6 @@ public class FeedFetcherService {
REFRESH
};
/** Disable random-based heuristics. This is meant for testing */
public void setDeterministic() {
this.deterministic = true;
}
public void updateFeeds(UpdateMode updateMode) throws IOException {
if (updating) // Prevent concurrent updates
{
@ -137,37 +129,37 @@ public class FeedFetcherService {
for (var feed : definitions) {
executor.submitQuietly(() -> {
try {
var oldData = feedDb.getFeed(new EdgeDomain(feed.domain()));
EdgeDomain domain = new EdgeDomain(feed.domain());
var oldData = feedDb.getFeed(domain);
// If we have existing data, we might skip updating it with a probability that increases with time,
// this is to avoid hammering the feeds that are updated very rarely and save some time and resources
// on our end
@Nullable
String ifModifiedSinceDate = switch(updateMode) {
case REFRESH -> getIfModifiedSinceDate(feedDb);
case CLEAN -> null;
};
/* Disable for now:
if (!oldData.isEmpty()) {
Duration duration = feed.durationSinceUpdated();
long daysSinceUpdate = duration.toDays();
if (deterministic || (daysSinceUpdate > 2 && ThreadLocalRandom.current()
.nextInt(1, 1 + (int) Math.min(10, daysSinceUpdate) / 2) > 1)) {
// Skip updating this feed, just write the old data back instead
writer.saveFeed(oldData);
return;
}
}
*/
@Nullable
String ifNoneMatchTag = switch (updateMode) {
case REFRESH -> feedDb.getEtag(domain);
case CLEAN -> null;
};
FetchResult feedData;
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
feedData = fetchFeedData(feed, client);
feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
} catch (Exception ex) {
feedData = new FetchResult.TransientError();
}
switch (feedData) {
case FetchResult.Success(String value) -> writer.saveFeed(parseFeed(value, feed));
case FetchResult.Success(String value, String etag) -> {
writer.saveEtag(feed.domain(), etag);
writer.saveFeed(parseFeed(value, feed));
}
case FetchResult.NotModified() -> {
writer.saveEtag(feed.domain(), ifNoneMatchTag);
writer.saveFeed(oldData);
}
case FetchResult.TransientError() -> {
int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0);
writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount);
@ -214,36 +206,73 @@ public class FeedFetcherService {
}
}
private FetchResult fetchFeedData(FeedDefinition feed, HttpClient client) {
@Nullable
static String getIfModifiedSinceDate(FeedDb feedDb) {
// If the db is fresh, we don't send If-Modified-Since
if (!feedDb.hasData())
return null;
Instant cutoffInstant = feedDb.getFetchTime();
// If we're unable to establish fetch time, we don't send If-Modified-Since
if (cutoffInstant == Instant.EPOCH)
return null;
return cutoffInstant.atZone(ZoneId.of("GMT")).format(DateTimeFormatter.RFC_1123_DATE_TIME);
}
private FetchResult fetchFeedData(FeedDefinition feed,
HttpClient client,
@Nullable String ifModifiedSinceDate,
@Nullable String ifNoneMatchTag)
{
try {
URI uri = new URI(feed.feedUrl());
HttpRequest getRequest = HttpRequest.newBuilder()
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
.GET()
.uri(uri)
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
.header("Accept-Encoding", "gzip")
.header("Accept", "text/*, */*;q=0.9")
.timeout(Duration.ofSeconds(15))
.build();
;
if (ifModifiedSinceDate != null) {
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
}
if (ifNoneMatchTag != null) {
requestBuilder.header("If-None-Match", ifNoneMatchTag);
}
HttpRequest getRequest = requestBuilder.build();
for (int i = 0; i < 3; i++) {
var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
if (429 == rs.statusCode()) {
HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
if (rs.statusCode() == 429) { // Too Many Requests
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
} else if (200 == rs.statusCode()) {
continue;
}
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
return switch (rs.statusCode()) {
case 200 -> {
byte[] responseData = getResponseData(rs);
String contentType = rs.headers().firstValue("Content-Type").orElse("");
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
return new FetchResult.Success(bodyText);
} else if (404 == rs.statusCode()) {
return new FetchResult.PermanentError(); // never try again
} else {
return new FetchResult.TransientError(); // we try again in a few days
yield new FetchResult.Success(bodyText, newEtagValue);
}
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
case 404 -> new FetchResult.PermanentError(); // never try again
default -> new FetchResult.TransientError(); // we try again later
};
}
}
catch (Exception ex) {
@ -267,7 +296,8 @@ public class FeedFetcherService {
}
public sealed interface FetchResult {
record Success(String value) implements FetchResult {}
record Success(String value, String etag) implements FetchResult {}
record NotModified() implements FetchResult {}
record TransientError() implements FetchResult {}
record PermanentError() implements FetchResult {}
}

View File

@ -96,7 +96,6 @@ class FeedFetcherServiceTest extends AbstractModule {
feedDb.switchDb(writer);
}
feedFetcherService.setDeterministic();
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
@ -104,6 +103,26 @@ class FeedFetcherServiceTest extends AbstractModule {
Assertions.assertFalse(result.isEmpty());
}
@Tag("flaky")
@Test
public void testFetchRepeatedly() throws Exception {
try (var writer = feedDb.createWriter()) {
writer.saveFeed(new FeedItems("www.marginalia.nu", "https://www.marginalia.nu/log/index.xml", "", List.of()));
feedDb.switchDb(writer);
}
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
System.out.println(result);
Assertions.assertFalse(result.isEmpty());
}
@Tag("flaky")
@Test
public void test404() throws Exception {
@ -112,7 +131,6 @@ class FeedFetcherServiceTest extends AbstractModule {
feedDb.switchDb(writer);
}
feedFetcherService.setDeterministic();
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
// We forget the feed on a 404 error