mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(feed-fetcher) Make feed fetcher requests conditional
Add `If-None-Match` and `If-Modified-Since` headers as appropriate to the feed fetcher's requests. On well-configured web servers, this should short-circuit the request and reduce the amount of bandwidth and processing that is necessary. A new table was added to the FeedDb to hold one etag per domain. If-Modified-Since semantics are based on the creation date for the feed database, which should serve as a cutoff date for the earliest update we can have received. This completes the changes for Issue #136.
This commit is contained in:
parent
927bc0b63c
commit
3bc99639a0
@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedDefinition;
|
||||
import nu.marginalia.rss.model.FeedItems;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import org.jetbrains.annotations.Nullable;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -127,6 +128,26 @@ public class FeedDb {
|
||||
return FeedItems.none();
|
||||
}
|
||||
|
||||
|
||||
@Nullable
|
||||
public String getEtag(EdgeDomain domain) {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
try {
|
||||
if (reader != null) {
|
||||
return reader.getEtag(domain);
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Error getting etag for " + domain, e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
public Optional<String> getFeedAsJson(String domain) {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
@ -214,7 +235,7 @@ public class FeedDb {
|
||||
|
||||
public Instant getFetchTime() {
|
||||
if (!Files.exists(readerDbPath)) {
|
||||
return Instant.ofEpochMilli(0);
|
||||
return Instant.EPOCH;
|
||||
}
|
||||
|
||||
try {
|
||||
@ -224,7 +245,23 @@ public class FeedDb {
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to read the creatiom time of {}", readerDbPath);
|
||||
return Instant.ofEpochMilli(0);
|
||||
return Instant.EPOCH;
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasData() {
|
||||
if (!feedDbEnabled) {
|
||||
throw new IllegalStateException("Feed database is disabled on this node");
|
||||
}
|
||||
|
||||
// Capture the current reader to avoid concurrency issues
|
||||
FeedDbReader reader = this.reader;
|
||||
|
||||
if (reader != null) {
|
||||
return reader.hasData();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedItems;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
@ -32,6 +33,7 @@ public class FeedDbReader implements AutoCloseable {
|
||||
try (var stmt = connection.createStatement()) {
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)");
|
||||
}
|
||||
}
|
||||
|
||||
@ -106,6 +108,22 @@ public class FeedDbReader implements AutoCloseable {
|
||||
return FeedItems.none();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public String getEtag(EdgeDomain domain) {
|
||||
try (var stmt = connection.prepareStatement("SELECT etag FROM etags WHERE DOMAIN = ?")) {
|
||||
stmt.setString(1, domain.toString());
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
if (rs.next()) {
|
||||
return rs.getString(1);
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Error getting etag for " + domain, e);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
private FeedItems deserialize(String string) {
|
||||
return gson.fromJson(string, FeedItems.class);
|
||||
}
|
||||
@ -141,4 +159,18 @@ public class FeedDbReader implements AutoCloseable {
|
||||
}
|
||||
|
||||
|
||||
public boolean hasData() {
|
||||
try (var stmt = connection.prepareStatement("SELECT 1 FROM feed LIMIT 1")) {
|
||||
var rs = stmt.executeQuery();
|
||||
if (rs.next()) {
|
||||
return rs.getBoolean(1);
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -20,6 +20,7 @@ public class FeedDbWriter implements AutoCloseable {
|
||||
private final Connection connection;
|
||||
private final PreparedStatement insertFeedStmt;
|
||||
private final PreparedStatement insertErrorStmt;
|
||||
private final PreparedStatement insertEtagStmt;
|
||||
private final Path dbPath;
|
||||
|
||||
private volatile boolean closed = false;
|
||||
@ -34,10 +35,12 @@ public class FeedDbWriter implements AutoCloseable {
|
||||
try (var stmt = connection.createStatement()) {
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)");
|
||||
stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)");
|
||||
}
|
||||
|
||||
insertFeedStmt = connection.prepareStatement("INSERT INTO feed (domain, feed) VALUES (?, ?)");
|
||||
insertErrorStmt = connection.prepareStatement("INSERT INTO errors (domain, cnt) VALUES (?, ?)");
|
||||
insertEtagStmt = connection.prepareStatement("INSERT INTO etags (domain, etag) VALUES (?, ?)");
|
||||
}
|
||||
|
||||
public Path getDbPath() {
|
||||
@ -56,6 +59,20 @@ public class FeedDbWriter implements AutoCloseable {
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void saveEtag(String domain, String etag) {
|
||||
if (etag == null || etag.isBlank())
|
||||
return;
|
||||
|
||||
try {
|
||||
insertEtagStmt.setString(1, domain.toLowerCase());
|
||||
insertEtagStmt.setString(2, etag);
|
||||
insertEtagStmt.executeUpdate();
|
||||
}
|
||||
catch (SQLException e) {
|
||||
logger.error("Error saving etag for " + domain, e);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void setErrorCount(String domain, int count) {
|
||||
try {
|
||||
insertErrorStmt.setString(1, domain);
|
||||
|
@ -34,9 +34,7 @@ import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Duration;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.*;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Executors;
|
||||
@ -61,7 +59,6 @@ public class FeedFetcherService {
|
||||
private final DomainLocks domainLocks = new DomainLocks();
|
||||
|
||||
private volatile boolean updating;
|
||||
private boolean deterministic = false;
|
||||
|
||||
@Inject
|
||||
public FeedFetcherService(FeedDb feedDb,
|
||||
@ -93,11 +90,6 @@ public class FeedFetcherService {
|
||||
REFRESH
|
||||
};
|
||||
|
||||
/** Disable random-based heuristics. This is meant for testing */
|
||||
public void setDeterministic() {
|
||||
this.deterministic = true;
|
||||
}
|
||||
|
||||
public void updateFeeds(UpdateMode updateMode) throws IOException {
|
||||
if (updating) // Prevent concurrent updates
|
||||
{
|
||||
@ -137,37 +129,37 @@ public class FeedFetcherService {
|
||||
for (var feed : definitions) {
|
||||
executor.submitQuietly(() -> {
|
||||
try {
|
||||
var oldData = feedDb.getFeed(new EdgeDomain(feed.domain()));
|
||||
EdgeDomain domain = new EdgeDomain(feed.domain());
|
||||
var oldData = feedDb.getFeed(domain);
|
||||
|
||||
// If we have existing data, we might skip updating it with a probability that increases with time,
|
||||
// this is to avoid hammering the feeds that are updated very rarely and save some time and resources
|
||||
// on our end
|
||||
@Nullable
|
||||
String ifModifiedSinceDate = switch(updateMode) {
|
||||
case REFRESH -> getIfModifiedSinceDate(feedDb);
|
||||
case CLEAN -> null;
|
||||
};
|
||||
|
||||
/* Disable for now:
|
||||
|
||||
if (!oldData.isEmpty()) {
|
||||
Duration duration = feed.durationSinceUpdated();
|
||||
long daysSinceUpdate = duration.toDays();
|
||||
|
||||
|
||||
if (deterministic || (daysSinceUpdate > 2 && ThreadLocalRandom.current()
|
||||
.nextInt(1, 1 + (int) Math.min(10, daysSinceUpdate) / 2) > 1)) {
|
||||
// Skip updating this feed, just write the old data back instead
|
||||
writer.saveFeed(oldData);
|
||||
return;
|
||||
}
|
||||
}
|
||||
*/
|
||||
@Nullable
|
||||
String ifNoneMatchTag = switch (updateMode) {
|
||||
case REFRESH -> feedDb.getEtag(domain);
|
||||
case CLEAN -> null;
|
||||
};
|
||||
|
||||
FetchResult feedData;
|
||||
try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) {
|
||||
feedData = fetchFeedData(feed, client);
|
||||
feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag);
|
||||
} catch (Exception ex) {
|
||||
feedData = new FetchResult.TransientError();
|
||||
}
|
||||
|
||||
switch (feedData) {
|
||||
case FetchResult.Success(String value) -> writer.saveFeed(parseFeed(value, feed));
|
||||
case FetchResult.Success(String value, String etag) -> {
|
||||
writer.saveEtag(feed.domain(), etag);
|
||||
writer.saveFeed(parseFeed(value, feed));
|
||||
}
|
||||
case FetchResult.NotModified() -> {
|
||||
writer.saveEtag(feed.domain(), ifNoneMatchTag);
|
||||
writer.saveFeed(oldData);
|
||||
}
|
||||
case FetchResult.TransientError() -> {
|
||||
int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0);
|
||||
writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount);
|
||||
@ -214,36 +206,73 @@ public class FeedFetcherService {
|
||||
}
|
||||
}
|
||||
|
||||
private FetchResult fetchFeedData(FeedDefinition feed, HttpClient client) {
|
||||
@Nullable
|
||||
static String getIfModifiedSinceDate(FeedDb feedDb) {
|
||||
|
||||
// If the db is fresh, we don't send If-Modified-Since
|
||||
if (!feedDb.hasData())
|
||||
return null;
|
||||
|
||||
Instant cutoffInstant = feedDb.getFetchTime();
|
||||
|
||||
// If we're unable to establish fetch time, we don't send If-Modified-Since
|
||||
if (cutoffInstant == Instant.EPOCH)
|
||||
return null;
|
||||
|
||||
return cutoffInstant.atZone(ZoneId.of("GMT")).format(DateTimeFormatter.RFC_1123_DATE_TIME);
|
||||
}
|
||||
|
||||
private FetchResult fetchFeedData(FeedDefinition feed,
|
||||
HttpClient client,
|
||||
@Nullable String ifModifiedSinceDate,
|
||||
@Nullable String ifNoneMatchTag)
|
||||
{
|
||||
try {
|
||||
URI uri = new URI(feed.feedUrl());
|
||||
|
||||
HttpRequest getRequest = HttpRequest.newBuilder()
|
||||
HttpRequest.Builder requestBuilder = HttpRequest.newBuilder()
|
||||
.GET()
|
||||
.uri(uri)
|
||||
.header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
|
||||
.header("Accept-Encoding", "gzip")
|
||||
.header("Accept", "text/*, */*;q=0.9")
|
||||
.timeout(Duration.ofSeconds(15))
|
||||
.build();
|
||||
;
|
||||
|
||||
if (ifModifiedSinceDate != null) {
|
||||
requestBuilder.header("If-Modified-Since", ifModifiedSinceDate);
|
||||
}
|
||||
|
||||
if (ifNoneMatchTag != null) {
|
||||
requestBuilder.header("If-None-Match", ifNoneMatchTag);
|
||||
}
|
||||
|
||||
HttpRequest getRequest = requestBuilder.build();
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
|
||||
if (429 == rs.statusCode()) {
|
||||
HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
if (rs.statusCode() == 429) { // Too Many Requests
|
||||
int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
|
||||
Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
|
||||
} else if (200 == rs.statusCode()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String newEtagValue = rs.headers().firstValue("ETag").orElse("");
|
||||
|
||||
return switch (rs.statusCode()) {
|
||||
case 200 -> {
|
||||
byte[] responseData = getResponseData(rs);
|
||||
|
||||
String contentType = rs.headers().firstValue("Content-Type").orElse("");
|
||||
String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
|
||||
|
||||
return new FetchResult.Success(bodyText);
|
||||
} else if (404 == rs.statusCode()) {
|
||||
return new FetchResult.PermanentError(); // never try again
|
||||
} else {
|
||||
return new FetchResult.TransientError(); // we try again in a few days
|
||||
yield new FetchResult.Success(bodyText, newEtagValue);
|
||||
}
|
||||
case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
|
||||
case 404 -> new FetchResult.PermanentError(); // never try again
|
||||
default -> new FetchResult.TransientError(); // we try again later
|
||||
};
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
@ -267,7 +296,8 @@ public class FeedFetcherService {
|
||||
}
|
||||
|
||||
public sealed interface FetchResult {
|
||||
record Success(String value) implements FetchResult {}
|
||||
record Success(String value, String etag) implements FetchResult {}
|
||||
record NotModified() implements FetchResult {}
|
||||
record TransientError() implements FetchResult {}
|
||||
record PermanentError() implements FetchResult {}
|
||||
}
|
||||
|
@ -96,7 +96,6 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
feedDb.switchDb(writer);
|
||||
}
|
||||
|
||||
feedFetcherService.setDeterministic();
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
|
||||
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
|
||||
@ -104,6 +103,26 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
Assertions.assertFalse(result.isEmpty());
|
||||
}
|
||||
|
||||
@Tag("flaky")
|
||||
@Test
|
||||
public void testFetchRepeatedly() throws Exception {
|
||||
try (var writer = feedDb.createWriter()) {
|
||||
writer.saveFeed(new FeedItems("www.marginalia.nu", "https://www.marginalia.nu/log/index.xml", "", List.of()));
|
||||
feedDb.switchDb(writer);
|
||||
}
|
||||
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
|
||||
|
||||
var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
|
||||
System.out.println(result);
|
||||
Assertions.assertFalse(result.isEmpty());
|
||||
}
|
||||
|
||||
@Tag("flaky")
|
||||
@Test
|
||||
public void test404() throws Exception {
|
||||
@ -112,7 +131,6 @@ class FeedFetcherServiceTest extends AbstractModule {
|
||||
feedDb.switchDb(writer);
|
||||
}
|
||||
|
||||
feedFetcherService.setDeterministic();
|
||||
feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
|
||||
|
||||
// We forget the feed on a 404 error
|
||||
|
Loading…
Reference in New Issue
Block a user