(search) Reduce the number of db queries a bit by caching data that doesn't change too often

This commit is contained in:
Viktor Lofgren 2025-01-10 13:53:56 +01:00
parent b245cc9f38
commit 2d17233366
3 changed files with 27 additions and 158 deletions

View File

@ -1,118 +0,0 @@
package nu.marginalia.db;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;
/** Class used in exporting data. This is intended to be used for a brief time
* and then discarded, not kept around as a service.
*/
public class DbDomainStatsExportMultitool implements AutoCloseable {
private final Connection connection;
private final int nodeId;
private final PreparedStatement knownUrlsQuery;
private final PreparedStatement visitedUrlsQuery;
private final PreparedStatement goodUrlsQuery;
private final PreparedStatement domainNameToId;
private final PreparedStatement allDomainsQuery;
private final PreparedStatement crawlQueueDomains;
private final PreparedStatement indexedDomainsQuery;
public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
this.connection = dataSource.getConnection();
this.nodeId = nodeId;
knownUrlsQuery = connection.prepareStatement("""
SELECT KNOWN_URLS
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE DOMAIN_NAME=?
""");
visitedUrlsQuery = connection.prepareStatement("""
SELECT VISITED_URLS
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE DOMAIN_NAME=?
""");
goodUrlsQuery = connection.prepareStatement("""
SELECT GOOD_URLS
FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE DOMAIN_NAME=?
""");
domainNameToId = connection.prepareStatement("""
SELECT ID
FROM EC_DOMAIN
WHERE DOMAIN_NAME=?
""");
allDomainsQuery = connection.prepareStatement("""
SELECT DOMAIN_NAME
FROM EC_DOMAIN
""");
crawlQueueDomains = connection.prepareStatement("""
SELECT DOMAIN_NAME
FROM CRAWL_QUEUE
""");
indexedDomainsQuery = connection.prepareStatement("""
SELECT DOMAIN_NAME
FROM EC_DOMAIN
WHERE INDEXED > 0
""");
}
public OptionalInt getVisitedUrls(String domainName) throws SQLException {
return executeNameToIntQuery(domainName, visitedUrlsQuery);
}
public OptionalInt getDomainId(String domainName) throws SQLException {
return executeNameToIntQuery(domainName, domainNameToId);
}
public List<String> getCrawlQueueDomains() throws SQLException {
return executeListQuery(crawlQueueDomains, 100);
}
public List<String> getAllIndexedDomains() throws SQLException {
return executeListQuery(indexedDomainsQuery, 100_000);
}
private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
throws SQLException {
statement.setString(1, domainName);
var rs = statement.executeQuery();
if (rs.next()) {
return OptionalInt.of(rs.getInt(1));
}
return OptionalInt.empty();
}
private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
List<String> ret = new ArrayList<>(sizeHint);
var rs = statement.executeQuery();
while (rs.next()) {
ret.add(rs.getString(1));
}
return ret;
}
@Override
public void close() throws SQLException {
knownUrlsQuery.close();
goodUrlsQuery.close();
visitedUrlsQuery.close();
allDomainsQuery.close();
crawlQueueDomains.close();
domainNameToId.close();
connection.close();
}
}

View File

@ -26,8 +26,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.sql.SQLException;
import java.time.Duration;
import java.time.Instant;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
@ -69,9 +67,11 @@ public class SearchSiteInfoService {
this.screenshotService = screenshotService;
this.dataSource = dataSource;
this.searchSiteSubscriptions = searchSiteSubscriptions;
Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
}
private volatile SiteOverviewModel model = new SiteOverviewModel(List.of(), Instant.EPOCH);
private volatile SiteOverviewModel model = new SiteOverviewModel(List.of());
@GET
@Path("/site")
@ -81,55 +81,43 @@ public class SearchSiteInfoService {
return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
}
if (model.age().compareTo(Duration.ofMinutes(15)) > 0) {
updateModel();
}
return new MapModelAndView("siteinfo/start.jte",
Map.of("navbar", NavbarModel.SITEINFO,
"model", model));
}
/** Update the model if it is older than 15 minutes.
* This query is expensive and should not be run too often,
* and the data doesn't change that often either.
* <p></p>
* This method is synchronized to avoid multiple threads updating the model at the same time.
*/
private synchronized void updateModel() {
var currentModel = model;
if (currentModel.age().compareTo(Duration.ofMinutes(15)) < 0) {
return;
}
private void modelUpdater() {
while (!Thread.interrupted()) {
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();
// This query can be quite expensive, so we can't run it on demand
// for every request. Instead, we run it every 15 minutes and cache
// the result.
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
var rs = stmt.executeQuery();
while (rs.next()) {
domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
var rs = stmt.executeQuery();
while (rs.next()) {
domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
}
} catch (SQLException ex) {
throw new RuntimeException();
}
model = new SiteOverviewModel(domains);
try {
TimeUnit.MINUTES.sleep(15);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
break;
}
}
catch (SQLException ex) {
throw new RuntimeException();
}
model = new SiteOverviewModel(domains);
}
public record SiteOverviewModel(List<DiscoveredDomain> domains, Instant captureTime) {
public SiteOverviewModel(List<DiscoveredDomain> domains) {
this(domains, Instant.now());
}
public record SiteOverviewModel(List<DiscoveredDomain> domains) {
public record DiscoveredDomain(String name, String timestamp) {}
public Duration age() {
return Duration.between(captureTime, Instant.now());
}
}
@GET

View File

@ -1,5 +1,4 @@
@import nu.marginalia.db.DbDomainQueries
@import nu.marginalia.model.EdgeDomain
@import nu.marginalia.search.svc.SearchSiteInfoService
@import nu.marginalia.search.svc.SearchSiteInfoService.*
@import nu.marginalia.search.model.UrlDetails