mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00

(search) Reduce the number of db queries a bit by caching data that doesn't change too often

This commit is contained in:
parent b245cc9f38
commit 2d17233366
DbDomainStatsExportMultitool.java (deleted)
@@ -1,118 +0,0 @@
package nu.marginalia.db;

import com.zaxxer.hikari.HikariDataSource;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.OptionalInt;

/** Class used in exporting data. This is intended to be used for a brief time
 * and then discarded, not kept around as a service.
 */
public class DbDomainStatsExportMultitool implements AutoCloseable {
    private final Connection connection;
    private final int nodeId;
    private final PreparedStatement knownUrlsQuery;
    private final PreparedStatement visitedUrlsQuery;
    private final PreparedStatement goodUrlsQuery;
    private final PreparedStatement domainNameToId;

    private final PreparedStatement allDomainsQuery;
    private final PreparedStatement crawlQueueDomains;
    private final PreparedStatement indexedDomainsQuery;

    public DbDomainStatsExportMultitool(HikariDataSource dataSource, int nodeId) throws SQLException {
        this.connection = dataSource.getConnection();
        this.nodeId = nodeId;

        knownUrlsQuery = connection.prepareStatement("""
                SELECT KNOWN_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        visitedUrlsQuery = connection.prepareStatement("""
                SELECT VISITED_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        goodUrlsQuery = connection.prepareStatement("""
                SELECT GOOD_URLS
                FROM EC_DOMAIN INNER JOIN DOMAIN_METADATA
                ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
                WHERE DOMAIN_NAME=?
                """);
        domainNameToId = connection.prepareStatement("""
                SELECT ID
                FROM EC_DOMAIN
                WHERE DOMAIN_NAME=?
                """);
        allDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                """);
        crawlQueueDomains = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM CRAWL_QUEUE
                """);
        indexedDomainsQuery = connection.prepareStatement("""
                SELECT DOMAIN_NAME
                FROM EC_DOMAIN
                WHERE INDEXED > 0
                """);
    }

    public OptionalInt getVisitedUrls(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, visitedUrlsQuery);
    }

    public OptionalInt getDomainId(String domainName) throws SQLException {
        return executeNameToIntQuery(domainName, domainNameToId);
    }

    public List<String> getCrawlQueueDomains() throws SQLException {
        return executeListQuery(crawlQueueDomains, 100);
    }

    public List<String> getAllIndexedDomains() throws SQLException {
        return executeListQuery(indexedDomainsQuery, 100_000);
    }

    private OptionalInt executeNameToIntQuery(String domainName, PreparedStatement statement)
            throws SQLException {
        statement.setString(1, domainName);
        var rs = statement.executeQuery();

        if (rs.next()) {
            return OptionalInt.of(rs.getInt(1));
        }

        return OptionalInt.empty();
    }

    private List<String> executeListQuery(PreparedStatement statement, int sizeHint) throws SQLException {
        List<String> ret = new ArrayList<>(sizeHint);

        var rs = statement.executeQuery();

        while (rs.next()) {
            ret.add(rs.getString(1));
        }

        return ret;
    }

    @Override
    public void close() throws SQLException {
        knownUrlsQuery.close();
        goodUrlsQuery.close();
        visitedUrlsQuery.close();
        allDomainsQuery.close();
        crawlQueueDomains.close();
        domainNameToId.close();
        connection.close();
    }
}
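For reference, the deleted class bundles a pooled connection and a set of prepared statements behind AutoCloseable, so short-lived export runs can open it, query, and tear everything down in one scope. A minimal sketch of how a caller might have used it; the configured HikariDataSource and the node id value are assumptions, neither appears in this diff:

    // Hypothetical caller: try-with-resources guarantees close() runs,
    // releasing the prepared statements and returning the connection to the pool.
    try (var multitool = new DbDomainStatsExportMultitool(dataSource, 0)) {
        for (String domain : multitool.getAllIndexedDomains()) {
            multitool.getDomainId(domain)
                     .ifPresent(id -> System.out.println(domain + " -> " + id));
        }
    }
    catch (SQLException ex) {
        ex.printStackTrace();
    }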
SearchSiteInfoService.java
@@ -26,8 +26,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.sql.SQLException;
-import java.time.Duration;
-import java.time.Instant;
 import java.util.*;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.Future;
@@ -69,9 +67,11 @@ public class SearchSiteInfoService {
         this.screenshotService = screenshotService;
         this.dataSource = dataSource;
         this.searchSiteSubscriptions = searchSiteSubscriptions;
+
+        Thread.ofPlatform().name("Recently Added Domains Model Updater").start(this::modelUpdater);
     }

-    private volatile SiteOverviewModel model = new SiteOverviewModel(List.of(), Instant.EPOCH);
+    private volatile SiteOverviewModel model = new SiteOverviewModel(List.of());

     @GET
     @Path("/site")
@@ -81,29 +81,19 @@ public class SearchSiteInfoService {
             return new MapModelAndView("redirect.jte", Map.of("url", "/site/"+domain));
         }

-        if (model.age().compareTo(Duration.ofMinutes(15)) > 0) {
-            updateModel();
-        }
-
         return new MapModelAndView("siteinfo/start.jte",
                 Map.of("navbar", NavbarModel.SITEINFO,
                         "model", model));
     }

-    /** Update the model if it is older than 15 minutes.
-     * This query is expensive and should not be run too often,
-     * and the data doesn't change that often either.
-     * <p></p>
-     * This method is synchronized to avoid multiple threads updating the model at the same time.
-     */
-    private synchronized void updateModel() {
-        var currentModel = model;
-        if (currentModel.age().compareTo(Duration.ofMinutes(15)) < 0) {
-            return;
-        }
-
+    private void modelUpdater() {
+        while (!Thread.interrupted()) {
             List<SiteOverviewModel.DiscoveredDomain> domains = new ArrayList<>();

+            // This query can be quite expensive, so we can't run it on demand
+            // for every request. Instead, we run it every 15 minutes and cache
+            // the result.
+
             try (var conn = dataSource.getConnection();
                  var stmt = conn.prepareStatement("SELECT DOMAIN_NAME, DISCOVER_DATE FROM EC_DOMAIN WHERE NODE_AFFINITY = 0 ORDER BY ID DESC LIMIT 10")) {
@@ -111,25 +101,23 @@ public class SearchSiteInfoService {
                 while (rs.next()) {
                     domains.add(new SiteOverviewModel.DiscoveredDomain(rs.getString("DOMAIN_NAME"), rs.getString("DISCOVER_DATE")));
                 }
-            }
-            catch (SQLException ex) {
+            } catch (SQLException ex) {
                 throw new RuntimeException();
             }

             model = new SiteOverviewModel(domains);

+            try {
+                TimeUnit.MINUTES.sleep(15);
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+                break;
+            }
+        }
+    }

-    public record SiteOverviewModel(List<DiscoveredDomain> domains, Instant captureTime) {
-
-        public SiteOverviewModel(List<DiscoveredDomain> domains) {
-            this(domains, Instant.now());
-        }
-
+    public record SiteOverviewModel(List<DiscoveredDomain> domains) {
         public record DiscoveredDomain(String name, String timestamp) {}

-        public Duration age() {
-            return Duration.between(captureTime, Instant.now());
-        }
     }

     @GET
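Taken together, the change replaces an on-demand, age-checked refresh with a single background updater: request handlers now only read a volatile field holding an immutable snapshot, while one platform thread (the Thread.ofPlatform() builder API) rebuilds that snapshot every 15 minutes. A stripped-down sketch of the same pattern, with hypothetical names (CachedOverview, refreshLoop, loadFromDatabase):

    import java.util.List;
    import java.util.concurrent.TimeUnit;

    // Minimal sketch of the cache-and-refresh pattern above; names are illustrative.
    class CachedOverview {
        // Immutable snapshot published through a volatile field: readers always
        // see a fully constructed list, with no locking on the read path.
        private volatile List<String> snapshot = List.of();

        CachedOverview() {
            Thread.ofPlatform().name("overview-updater").start(this::refreshLoop);
        }

        List<String> current() {
            return snapshot; // request path: no database access at all
        }

        private void refreshLoop() {
            while (!Thread.interrupted()) {
                // Single writer, so a plain volatile store is enough.
                snapshot = loadFromDatabase();
                try {
                    TimeUnit.MINUTES.sleep(15);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    break;
                }
            }
        }

        private List<String> loadFromDatabase() {
            // Stand-in for the expensive EC_DOMAIN query in the real code.
            return List.of("example.com");
        }
    }

A side effect visible in the record diff: once the refresh schedule lives in the updater thread, the captureTime field and the age() check on SiteOverviewModel become unnecessary and are dropped.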
@@ -1,5 +1,4 @@
@import nu.marginalia.db.DbDomainQueries
@import nu.marginalia.model.EdgeDomain
@import nu.marginalia.search.svc.SearchSiteInfoService
@import nu.marginalia.search.svc.SearchSiteInfoService.*
@import nu.marginalia.search.model.UrlDetails