(crawler) Pull additional new domains from node-affinity 0

Node affinity 0 was previously somewhat ambiguously defined; it now indicates that a domain is up for grabs by the next crawler.
This commit is contained in:
Viktor Lofgren 2024-09-01 13:00:36 +02:00
parent 3d77456110
commit 74148c790e

View File

@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
blacklist.waitUntilLoaded(); blacklist.waitUntilLoaded();
List<Integer> domainIds = new ArrayList<>(10_000);
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
var query = conn.prepareStatement(""" var query = conn.prepareStatement("""
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
FROM EC_DOMAIN FROM EC_DOMAIN
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY=? WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
""")) """)
)
{ {
// Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
// to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
assignFreeDomains.setInt(1, processConfiguration.node());
assignFreeDomains.executeUpdate();
// Fetch the domains to be crawled
query.setInt(1, processConfiguration.node()); query.setInt(1, processConfiguration.node());
query.setFetchSize(10_000); query.setFetchSize(10_000);
var rs = query.executeQuery(); var rs = query.executeQuery();
while (rs.next()) { while (rs.next()) {
// Skip blacklisted domains // Skip blacklisted domains
if (blacklist.isBlacklisted(rs.getInt(3))) int id = rs.getInt(3);
if (blacklist.isBlacklisted(id))
continue; continue;
domainIds.add(id);
int urls = rs.getInt(2); int urls = rs.getInt(2);
double growthFactor; double growthFactor;
@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
domains.add(record); domains.add(record);
} }
} }
logger.info("Loaded {} domains", domains.size()); logger.info("Loaded {} domains", domains.size());