mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(crawler) Pull additional new domains from node-affinity 0
Node affinity 0 was previously somewhat ambiguously defined; it now indicates that a domain is up for grabs by the next crawler.
This commit is contained in:
parent
3d77456110
commit
74148c790e
@ -46,22 +46,35 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
|
|||||||
|
|
||||||
blacklist.waitUntilLoaded();
|
blacklist.waitUntilLoaded();
|
||||||
|
|
||||||
|
List<Integer> domainIds = new ArrayList<>(10_000);
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
|
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
|
||||||
var query = conn.prepareStatement("""
|
var query = conn.prepareStatement("""
|
||||||
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
|
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
|
||||||
FROM EC_DOMAIN
|
FROM EC_DOMAIN
|
||||||
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||||
WHERE NODE_AFFINITY=?
|
WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
|
||||||
"""))
|
""")
|
||||||
|
)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// Assign any domains with node_affinity=0 to this node. We must do this now, before we start crawling
|
||||||
|
// to avoid race conditions with other crawl runs. We don't want multiple crawlers to crawl the same domain.
|
||||||
|
assignFreeDomains.setInt(1, processConfiguration.node());
|
||||||
|
assignFreeDomains.executeUpdate();
|
||||||
|
|
||||||
|
// Fetch the domains to be crawled
|
||||||
query.setInt(1, processConfiguration.node());
|
query.setInt(1, processConfiguration.node());
|
||||||
query.setFetchSize(10_000);
|
query.setFetchSize(10_000);
|
||||||
var rs = query.executeQuery();
|
var rs = query.executeQuery();
|
||||||
|
|
||||||
while (rs.next()) {
|
while (rs.next()) {
|
||||||
// Skip blacklisted domains
|
// Skip blacklisted domains
|
||||||
if (blacklist.isBlacklisted(rs.getInt(3)))
|
int id = rs.getInt(3);
|
||||||
|
if (blacklist.isBlacklisted(id))
|
||||||
continue;
|
continue;
|
||||||
|
domainIds.add(id);
|
||||||
|
|
||||||
int urls = rs.getInt(2);
|
int urls = rs.getInt(2);
|
||||||
double growthFactor;
|
double growthFactor;
|
||||||
@ -83,6 +96,7 @@ public class DbCrawlSpecProvider implements CrawlSpecProvider {
|
|||||||
|
|
||||||
domains.add(record);
|
domains.add(record);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info("Loaded {} domains", domains.size());
|
logger.info("Loaded {} domains", domains.size());
|
||||||
|
Loading…
Reference in New Issue
Block a user