(crawler) Correct Spec Provider so that it uses VISITED_URLS rather than KNOWN_URLS when growing domains

This commit is contained in:
Viktor Lofgren 2024-10-15 14:21:59 +02:00
parent 89dd201a7b
commit 4b16022556

View File

@ -16,6 +16,7 @@ import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.stream.Stream; import java.util.stream.Stream;
// FIXME: This design is a vestige from when there were multiple sources of crawl data. It should be simplified and probably merged with CrawlerMain.
public class CrawlSpecProvider { public class CrawlSpecProvider {
private final HikariDataSource dataSource; private final HikariDataSource dataSource;
private final ProcessConfiguration processConfiguration; private final ProcessConfiguration processConfiguration;
@ -48,15 +49,13 @@ public class CrawlSpecProvider {
blacklist.waitUntilLoaded(); blacklist.waitUntilLoaded();
List<Integer> domainIds = new ArrayList<>(10_000);
try (var conn = dataSource.getConnection(); try (var conn = dataSource.getConnection();
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0"); var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
var query = conn.prepareStatement(""" var query = conn.prepareStatement("""
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID SELECT DOMAIN_NAME, COALESCE(VISITED_URLS, 0), EC_DOMAIN.ID
FROM EC_DOMAIN FROM EC_DOMAIN
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0 WHERE NODE_AFFINITY=?
""") """)
) )
{ {
@ -76,17 +75,12 @@ public class CrawlSpecProvider {
int id = rs.getInt(3); int id = rs.getInt(3);
if (blacklist.isBlacklisted(id)) if (blacklist.isBlacklisted(id))
continue; continue;
domainIds.add(id);
int urls = rs.getInt(2); int urls = rs.getInt(2);
double growthFactor;
if (urls < MID_URLS_PER_DOMAIN) { double growthFactor = urls < MID_URLS_PER_DOMAIN
growthFactor = Math.max(2.5, URL_GROWTH_FACTOR); ? Math.max(2.5, URL_GROWTH_FACTOR)
} : URL_GROWTH_FACTOR;
else {
growthFactor = URL_GROWTH_FACTOR;
}
int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN); int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);