mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(crawler) Correct CrawlSpecProvider so that it uses VISITED_URLS rather than KNOWN_URLS when growing domains
This commit is contained in:
parent
89dd201a7b
commit
4b16022556
@ -16,6 +16,7 @@ import java.util.Collections;
|
|||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
// FIXME: This design is a vestige from when there were multiple sources of crawl data. It should be simplified and probably merged with CrawlerMain.
|
||||||
public class CrawlSpecProvider {
|
public class CrawlSpecProvider {
|
||||||
private final HikariDataSource dataSource;
|
private final HikariDataSource dataSource;
|
||||||
private final ProcessConfiguration processConfiguration;
|
private final ProcessConfiguration processConfiguration;
|
||||||
@ -48,15 +49,13 @@ public class CrawlSpecProvider {
|
|||||||
|
|
||||||
blacklist.waitUntilLoaded();
|
blacklist.waitUntilLoaded();
|
||||||
|
|
||||||
List<Integer> domainIds = new ArrayList<>(10_000);
|
|
||||||
|
|
||||||
try (var conn = dataSource.getConnection();
|
try (var conn = dataSource.getConnection();
|
||||||
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
|
var assignFreeDomains = conn.prepareStatement("UPDATE EC_DOMAIN SET NODE_AFFINITY=? WHERE NODE_AFFINITY=0");
|
||||||
var query = conn.prepareStatement("""
|
var query = conn.prepareStatement("""
|
||||||
SELECT DOMAIN_NAME, COALESCE(KNOWN_URLS, 0), EC_DOMAIN.ID
|
SELECT DOMAIN_NAME, COALESCE(VISITED_URLS, 0), EC_DOMAIN.ID
|
||||||
FROM EC_DOMAIN
|
FROM EC_DOMAIN
|
||||||
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
LEFT JOIN DOMAIN_METADATA ON EC_DOMAIN.ID=DOMAIN_METADATA.ID
|
||||||
WHERE NODE_AFFINITY=? OR NODE_AFFINITY=0
|
WHERE NODE_AFFINITY=?
|
||||||
""")
|
""")
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
@ -76,17 +75,12 @@ public class CrawlSpecProvider {
|
|||||||
int id = rs.getInt(3);
|
int id = rs.getInt(3);
|
||||||
if (blacklist.isBlacklisted(id))
|
if (blacklist.isBlacklisted(id))
|
||||||
continue;
|
continue;
|
||||||
domainIds.add(id);
|
|
||||||
|
|
||||||
int urls = rs.getInt(2);
|
int urls = rs.getInt(2);
|
||||||
double growthFactor;
|
|
||||||
|
|
||||||
if (urls < MID_URLS_PER_DOMAIN) {
|
double growthFactor = urls < MID_URLS_PER_DOMAIN
|
||||||
growthFactor = Math.max(2.5, URL_GROWTH_FACTOR);
|
? Math.max(2.5, URL_GROWTH_FACTOR)
|
||||||
}
|
: URL_GROWTH_FACTOR;
|
||||||
else {
|
|
||||||
growthFactor = URL_GROWTH_FACTOR;
|
|
||||||
}
|
|
||||||
|
|
||||||
int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
|
int urlsToFetch = Math.clamp((int) (growthFactor * rs.getInt(2)), MIN_URLS_PER_DOMAIN, MAX_URLS_PER_DOMAIN);
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user