Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
(crawler) Fix bug: poor handling of duplicate ids
* Also clean up the code a bit
Parent: dbb758d1a8
Commit: f03146de4b
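The fix hinges on the contract of java.util.Set.add, which returns false when the element is already present. A quick illustration of that contract (the class name SetAddDemo and the id values are hypothetical, not from the project):

import java.util.HashSet;
import java.util.Set;

class SetAddDemo {
    public static void main(String[] args) {
        Set<String> processedIds = new HashSet<>();
        System.out.println(processedIds.add("site-1"));  // true: first time this id is seen
        System.out.println(processedIds.add("site-1"));  // false: duplicate, would be skipped
    }
}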
@@ -18,6 +18,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.concurrent.*;
 
 public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
     final int poolSize = Integer.getInteger("crawler.pool-size", 512);
     final int poolQueueSize = 32;
 
+    private final Set<String> processedIds = new HashSet<>();
+
     AbortMonitor abortMonitor = AbortMonitor.getInstance();
     Semaphore taskSem = new Semaphore(poolSize);
 
@@ -87,11 +91,27 @@ public class CrawlerMain implements AutoCloseable {
         logger.info("Let's go");
 
+        // TODO: Make this into an iterable instead so we can abort it
         plan.forEachCrawlingSpecification(this::startCrawlTask);
     }
 
 
     private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
-        if (abortMonitor.isAlive()) {
+        if (!processedIds.add(crawlingSpecification.id)) {
+
+            // This is a duplicate id, so we ignore it. Otherwise we'd end crawling the same site twice,
+            // and if we're really unlucky, we might end up writing to the same output file from multiple
+            // threads with complete bit salad as a result.
+
+            logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
+            return;
+        }
+
+        if (!abortMonitor.isAlive()) {
+            return;
+        }
+
         try {
             taskSem.acquire();
         } catch (InterruptedException e) {
@@ -107,7 +127,6 @@ public class CrawlerMain implements AutoCloseable {
                 }
             });
         }
-    }
 
     private void fetchDomain(CrawlingSpecification specification) {
         if (workLog.isJobFinished(specification.id))
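For reference, here is a minimal, self-contained sketch of the scheduling pattern the diff introduces: a HashSet records which ids have already been scheduled (Set.add returning false flags a duplicate), and a Semaphore caps how many crawl tasks run at once. The names DuplicateSafeScheduler, CrawlSpec, startTask and crawl are illustrative only and do not appear in the project; the pool size and timeout are placeholders.

import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

public class DuplicateSafeScheduler {
    record CrawlSpec(String id) {}

    private final Set<String> processedIds = new HashSet<>();
    private final Semaphore taskSem = new Semaphore(4);          // at most 4 tasks in flight
    private final ExecutorService pool = Executors.newCachedThreadPool();

    void startTask(CrawlSpec spec) throws InterruptedException {
        // Set.add() returns false if the id was already present, i.e. a duplicate spec.
        if (!processedIds.add(spec.id())) {
            System.err.println("Ignoring duplicate id: " + spec.id());
            return;
        }

        taskSem.acquire();                                       // block until a slot is free
        pool.execute(() -> {
            try {
                crawl(spec);
            } finally {
                taskSem.release();                               // always free the slot again
            }
        });
    }

    private void crawl(CrawlSpec spec) {
        System.out.println("Crawling " + spec.id());
    }

    public static void main(String[] args) throws InterruptedException {
        var scheduler = new DuplicateSafeScheduler();
        scheduler.startTask(new CrawlSpec("example.com"));
        scheduler.startTask(new CrawlSpec("example.com"));       // rejected as a duplicate
        scheduler.pool.shutdown();
        scheduler.pool.awaitTermination(10, TimeUnit.SECONDS);
    }
}

Note the ordering: the duplicate check happens before the semaphore is acquired, so a rejected spec never consumes a pool slot, and the release sits in a finally block so a failed crawl cannot leak one.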