Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 21:18:58 +00:00
(crawler) Fix bug: poor handling of duplicate ids
* Also clean up the code a bit
This commit is contained in:
parent dbb758d1a8
commit f03146de4b
@@ -18,6 +18,8 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.nio.file.Path;
+import java.util.HashSet;
+import java.util.Set;
 import java.util.concurrent.*;
 
 public class CrawlerMain implements AutoCloseable {
@@ -38,6 +40,8 @@ public class CrawlerMain implements AutoCloseable {
     final int poolSize = Integer.getInteger("crawler.pool-size", 512);
     final int poolQueueSize = 32;
 
+    private final Set<String> processedIds = new HashSet<>();
+
     AbortMonitor abortMonitor = AbortMonitor.getInstance();
     Semaphore taskSem = new Semaphore(poolSize);
 
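The new processedIds field works because Set.add returns false when the element is already present, so a single HashSet serves as both the record of seen ids and the duplicate test. A minimal standalone sketch of that idiom follows; the DuplicateIdDemo class, the claim helper, and the sample ids are illustrative stand-ins, not part of the commit.

import java.util.HashSet;
import java.util.Set;

class DuplicateIdDemo {
    private static final Set<String> processedIds = new HashSet<>();

    // Returns true only the first time a given id is seen.
    static boolean claim(String id) {
        return processedIds.add(id);
    }

    public static void main(String[] args) {
        // Hypothetical crawl specification ids; the second "site-a" is rejected.
        for (String id : new String[] {"site-a", "site-b", "site-a"}) {
            if (claim(id)) {
                System.out.println("crawling " + id);
            } else {
                System.out.println("ignoring duplicate id: " + id);
            }
        }
    }
}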
@@ -87,26 +91,41 @@ public class CrawlerMain implements AutoCloseable {
 
         logger.info("Let's go");
 
+        // TODO: Make this into an iterable instead so we can abort it
         plan.forEachCrawlingSpecification(this::startCrawlTask);
     }
 
-    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
-        if (abortMonitor.isAlive()) {
-            try {
-                taskSem.acquire();
-            } catch (InterruptedException e) {
-                throw new RuntimeException(e);
-            }
-
-            pool.execute(() -> {
-                try {
-                    fetchDomain(crawlingSpecification);
-                }
-                finally {
-                    taskSem.release();
-                }
-            });
-        }
+    private void startCrawlTask(CrawlingSpecification crawlingSpecification) {
+
+        if (!processedIds.add(crawlingSpecification.id)) {
+            // This is a duplicate id, so we ignore it. Otherwise we'd end crawling the same site twice,
+            // and if we're really unlucky, we might end up writing to the same output file from multiple
+            // threads with complete bit salad as a result.
+
+            logger.error("Ignoring duplicate id: {}", crawlingSpecification.id);
+            return;
+        }
+
+        if (!abortMonitor.isAlive()) {
+            return;
+        }
+
+        try {
+            taskSem.acquire();
+        } catch (InterruptedException e) {
+            throw new RuntimeException(e);
+        }
+
+        pool.execute(() -> {
+            try {
+                fetchDomain(crawlingSpecification);
+            }
+            finally {
+                taskSem.release();
+            }
+        });
     }
 
     private void fetchDomain(CrawlingSpecification specification) {
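Besides the duplicate-id guard, the rewritten startCrawlTask flattens the old nested if into early returns while keeping the existing back-pressure scheme: a Semaphore permit is taken before the task is handed to the pool and released in a finally block inside the submitted lambda. A self-contained sketch of that bounded-submission pattern follows; BoundedSubmitDemo, submitBounded, and the pool size of 4 are assumed names and values for illustration, not taken from the repository.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

class BoundedSubmitDemo {
    // Hypothetical stand-ins for the crawler's pool and limit.
    private static final int poolSize = 4;
    private static final Semaphore taskSem = new Semaphore(poolSize);
    private static final ExecutorService pool = Executors.newFixedThreadPool(poolSize);

    // Blocks until a permit is free, then runs the task on the pool,
    // always returning the permit in a finally block, as in the diff above.
    static void submitBounded(Runnable task) {
        try {
            taskSem.acquire();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }

        pool.execute(() -> {
            try {
                task.run();
            } finally {
                taskSem.release();
            }
        });
    }

    public static void main(String[] args) {
        for (int i = 0; i < 10; i++) {
            int n = i;
            submitBounded(() -> System.out.println("task " + n + " on " + Thread.currentThread().getName()));
        }
        pool.shutdown();
    }
}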