Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-23 13:09:00 +00:00)
(live-crawl-actor) Clear index journal before starting live crawl
This is to prevent data corruption. This shouldn't be necessary for the regular loader path, but the live crawler is a bit different and needs some paving of the road ahead of it.
commit 80e6d0069c (parent b941604135)
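In short, before dispatching a LiveCrawlRequest the actor now resolves the index construction area via IndexLocations, deletes its index-journal subdirectory if one exists, and only then kicks off the crawl. Below is a minimal standalone sketch of that clearing step; the clearIndexJournal helper, the IndexJournalCleanupSketch class, and the /tmp path are illustrative stand-ins, not code from the repository (in the actor the directory comes from IndexLocations.getIndexConstructionArea(fileStorageService), as the diff that follows shows).

import org.apache.commons.io.FileUtils;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class IndexJournalCleanupSketch {

    /** Deletes the index-journal directory under the given construction area, if present. */
    static void clearIndexJournal(Path constructionArea) throws IOException {
        Path indexJournalLocation = constructionArea.resolve("index-journal");

        // Only attempt deletion when the directory actually exists;
        // FileUtils.deleteDirectory removes the directory and its contents recursively.
        if (Files.isDirectory(indexJournalLocation)) {
            FileUtils.deleteDirectory(indexJournalLocation.toFile());
        }
    }

    public static void main(String[] args) throws IOException {
        // Hypothetical location; in the actor this comes from
        // IndexLocations.getIndexConstructionArea(fileStorageService).
        Path constructionArea = Path.of("/tmp/index-construction-area");
        clearIndexJournal(constructionArea);
    }
}

The Files.isDirectory guard mirrors the check in the diff, and commons-io (already used by the actor for FileUtils.deleteDirectory) handles the recursive removal.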
@@ -3,6 +3,7 @@ package nu.marginalia.actor.task;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
 import com.google.inject.Singleton;
+import nu.marginalia.IndexLocations;
 import nu.marginalia.actor.ExecutorActor;
 import nu.marginalia.actor.ExecutorActorStateMachines;
 import nu.marginalia.actor.prototype.RecordActorPrototype;
@@ -13,9 +14,13 @@ import nu.marginalia.mq.outbox.MqOutbox;
 import nu.marginalia.mqapi.crawling.LiveCrawlRequest;
 import nu.marginalia.process.ProcessOutboxes;
 import nu.marginalia.process.ProcessService;
+import nu.marginalia.storage.FileStorageService;
+import org.apache.commons.io.FileUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.time.Duration;
 import java.util.Objects;
 
@@ -29,6 +34,8 @@ public class LiveCrawlActor extends RecordActorPrototype {
     private final FeedsClient feedsClient;
     private final Logger logger = LoggerFactory.getLogger(getClass());
 
+    private final FileStorageService fileStorageService;
+
     public record Initial() implements ActorStep {}
     public record Monitor(String feedsHash) implements ActorStep {}
     public record LiveCrawl(String feedsHash, long msgId) implements ActorStep {
@@ -52,6 +59,12 @@ public class LiveCrawlActor extends RecordActorPrototype {
                 }
             }
             case LiveCrawl(String feedsHash, long msgId) when msgId < 0 -> {
+                // Clear the index journal before starting the crawl
+                Path indexJournalLocation = IndexLocations.getIndexConstructionArea(fileStorageService).resolve("index-journal");
+                if (Files.isDirectory(indexJournalLocation)) {
+                    FileUtils.deleteDirectory(indexJournalLocation.toFile());
+                }
+
                 long id = mqLiveCrawlerOutbox.sendAsync(new LiveCrawlRequest());
                 yield new LiveCrawl(feedsHash, id);
             }
@@ -81,13 +94,14 @@ public class LiveCrawlActor extends RecordActorPrototype {
                           ProcessOutboxes processOutboxes,
                           FeedsClient feedsClient,
                           Gson gson,
-                          ExecutorActorStateMachines executorActorStateMachines)
+                          ExecutorActorStateMachines executorActorStateMachines, FileStorageService fileStorageService)
     {
         super(gson);
         this.processWatcher = processWatcher;
         this.mqLiveCrawlerOutbox = processOutboxes.getLiveCrawlerOutbox();
         this.executorActorStateMachines = executorActorStateMachines;
         this.feedsClient = feedsClient;
+        this.fileStorageService = fileStorageService;
     }
 
 