mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(converter) Fix exception handling while reading crawl data
This commit is contained in:
parent
696fd8909d
commit
ac67b6b5da
@ -1,22 +1,32 @@
|
||||
package nu.marginalia.crawling.io;
|
||||
|
||||
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
public class CrawledDomainReader {
|
||||
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
|
||||
|
||||
/** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
|
||||
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
|
||||
{
|
||||
|
||||
String fileName = fullPath.getFileName().toString();
|
||||
if (fileName.endsWith(".parquet")) {
|
||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||
}
|
||||
else {
|
||||
throw new IllegalArgumentException("Unknown file type: " + fullPath);
|
||||
try {
|
||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||
} catch (Exception ex) {
|
||||
logger.error("Error reading domain data from " + fullPath, ex);
|
||||
return SerializableCrawlDataStream.empty();
|
||||
}
|
||||
} else {
|
||||
logger.error("Unknown file type: {}", fullPath);
|
||||
return SerializableCrawlDataStream.empty();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,6 +6,9 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.ProcessConfiguration;
|
||||
import nu.marginalia.ProcessConfigurationModule;
|
||||
import nu.marginalia.converting.model.CrawlPlan;
|
||||
import nu.marginalia.converting.model.WorkDir;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import nu.marginalia.converting.sideload.SideloadSource;
|
||||
import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||
@ -13,28 +16,24 @@ import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.converting.writer.ConverterWriter;
|
||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||
import nu.marginalia.process.log.WorkLog;
|
||||
import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.service.ProcessMainClass;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import nu.marginalia.worklog.BatchingWorkLog;
|
||||
import nu.marginalia.worklog.BatchingWorkLogImpl;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import nu.marginalia.converting.model.CrawlPlan;
|
||||
import nu.marginalia.converting.processor.DomainProcessor;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import nu.marginalia.converting.model.WorkDir;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
|
||||
try {
|
||||
return Optional.of(CrawledDomainReader.createDataStream(path));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
catch (Exception ex) {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user