mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(converter) Fix exception handling while reading crawl data
This commit is contained in:
parent
696fd8909d
commit
ac67b6b5da
@ -1,22 +1,32 @@
|
|||||||
package nu.marginalia.crawling.io;
|
package nu.marginalia.crawling.io;
|
||||||
|
|
||||||
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
|
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.FileNotFoundException;
|
||||||
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
|
||||||
public class CrawledDomainReader {
|
public class CrawledDomainReader {
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
|
||||||
|
|
||||||
/** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
|
/** An iterator-like access to domain data. This must be closed, otherwise it will leak off-heap memory! */
|
||||||
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
|
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
|
||||||
{
|
{
|
||||||
|
|
||||||
String fileName = fullPath.getFileName().toString();
|
String fileName = fullPath.getFileName().toString();
|
||||||
if (fileName.endsWith(".parquet")) {
|
if (fileName.endsWith(".parquet")) {
|
||||||
|
try {
|
||||||
return new ParquetSerializableCrawlDataStream(fullPath);
|
return new ParquetSerializableCrawlDataStream(fullPath);
|
||||||
|
} catch (Exception ex) {
|
||||||
|
logger.error("Error reading domain data from " + fullPath, ex);
|
||||||
|
return SerializableCrawlDataStream.empty();
|
||||||
}
|
}
|
||||||
else {
|
} else {
|
||||||
throw new IllegalArgumentException("Unknown file type: " + fullPath);
|
logger.error("Unknown file type: {}", fullPath);
|
||||||
|
return SerializableCrawlDataStream.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,6 +6,9 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Injector;
|
import com.google.inject.Injector;
|
||||||
import nu.marginalia.ProcessConfiguration;
|
import nu.marginalia.ProcessConfiguration;
|
||||||
import nu.marginalia.ProcessConfigurationModule;
|
import nu.marginalia.ProcessConfigurationModule;
|
||||||
|
import nu.marginalia.converting.model.CrawlPlan;
|
||||||
|
import nu.marginalia.converting.model.WorkDir;
|
||||||
|
import nu.marginalia.converting.processor.DomainProcessor;
|
||||||
import nu.marginalia.converting.sideload.SideloadSource;
|
import nu.marginalia.converting.sideload.SideloadSource;
|
||||||
import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
import nu.marginalia.converting.sideload.SideloadSourceFactory;
|
||||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||||
@ -13,28 +16,24 @@ import nu.marginalia.converting.writer.ConverterBatchWriter;
|
|||||||
import nu.marginalia.converting.writer.ConverterWriter;
|
import nu.marginalia.converting.writer.ConverterWriter;
|
||||||
import nu.marginalia.crawling.io.CrawledDomainReader;
|
import nu.marginalia.crawling.io.CrawledDomainReader;
|
||||||
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
import nu.marginalia.crawling.io.SerializableCrawlDataStream;
|
||||||
import nu.marginalia.process.log.WorkLog;
|
|
||||||
import nu.marginalia.process.log.WorkLogEntry;
|
|
||||||
import nu.marginalia.service.ProcessMainClass;
|
|
||||||
import nu.marginalia.storage.FileStorageService;
|
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
import nu.marginalia.mq.MqMessage;
|
import nu.marginalia.mq.MqMessage;
|
||||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||||
import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
import nu.marginalia.process.control.ProcessHeartbeatImpl;
|
||||||
|
import nu.marginalia.process.log.WorkLog;
|
||||||
|
import nu.marginalia.process.log.WorkLogEntry;
|
||||||
|
import nu.marginalia.service.ProcessMainClass;
|
||||||
import nu.marginalia.service.module.DatabaseModule;
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||||
import nu.marginalia.worklog.BatchingWorkLog;
|
import nu.marginalia.worklog.BatchingWorkLog;
|
||||||
import nu.marginalia.worklog.BatchingWorkLogImpl;
|
import nu.marginalia.worklog.BatchingWorkLogImpl;
|
||||||
import org.apache.logging.log4j.util.Strings;
|
import org.apache.logging.log4j.util.Strings;
|
||||||
import nu.marginalia.converting.model.CrawlPlan;
|
|
||||||
import nu.marginalia.converting.processor.DomainProcessor;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import nu.marginalia.converting.model.WorkDir;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
try {
|
try {
|
||||||
return Optional.of(CrawledDomainReader.createDataStream(path));
|
return Optional.of(CrawledDomainReader.createDataStream(path));
|
||||||
}
|
}
|
||||||
catch (IOException ex) {
|
catch (Exception ex) {
|
||||||
return Optional.empty();
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user