(converter) Fix exception handling while reading crawl data

This commit is contained in:
Viktor Lofgren 2024-08-02 10:39:49 +02:00
parent 696fd8909d
commit ac67b6b5da
2 changed files with 23 additions and 14 deletions

View File

@ -1,22 +1,32 @@
package nu.marginalia.crawling.io; package nu.marginalia.crawling.io;
import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*; import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
public class CrawledDomainReader { public class CrawledDomainReader {
private static final Logger logger = LoggerFactory.getLogger(CrawledDomainReader.class);
/** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */ /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! */
public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException
{ {
String fileName = fullPath.getFileName().toString(); String fileName = fullPath.getFileName().toString();
if (fileName.endsWith(".parquet")) { if (fileName.endsWith(".parquet")) {
try {
return new ParquetSerializableCrawlDataStream(fullPath); return new ParquetSerializableCrawlDataStream(fullPath);
} catch (Exception ex) {
logger.error("Error reading domain data from " + fullPath, ex);
return SerializableCrawlDataStream.empty();
} }
else { } else {
throw new IllegalArgumentException("Unknown file type: " + fullPath); logger.error("Unknown file type: {}", fullPath);
return SerializableCrawlDataStream.empty();
} }
} }

View File

@ -6,6 +6,9 @@ import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.converting.model.CrawlPlan;
import nu.marginalia.converting.model.WorkDir;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSource;
import nu.marginalia.converting.sideload.SideloadSourceFactory; import nu.marginalia.converting.sideload.SideloadSourceFactory;
import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWritableIf;
@ -13,28 +16,24 @@ import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.converting.writer.ConverterWriter; import nu.marginalia.converting.writer.ConverterWriter;
import nu.marginalia.crawling.io.CrawledDomainReader; import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.io.SerializableCrawlDataStream; import nu.marginalia.crawling.io.SerializableCrawlDataStream;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqInboxResponse;
import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.mq.inbox.MqSingleShotInbox;
import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeatImpl; import nu.marginalia.process.control.ProcessHeartbeatImpl;
import nu.marginalia.process.log.WorkLog;
import nu.marginalia.process.log.WorkLogEntry;
import nu.marginalia.service.ProcessMainClass;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.util.SimpleBlockingThreadPool; import nu.marginalia.util.SimpleBlockingThreadPool;
import nu.marginalia.worklog.BatchingWorkLog; import nu.marginalia.worklog.BatchingWorkLog;
import nu.marginalia.worklog.BatchingWorkLogImpl; import nu.marginalia.worklog.BatchingWorkLogImpl;
import org.apache.logging.log4j.util.Strings; import org.apache.logging.log4j.util.Strings;
import nu.marginalia.converting.model.CrawlPlan;
import nu.marginalia.converting.processor.DomainProcessor;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import nu.marginalia.converting.model.WorkDir;
import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
@ -201,7 +200,7 @@ public class ConverterMain extends ProcessMainClass {
try { try {
return Optional.of(CrawledDomainReader.createDataStream(path)); return Optional.of(CrawledDomainReader.createDataStream(path));
} }
catch (IOException ex) { catch (Exception ex) {
return Optional.empty(); return Optional.empty();
} }
} }