(model) Fix resource leak in partially read crawl data streams.

This ensures proper resource management by closing the underlying stream in the `close` method, preventing potential resource leaks.
This commit is contained in:
Viktor Lofgren 2024-11-20 19:29:13 +01:00
parent 6a3079a167
commit 665c8831a3

View File

@ -16,6 +16,7 @@ import java.net.URISyntaxException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.*; import java.util.*;
import java.util.stream.Stream;
public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
@ -26,9 +27,12 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
private boolean wroteDomainRecord = false; private boolean wroteDomainRecord = false;
private final Path path; private final Path path;
// Reference to the underlying stream that needs to be closed when this object is closed
private final Stream<CrawledDocumentParquetRecord> streamForClosing;
public ParquetSerializableCrawlDataStream(Path file) throws IOException { public ParquetSerializableCrawlDataStream(Path file) throws IOException {
path = file; path = file;
backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); backingIterator = (streamForClosing = CrawledDocumentParquetRecordFileReader.stream(file)).iterator();
} }
@Override @Override
@ -153,7 +157,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
etag)); etag));
} }
public void close() throws IOException {} public void close() throws IOException {
streamForClosing.close();
}
@Override @Override
public SerializableCrawlData next() throws IOException { public SerializableCrawlData next() throws IOException {