(model) Fix resource leak in partially read crawl data streams.

Ensure proper resource management by closing the underlying stream in the `close` method, preventing resource leaks when a crawl data stream is only partially read.
This commit is contained in:
Viktor Lofgren 2024-11-20 19:29:13 +01:00
parent 6a3079a167
commit 665c8831a3

View File

@ -16,6 +16,7 @@ import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Stream;
public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
@ -26,9 +27,12 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
// NOTE(review): presumably records whether the domain record has already been produced — confirm against the usages outside this view.
private boolean wroteDomainRecord = false;
// Location of the file this stream reads; set once from the constructor argument.
private final Path path;
// Handle on the underlying record stream, kept so that close() can release it even when the iterator is not read to the end.
private final Stream<CrawledDocumentParquetRecord> streamForClosing;
/**
 * Opens the record stream for {@code file} and prepares to iterate over it.
 *
 * <p>The stream is opened exactly once and retained in {@code streamForClosing}
 * so that {@link #close()} can release it even if the iterator is never drained.
 *
 * @param file path of the file to read records from
 * @throws IOException if opening the record stream fails
 */
public ParquetSerializableCrawlDataStream(Path file) throws IOException {
    path = file;
    // Open the stream a single time: a second call would create another stream
    // and leave the first one with no reference through which it could be closed.
    streamForClosing = CrawledDocumentParquetRecordFileReader.stream(file);
    backingIterator = streamForClosing.iterator();
}
@Override
@ -153,7 +157,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
etag));
}
/**
 * Closes the underlying record stream held in {@code streamForClosing}.
 *
 * <p>This must be a single definition: a second, empty {@code close()} with the
 * same signature would not compile and would leave the stream open when the
 * data is only partially read.
 *
 * @throws IOException declared as part of this method's signature for callers
 */
public void close() throws IOException {
    streamForClosing.close();
}
@Override
public SerializableCrawlData next() throws IOException {