From 6a3079a167e2692239bb61d79b4f28c908c198bd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Nov 2024 21:05:22 +0100 Subject: [PATCH 1/2] (search) Fix missing getter for proto --- code/common/model/java/nu/marginalia/model/EdgeUrl.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code/common/model/java/nu/marginalia/model/EdgeUrl.java b/code/common/model/java/nu/marginalia/model/EdgeUrl.java index 04638c24..65f75d05 100644 --- a/code/common/model/java/nu/marginalia/model/EdgeUrl.java +++ b/code/common/model/java/nu/marginalia/model/EdgeUrl.java @@ -242,4 +242,8 @@ public class EdgeUrl implements Serializable { return this.domain; } + public String getProto() { + return this.proto; + } + } From 665c8831a30b52a396497587d17f41a7757a34fa Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 20 Nov 2024 19:29:13 +0100 Subject: [PATCH 2/2] (model) Fix resource leak in partially read crawl data streams. Ensure proper resource management by closing the underlying stream in the `close` method, preventing resource leaks when a crawl data stream is only partially read. 
--- .../format/ParquetSerializableCrawlDataStream.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java index 11c08267..703a70af 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java @@ -16,6 +16,7 @@ import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.stream.Stream; public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); @@ -26,9 +27,12 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial private boolean wroteDomainRecord = false; private final Path path; + // Reference to the underlying stream that needs to be closed when this object is closed + private final Stream streamForClosing; + public ParquetSerializableCrawlDataStream(Path file) throws IOException { path = file; - backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); + backingIterator = (streamForClosing = CrawledDocumentParquetRecordFileReader.stream(file)).iterator(); } @Override @@ -153,7 +157,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial etag)); } - public void close() throws IOException {} + public void close() throws IOException { + streamForClosing.close(); + } @Override public SerializableCrawlData next() throws IOException {