Merge branch 'master' into live-search

This commit is contained in:
Viktor Lofgren 2024-11-21 16:00:20 +01:00
commit 14519294d2
2 changed files with 12 additions and 2 deletions

View File

@ -242,4 +242,8 @@ public class EdgeUrl implements Serializable {
return this.domain; return this.domain;
} }
/**
 * Returns the value of this object's {@code proto} field.
 *
 * NOTE(review): presumably the protocol/scheme part of the URL (e.g. "https");
 * the field's declaration is not visible in this hunk, so confirm.
 *
 * @return the current value of {@code proto}
 */
public String getProto() {
return this.proto;
}
} }

View File

@ -16,6 +16,7 @@ import java.net.URISyntaxException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.*; import java.util.*;
import java.util.stream.Stream;
public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream {
private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class);
@ -26,9 +27,12 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
private boolean wroteDomainRecord = false; private boolean wroteDomainRecord = false;
private final Path path; private final Path path;
// Reference to the underlying stream that needs to be closed when this object is closed
private final Stream<CrawledDocumentParquetRecord> streamForClosing;
/**
 * Creates a crawl data stream over the given file.
 *
 * Stores {@code file} in {@code path} and takes {@code backingIterator} from the
 * stream returned by {@code CrawledDocumentParquetRecordFileReader.stream(file)}.
 * In the new side of this diff, that stream is additionally kept in
 * {@code streamForClosing} so it can be closed later; the old side kept only the iterator.
 *
 * @param file the file to read records from
 * @throws IOException declared by the constructor; NOTE(review): presumably raised
 *         when the file cannot be opened — confirm against the reader
 */
public ParquetSerializableCrawlDataStream(Path file) throws IOException { public ParquetSerializableCrawlDataStream(Path file) throws IOException {
path = file; path = file;
backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); backingIterator = (streamForClosing = CrawledDocumentParquetRecordFileReader.stream(file)).iterator();
} }
@Override @Override
@ -150,7 +154,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
etag)); etag));
} }
/**
 * Closes this crawl data stream.
 *
 * The old side of this diff had an empty body; the new side calls
 * {@code streamForClosing.close()}, closing the stream from which
 * {@code backingIterator} was obtained in the constructor.
 *
 * @throws IOException declared by the method signature
 */
public void close() throws IOException {} public void close() throws IOException {
streamForClosing.close();
}
@Override @Override
public SerializableCrawlData next() throws IOException { public SerializableCrawlData next() throws IOException {