From 1b8b97b8ecf7dc0a87847a7b8dddcdb485c6a063 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Jan 2024 11:51:53 +0100 Subject: [PATCH] (sample-exporter) Add some limits on sizes and lengths Tar files will reject entries with filenames over 100 bytes, so we need a limit there. Also added a maximum size limit to keep the file sizes reasonable. --- .../java/nu/marginalia/extractor/SampleDataExporter.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java index 427e35e0..dc6fc90b 100644 --- a/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java +++ b/code/features-convert/data-extractors/src/main/java/nu/marginalia/extractor/SampleDataExporter.java @@ -37,7 +37,14 @@ public class SampleDataExporter { List entriesAll = new ArrayList<>(100_000); for (var item : WorkLog.iterable(crawlerLogFile)) { - if (item.cnt() < 2) continue; + if (item.cnt() < 2) // this one's too small + continue; + if (item.cnt() > 5000) // this one's too big + continue; + if (item.relPath().length() > 90) // this one's too long + continue; // TAR file name limit is 100, but we add some extra for good measure + + // this one's just right entriesAll.add(item); }