From 197c82acd4d36d4ec13057f42e3f08d5af4d8ef5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 21 May 2024 19:05:44 +0200 Subject: [PATCH] (control) Add filter functionality for crawl data inspector --- .../node/svc/ControlCrawlDataService.java | 32 +++++++++---- .../node-storage-crawl-parquet-details.hdb | 48 +++++++++++-------- 2 files changed, 50 insertions(+), 30 deletions(-) diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java index 39a88199..28185463 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java @@ -55,6 +55,9 @@ public class ControlCrawlDataService { String path = request.queryParams("path"); int after = Integer.parseInt(request.queryParamOrDefault("page", "0")); + String urlGlob = request.queryParamOrDefault("urlGlob", ""); + String selectedContentType = request.queryParamOrDefault("contentType", "ALL"); + String selectedHttpStatus = request.queryParamOrDefault("httpStatus", "ALL"); var url = executorClient.remoteFileURL(fileStorageService.getStorage(fsid), path).toString(); @@ -65,8 +68,7 @@ public class ControlCrawlDataService { String domain; try (var conn = DriverManager.getConnection("jdbc:duckdb:"); - var stmt = conn.createStatement()) - { + var stmt = conn.createStatement()) { ResultSet rs; rs = stmt.executeQuery(DUCKDB."SELECT domain FROM \{url} LIMIT 1"); @@ -78,7 +80,7 @@ public class ControlCrawlDataService { ORDER BY httpStatus """); while (rs.next()) { - byStatusCode.add(new SummaryStatusCode(rs.getInt(1), rs.getInt(2))); + byStatusCode.add(new SummaryStatusCode(rs.getInt(1), rs.getInt(2), selectedHttpStatus.equals(rs.getString(1)))); } rs = stmt.executeQuery(DUCKDB.""" @@ -88,13 +90,20 @@ public class ControlCrawlDataService { ORDER BY contentType """); while (rs.next()) { - byContentType.add(new SummaryContentType(rs.getString(1), rs.getInt(2))); + byContentType.add(new SummaryContentType(rs.getString(1), rs.getInt(2), selectedContentType.equals(rs.getString(1)))); } - rs = stmt.executeQuery(DUCKDB.""" - SELECT url, contentType, httpStatus, body != '', etagHeader, lastModifiedHeader - FROM \{url} LIMIT 10 OFFSET \{after} - """); + + var query = DUCKDB."SELECT url, contentType, httpStatus, body != '', etagHeader, lastModifiedHeader FROM \{url} WHERE 1=1"; + if (!urlGlob.isBlank()) + query += DUCKDB." AND url LIKE \{urlGlob.replace('*', '%')}"; + if (!selectedContentType.equals("ALL")) + query += DUCKDB." AND contentType = \{selectedContentType}"; + if (!selectedHttpStatus.equals("ALL")) + query += DUCKDB." AND httpStatus = \{selectedHttpStatus}"; + query += DUCKDB." LIMIT 10 OFFSET \{after}"; + + rs = stmt.executeQuery(query); while (rs.next()) { records.add(new CrawlDataRecordSummary(rs.getString(1), rs.getString(2), rs.getInt(3), rs.getBoolean(4), rs.getString(5), rs.getString(6))); } @@ -104,6 +113,9 @@ public class ControlCrawlDataService { ret.put("tab", Map.of("storage", true)); ret.put("view", Map.of("crawl", true)); + ret.put("selectedContentType", Map.of(selectedContentType, true)); + ret.put("selectedHttpStatus", Map.of(selectedHttpStatus, true)); + ret.put("urlGlob", urlGlob); ret.put("pagination", new Pagination(after + 10, after - 10)); @@ -152,9 +164,9 @@ public class ControlCrawlDataService { afterDomain); } - public record SummaryContentType(String contentType, int count) {} + public record SummaryContentType(String contentType, int count, boolean filtered) {} - public record SummaryStatusCode(int statusCode, int count) {} + public record SummaryStatusCode(int statusCode, int count, boolean filtered) {} public record Pagination(int next, int prev) { public boolean isPrevPage() { return prev >= 0; diff --git a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb index 9593931e..4b30139f 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-storage-crawl-parquet-details.hdb @@ -30,26 +30,6 @@ Download Parquet - - HTTP Status - Count - - {{#each byStatusCode}} - - {{statusCode}} - {{count}} - - {{/each}} - - Content Type - Count - - {{#each byContentType}} - - {{contentType}} - {{count}} - - {{/each}}

Contents

@@ -62,6 +42,33 @@ ETag Last Modified + +
+ + + + + + + + + + + + + + + + + +
+ {{#each records}} @@ -89,6 +96,7 @@ {{/each}} + {{#with pagination}}