(*) Clean up code related to crawl parquet inspection

2025-02-24 05:18:58 +00:00 · 2024-05-22 12:55:08 +02:00 · 2024-05-22 12:55:08 +02:00 · 59ec70eb73
commit 59ec70eb73
parent 365229991b
7 changed files with 74 additions and 36 deletions
--- a/code/common/service/java/nu/marginalia/service/client/GrpcSingleNodeChannelPool.java
+++ b/code/common/service/java/nu/marginalia/service/client/GrpcSingleNodeChannelPool.java
@ -53,7 +53,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
    @Override
    public synchronized void onChange() {
-        Set<InstanceAddress> newRoutes = serviceRegistryIf.getEndpoints(serviceKey);
+        Set<InstanceAddress> newRoutes = new HashSet<>(serviceRegistryIf.getEndpoints(serviceKey));
        Set<InstanceAddress> oldRoutes = new HashSet<>(channels.keySet());
        // Find the routes that have been added or removed
--- a/code/common/service/java/nu/marginalia/service/discovery/ServiceRegistryIf.java
+++ b/code/common/service/java/nu/marginalia/service/discovery/ServiceRegistryIf.java
@ -6,7 +6,7 @@ import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
 import nu.marginalia.service.discovery.property.ServiceKey;
-import java.util.Set;
+import java.util.List;
 import java.util.UUID;
 /** A service registry that allows services to register themselves and
@ -42,7 +42,7 @@ public interface ServiceRegistryIf {
    int requestPort(String externalHost, ServiceKey<?> key);
    /** Get all endpoints for the service on the specified node and schema. */
-    Set<InstanceAddress> getEndpoints(ServiceKey<?> schema);
+    List<InstanceAddress> getEndpoints(ServiceKey<?> schema);
    /** Register a monitor to be notified when the service registry changes.
     * <p></p>
--- a/code/common/service/java/nu/marginalia/service/discovery/ZkServiceRegistry.java
+++ b/code/common/service/java/nu/marginalia/service/discovery/ZkServiceRegistry.java
@ -177,9 +177,9 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
    }
    @Override
-    public Set<InstanceAddress> getEndpoints(ServiceKey<?> key) {
+    public List<InstanceAddress> getEndpoints(ServiceKey<?> key) {
        try {
-            Set<InstanceAddress> ret = new HashSet<>();
+            List<InstanceAddress> ret = new ArrayList<>();
            for (var uuid : curatorFramework
                    .getChildren()
                    .forPath(key.toPath())) {
@ -204,7 +204,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
            return ret;
        }
        catch (Exception ex) {
-            return Set.of();
+            return List.of();
        }
    }
--- a/code/common/service/test/nu/marginalia/service/server/GrpcServerTest.java
+++ b/code/common/service/test/nu/marginalia/service/server/GrpcServerTest.java
@ -63,7 +63,7 @@ public class GrpcServerTest {
        var mockRegistry = Mockito.mock(ServiceRegistryIf.class);
        when(mockRegistry.getEndpoints(any())).thenReturn(
-                Set.of(new ServiceEndpoint("127.0.0.1", port).asInstance(serverUUID)));
+                List.of(new ServiceEndpoint("127.0.0.1", port).asInstance(serverUUID)));
        var client = createClient(mockRegistry);
        client.onChange();
@ -83,7 +83,7 @@ public class GrpcServerTest {
        server1.start();
-        Set<ServiceEndpoint.InstanceAddress> endpoints = new HashSet<>();
+        List<ServiceEndpoint.InstanceAddress> endpoints = new ArrayList<>();
        endpoints.add(new ServiceEndpoint("127.0.0.1", port).asInstance(serverUUID1));
        var mockRegistry = Mockito.mock(ServiceRegistryIf.class);
--- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorClient.java
+++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorClient.java
@ -21,8 +21,6 @@ import nu.marginalia.storage.model.FileStorageId;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.io.OutputStream;
 import java.net.*;
 import java.nio.charset.StandardCharsets;
 import java.util.List;
@ -161,18 +159,24 @@ public class ExecutorClient {
        }
    }
    /** Get the URL to download a file from a (possibly remote) file storage.
     * The endpoint is compatible with range requests.
     * */
    public URL remoteFileURL(FileStorage fileStorage, String path) {
        String uriPath = STR."/transfer/file/\{fileStorage.id()}";
        String uriQuery = STR."path=\{URLEncoder.encode(path, StandardCharsets.UTF_8)}";
-        var service = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()))
+        var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
-                .stream().findFirst().orElseThrow();
+        if (endpoints.isEmpty()) {
            throw new RuntimeException("No endpoints for node " + fileStorage.node());
        }
        var service = endpoints.getFirst();
        try {
            return service.endpoint().toURL(uriPath, uriQuery);
        }
        catch (URISyntaxException|MalformedURLException ex) {
-            throw new RuntimeException(ex);
+            throw new RuntimeException("Failed to construct URL for path", ex);
        }
    }
--- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java
+++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlCrawlDataService.java
@ -22,12 +22,16 @@ import java.util.*;
 import java.util.stream.Stream;
 /** Service for inspecting crawl data within the control service.
- *
+ * <p></p>
 * Uses remote calls to the executor service to fetch information about the crawl data.
 * Both directly, when inspecting the crawler log, and indirectly via duckdb when
 * inspecting the parquet files.  The duckdb calls rely on range queries to fetch
 * only the relevant data from the files, so that the UI remains responsive even when
 * dealing with large (100MB+) files.
 * <p></p>
 * This service is built in a fairly "raw" manner, for the purpose of not adding architectural
 * overhead by modelling the data in a more structured way through an API; instead the data is
 * fetched and presented directly to the UI.
 */
@Singleton
 public class ControlCrawlDataService {
@ -63,14 +67,16 @@ public class ControlCrawlDataService {
        List<SummaryStatusCode> byStatusCode = new ArrayList<>();
        List<SummaryContentType> byContentType = new ArrayList<>();
        List<CrawlDataRecordSummary> records = new ArrayList<>();
        // Fetch the data from the parquet file using DuckDB
        String domain;
        try (var conn = DriverManager.getConnection("jdbc:duckdb:");
             var stmt = conn.createStatement()) {
            ResultSet rs;
            // Look up the domain name; the status-code summary query follows
            rs = stmt.executeQuery(DUCKDB."SELECT domain FROM \{url} LIMIT 1");
            domain = rs.next() ? rs.getString(1) : "NO DOMAIN";
@ -80,9 +86,15 @@ public class ControlCrawlDataService {
                                       ORDER BY httpStatus
                                       """);
            while (rs.next()) {
-                byStatusCode.add(new SummaryStatusCode(rs.getInt(1), rs.getInt(2), selectedHttpStatus.equals(rs.getString(1))));
+                final boolean isCurrentFilter = selectedHttpStatus.equals(rs.getString("httpStatus"));
                // The row is flagged when its status equals the selected HTTP status filter
                // (not the content type filter, which belongs to the content-type summary).
                final int status = rs.getInt("httpStatus");
                final int cnt = rs.getInt("cnt");
                byStatusCode.add(new SummaryStatusCode(status, cnt, isCurrentFilter));
            }
            // Summarize by content type
            rs = stmt.executeQuery(DUCKDB."""
                                        SELECT contentType, COUNT(*) as cnt
                                        FROM \{url}
@ -90,11 +102,16 @@ public class ControlCrawlDataService {
                                        ORDER BY contentType
                                        """);
            while (rs.next()) {
-                byContentType.add(new SummaryContentType(rs.getString(1), rs.getInt(2), selectedContentType.equals(rs.getString(1))));
+                final boolean isCurrentFilter = selectedContentType.equals(rs.getString("contentType"));
                final String contentType = rs.getString("contentType");
                final int cnt = rs.getInt("cnt");
                byContentType.add(new SummaryContentType(contentType, cnt, isCurrentFilter));
            }
            // Extract the document data
-            var query = DUCKDB."SELECT url, contentType, httpStatus, body != '', etagHeader, lastModifiedHeader FROM \{url} WHERE 1=1";
+            var query = DUCKDB."SELECT url, contentType, httpStatus, body != '' as bodied, etagHeader, lastModifiedHeader FROM \{url} WHERE 1=1";
            if (!urlGlob.isBlank())
                query += DUCKDB." AND url LIKE \{urlGlob.replace('*', '%')}";
            if (!selectedContentType.equals("ALL"))
@ -105,7 +122,14 @@ public class ControlCrawlDataService {
            rs = stmt.executeQuery(query);
            while (rs.next()) {
-                records.add(new CrawlDataRecordSummary(rs.getString(1), rs.getString(2), rs.getInt(3), rs.getBoolean(4), rs.getString(5), rs.getString(6)));
+
                records.add(new CrawlDataRecordSummary(
                        rs.getString("url"),
                        rs.getString("contentType"),
                        rs.getInt("httpStatus"),
                        rs.getBoolean("bodied"),
                        rs.getString("etagHeader"),
                        rs.getString("lastModifiedHeader")));
            }
        }
@ -113,19 +137,21 @@ public class ControlCrawlDataService {
        ret.put("tab", Map.of("storage", true));
        ret.put("view", Map.of("crawl", true));
        ret.put("contentType", selectedContentType);
        ret.put("httpStatus", selectedHttpStatus);
        ret.put("urlGlob", urlGlob);
        ret.put("pagination", new Pagination(after + 10, after - 10, records.size()));
        ret.put("node", nodeConfigurationService.get(nodeId));
        ret.put("storage", fileStorageService.getStorage(fsid));
        ret.put("path", path);
        ret.put("domain", domain);
        ret.put("contentType", selectedContentType);
        ret.put("httpStatus", selectedHttpStatus);
        ret.put("urlGlob", urlGlob);
        ret.put("byStatusCode", byStatusCode);
        ret.put("byContentType", byContentType);
        ret.put("records", records);
        ret.put("pagination", new Pagination(after + 10, after - 10, records.size()));
        return ret;
    }
@ -206,6 +232,7 @@ public class ControlCrawlDataService {
    // DuckDB template processor that deals with quoting and escaping values
    // in the SQL query; this offers a very basic protection against accidental SQL injection
    @SuppressWarnings("preview")
    static StringTemplate.Processor<String, IllegalArgumentException> DUCKDB = st -> {
        StringBuilder sb = new StringBuilder();
        Iterator<String> fragmentsIter = st.fragments().iterator();
--- a/code/services-core/executor-service/test/nu/marginalia/executor/ExecutorFileTransferServiceTest.java
+++ b/code/services-core/executor-service/test/nu/marginalia/executor/ExecutorFileTransferServiceTest.java
@ -7,6 +7,8 @@ import org.junit.jupiter.api.Test;
 import org.mockito.Mockito;
 import spark.Spark;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.DriverManager;
 import java.sql.SQLException;
@ -16,6 +18,11 @@ class ExecutorFileTransferServiceTest {
    @Test
    public void test() throws SQLException, InterruptedException {
        // Test requires this file to exist
        if (!Files.exists(Path.of("/tmp/crawl.parquet"))) {
            return;
        }
        var fileStorage = Mockito.mock(FileStorageService.class);
        when(fileStorage.getStorage(Mockito.any(FileStorageId.class))).thenReturn(new FileStorage(null,
@ -38,17 +45,17 @@ class ExecutorFileTransferServiceTest {
        try (var conn = DriverManager.getConnection("jdbc:duckdb:");
-         var stmt = conn.createStatement())
+             var stmt = conn.createStatement()) {
    {
            var rs = stmt.executeQuery("""
                SELECT COUNT(*) AS cnt, httpStatus 
-            FROM 'http://hostname:9998/transfer/file/0?path=crawl.parquet' 
+                FROM 'http://localhost:9998/transfer/file/0?path=crawl.parquet' 
                GROUP BY httpStatus
                """);
            while (rs.next()) {
                System.out.println(rs.getInt("CNT") + " " + rs.getInt("httpStatus"));
            }
        }
-        for(;;);
+
        Spark.stop();
    }
 }