mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(*) Clean up code related to crawl parquet inspection
This commit is contained in:
parent
365229991b
commit
59ec70eb73
@ -53,7 +53,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
|
||||
@Override
|
||||
public synchronized void onChange() {
|
||||
Set<InstanceAddress> newRoutes = serviceRegistryIf.getEndpoints(serviceKey);
|
||||
Set<InstanceAddress> newRoutes = new HashSet<>(serviceRegistryIf.getEndpoints(serviceKey));
|
||||
Set<InstanceAddress> oldRoutes = new HashSet<>(channels.keySet());
|
||||
|
||||
// Find the routes that have been added or removed
|
||||
|
@ -6,7 +6,7 @@ import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
||||
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
|
||||
import java.util.Set;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
/** A service registry that allows services to register themselves and
|
||||
@ -42,7 +42,7 @@ public interface ServiceRegistryIf {
|
||||
int requestPort(String externalHost, ServiceKey<?> key);
|
||||
|
||||
/** Get all endpoints for the service on the specified node and schema. */
|
||||
Set<InstanceAddress> getEndpoints(ServiceKey<?> schema);
|
||||
List<InstanceAddress> getEndpoints(ServiceKey<?> schema);
|
||||
|
||||
/** Register a monitor to be notified when the service registry changes.
|
||||
* <p></p>
|
||||
|
@ -177,9 +177,9 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<InstanceAddress> getEndpoints(ServiceKey<?> key) {
|
||||
public List<InstanceAddress> getEndpoints(ServiceKey<?> key) {
|
||||
try {
|
||||
Set<InstanceAddress> ret = new HashSet<>();
|
||||
List<InstanceAddress> ret = new ArrayList<>();
|
||||
for (var uuid : curatorFramework
|
||||
.getChildren()
|
||||
.forPath(key.toPath())) {
|
||||
@ -204,7 +204,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
return ret;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
return Set.of();
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ public class GrpcServerTest {
|
||||
|
||||
var mockRegistry = Mockito.mock(ServiceRegistryIf.class);
|
||||
when(mockRegistry.getEndpoints(any())).thenReturn(
|
||||
Set.of(new ServiceEndpoint("127.0.0.1", port).asInstance(serverUUID)));
|
||||
List.of(new ServiceEndpoint("127.0.0.1", port).asInstance(serverUUID)));
|
||||
|
||||
var client = createClient(mockRegistry);
|
||||
client.onChange();
|
||||
@ -83,7 +83,7 @@ public class GrpcServerTest {
|
||||
|
||||
server1.start();
|
||||
|
||||
Set<ServiceEndpoint.InstanceAddress> endpoints = new HashSet<>();
|
||||
List<ServiceEndpoint.InstanceAddress> endpoints = new ArrayList<>();
|
||||
endpoints.add(new ServiceEndpoint("127.0.0.1", port).asInstance(serverUUID1));
|
||||
|
||||
var mockRegistry = Mockito.mock(ServiceRegistryIf.class);
|
||||
|
@ -21,8 +21,6 @@ import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.net.*;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.List;
|
||||
@ -161,18 +159,24 @@ public class ExecutorClient {
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the URL to download a file from a (possibly remote) file storage.
|
||||
* The endpoint is compatible with range requests.
|
||||
* */
|
||||
public URL remoteFileURL(FileStorage fileStorage, String path) {
|
||||
String uriPath = STR."/transfer/file/\{fileStorage.id()}";
|
||||
String uriQuery = STR."path=\{URLEncoder.encode(path, StandardCharsets.UTF_8)}";
|
||||
|
||||
var service = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()))
|
||||
.stream().findFirst().orElseThrow();
|
||||
var endpoints = registry.getEndpoints(ServiceKey.forRest(ServiceId.Executor, fileStorage.node()));
|
||||
if (endpoints.isEmpty()) {
|
||||
throw new RuntimeException("No endpoints for node " + fileStorage.node());
|
||||
}
|
||||
var service = endpoints.getFirst();
|
||||
|
||||
try {
|
||||
return service.endpoint().toURL(uriPath, uriQuery);
|
||||
}
|
||||
catch (URISyntaxException|MalformedURLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
throw new RuntimeException("Failed to construct URL for path", ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -22,12 +22,16 @@ import java.util.*;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
/** Service for inspecting crawl data within the control service.
|
||||
*
|
||||
* <p></p>
|
||||
* Uses remote calls to the executor service to fetch information about the crawl data.
|
||||
* Both directly, when inspecting the crawler log, and indirectly via duckdb when
|
||||
* inspecting the parquet files. The duckdb calls rely on range queries to fetch
|
||||
* only the relevant data from the files, so that the UI remains responsive even when
|
||||
* dealing with large (100MB+ files).
|
||||
* <p></p>
|
||||
* This service is built in a fairly "raw" manner, for the purpose of not adding architectural
|
||||
* overhead by modelling the data in a more structured way through an API; instead the data is
|
||||
* fetched and presented directly to the UI.
|
||||
*/
|
||||
@Singleton
|
||||
public class ControlCrawlDataService {
|
||||
@ -63,14 +67,16 @@ public class ControlCrawlDataService {
|
||||
|
||||
List<SummaryStatusCode> byStatusCode = new ArrayList<>();
|
||||
List<SummaryContentType> byContentType = new ArrayList<>();
|
||||
|
||||
List<CrawlDataRecordSummary> records = new ArrayList<>();
|
||||
|
||||
// Fetch the data from the parquet file using DuckDB
|
||||
String domain;
|
||||
try (var conn = DriverManager.getConnection("jdbc:duckdb:");
|
||||
var stmt = conn.createStatement()) {
|
||||
ResultSet rs;
|
||||
|
||||
// Summarize by status code
|
||||
|
||||
rs = stmt.executeQuery(DUCKDB."SELECT domain FROM \{url} LIMIT 1");
|
||||
domain = rs.next() ? rs.getString(1) : "NO DOMAIN";
|
||||
|
||||
@ -80,9 +86,15 @@ public class ControlCrawlDataService {
|
||||
ORDER BY httpStatus
|
||||
""");
|
||||
while (rs.next()) {
|
||||
byStatusCode.add(new SummaryStatusCode(rs.getInt(1), rs.getInt(2), selectedHttpStatus.equals(rs.getString(1))));
|
||||
final boolean isCurrentFilter = selectedContentType.equals(rs.getString("httpStatus"));
|
||||
final int status = rs.getInt("httpStatus");
|
||||
final int cnt = rs.getInt("cnt");
|
||||
|
||||
byStatusCode.add(new SummaryStatusCode(status, cnt, isCurrentFilter));
|
||||
}
|
||||
|
||||
// Summarize by content type
|
||||
|
||||
rs = stmt.executeQuery(DUCKDB."""
|
||||
SELECT contentType, COUNT(*) as cnt
|
||||
FROM \{url}
|
||||
@ -90,11 +102,16 @@ public class ControlCrawlDataService {
|
||||
ORDER BY contentType
|
||||
""");
|
||||
while (rs.next()) {
|
||||
byContentType.add(new SummaryContentType(rs.getString(1), rs.getInt(2), selectedContentType.equals(rs.getString(1))));
|
||||
final boolean isCurrentFilter = selectedContentType.equals(rs.getString("contentType"));
|
||||
final String contentType = rs.getString("contentType");
|
||||
final int cnt = rs.getInt("cnt");
|
||||
|
||||
byContentType.add(new SummaryContentType(contentType, cnt, isCurrentFilter));
|
||||
}
|
||||
|
||||
// Extract the document data
|
||||
|
||||
var query = DUCKDB."SELECT url, contentType, httpStatus, body != '', etagHeader, lastModifiedHeader FROM \{url} WHERE 1=1";
|
||||
var query = DUCKDB."SELECT url, contentType, httpStatus, body != '' as bodied, etagHeader, lastModifiedHeader FROM \{url} WHERE 1=1";
|
||||
if (!urlGlob.isBlank())
|
||||
query += DUCKDB." AND url LIKE \{urlGlob.replace('*', '%')}";
|
||||
if (!selectedContentType.equals("ALL"))
|
||||
@ -105,7 +122,14 @@ public class ControlCrawlDataService {
|
||||
|
||||
rs = stmt.executeQuery(query);
|
||||
while (rs.next()) {
|
||||
records.add(new CrawlDataRecordSummary(rs.getString(1), rs.getString(2), rs.getInt(3), rs.getBoolean(4), rs.getString(5), rs.getString(6)));
|
||||
|
||||
records.add(new CrawlDataRecordSummary(
|
||||
rs.getString("url"),
|
||||
rs.getString("contentType"),
|
||||
rs.getInt("httpStatus"),
|
||||
rs.getBoolean("bodied"),
|
||||
rs.getString("etagHeader"),
|
||||
rs.getString("lastModifiedHeader")));
|
||||
}
|
||||
}
|
||||
|
||||
@ -113,19 +137,21 @@ public class ControlCrawlDataService {
|
||||
|
||||
ret.put("tab", Map.of("storage", true));
|
||||
ret.put("view", Map.of("crawl", true));
|
||||
ret.put("contentType", selectedContentType);
|
||||
ret.put("httpStatus", selectedHttpStatus);
|
||||
ret.put("urlGlob", urlGlob);
|
||||
|
||||
ret.put("pagination", new Pagination(after + 10, after - 10, records.size()));
|
||||
|
||||
ret.put("node", nodeConfigurationService.get(nodeId));
|
||||
ret.put("storage", fileStorageService.getStorage(fsid));
|
||||
ret.put("path", path);
|
||||
ret.put("domain", domain);
|
||||
|
||||
ret.put("contentType", selectedContentType);
|
||||
ret.put("httpStatus", selectedHttpStatus);
|
||||
ret.put("urlGlob", urlGlob);
|
||||
|
||||
ret.put("byStatusCode", byStatusCode);
|
||||
ret.put("byContentType", byContentType);
|
||||
|
||||
ret.put("records", records);
|
||||
ret.put("pagination", new Pagination(after + 10, after - 10, records.size()));
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -206,6 +232,7 @@ public class ControlCrawlDataService {
|
||||
|
||||
// DuckDB template processor that deals with quoting and escaping values
|
||||
// in the SQL query; this offers a very basic protection against accidental SQL injection
|
||||
@SuppressWarnings("preview")
|
||||
static StringTemplate.Processor<String, IllegalArgumentException> DUCKDB = st -> {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
Iterator<String> fragmentsIter = st.fragments().iterator();
|
||||
|
@ -7,6 +7,8 @@ import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
import spark.Spark;
|
||||
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.SQLException;
|
||||
|
||||
@ -16,6 +18,11 @@ class ExecutorFileTransferServiceTest {
|
||||
|
||||
@Test
|
||||
public void test() throws SQLException, InterruptedException {
|
||||
// Test requires this file to exist
|
||||
if (!Files.exists(Path.of("/tmp/crawl.parquet"))) {
|
||||
return;
|
||||
}
|
||||
|
||||
var fileStorage = Mockito.mock(FileStorageService.class);
|
||||
|
||||
when(fileStorage.getStorage(Mockito.any(FileStorageId.class))).thenReturn(new FileStorage(null,
|
||||
@ -37,18 +44,18 @@ class ExecutorFileTransferServiceTest {
|
||||
Thread.sleep(1000);
|
||||
|
||||
|
||||
try (var conn = DriverManager.getConnection("jdbc:duckdb:");
|
||||
var stmt = conn.createStatement())
|
||||
{
|
||||
var rs = stmt.executeQuery("""
|
||||
SELECT COUNT(*) AS cnt, httpStatus
|
||||
FROM 'http://hostname:9998/transfer/file/0?path=crawl.parquet'
|
||||
GROUP BY httpStatus
|
||||
""");
|
||||
while (rs.next()) {
|
||||
System.out.println(rs.getInt("CNT") + " " + rs.getInt("httpStatus"));
|
||||
try (var conn = DriverManager.getConnection("jdbc:duckdb:");
|
||||
var stmt = conn.createStatement()) {
|
||||
var rs = stmt.executeQuery("""
|
||||
SELECT COUNT(*) AS cnt, httpStatus
|
||||
FROM 'http://localhost:9998/transfer/file/0?path=crawl.parquet'
|
||||
GROUP BY httpStatus
|
||||
""");
|
||||
while (rs.next()) {
|
||||
System.out.println(rs.getInt("CNT") + " " + rs.getInt("httpStatus"));
|
||||
}
|
||||
}
|
||||
}
|
||||
for(;;);
|
||||
|
||||
Spark.stop();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user