mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(control) Add filter functionality for crawl data inspector
This commit is contained in:
parent
9539fdb53c
commit
197c82acd4
@ -55,6 +55,9 @@ public class ControlCrawlDataService {
|
||||
String path = request.queryParams("path");
|
||||
|
||||
int after = Integer.parseInt(request.queryParamOrDefault("page", "0"));
|
||||
String urlGlob = request.queryParamOrDefault("urlGlob", "");
|
||||
String selectedContentType = request.queryParamOrDefault("contentType", "ALL");
|
||||
String selectedHttpStatus = request.queryParamOrDefault("httpStatus", "ALL");
|
||||
|
||||
var url = executorClient.remoteFileURL(fileStorageService.getStorage(fsid), path).toString();
|
||||
|
||||
@ -65,8 +68,7 @@ public class ControlCrawlDataService {
|
||||
|
||||
String domain;
|
||||
try (var conn = DriverManager.getConnection("jdbc:duckdb:");
|
||||
var stmt = conn.createStatement())
|
||||
{
|
||||
var stmt = conn.createStatement()) {
|
||||
ResultSet rs;
|
||||
|
||||
rs = stmt.executeQuery(DUCKDB."SELECT domain FROM \{url} LIMIT 1");
|
||||
@ -78,7 +80,7 @@ public class ControlCrawlDataService {
|
||||
ORDER BY httpStatus
|
||||
""");
|
||||
while (rs.next()) {
|
||||
byStatusCode.add(new SummaryStatusCode(rs.getInt(1), rs.getInt(2)));
|
||||
byStatusCode.add(new SummaryStatusCode(rs.getInt(1), rs.getInt(2), selectedHttpStatus.equals(rs.getString(1))));
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery(DUCKDB."""
|
||||
@ -88,13 +90,20 @@ public class ControlCrawlDataService {
|
||||
ORDER BY contentType
|
||||
""");
|
||||
while (rs.next()) {
|
||||
byContentType.add(new SummaryContentType(rs.getString(1), rs.getInt(2)));
|
||||
byContentType.add(new SummaryContentType(rs.getString(1), rs.getInt(2), selectedContentType.equals(rs.getString(1))));
|
||||
}
|
||||
|
||||
rs = stmt.executeQuery(DUCKDB."""
|
||||
SELECT url, contentType, httpStatus, body != '', etagHeader, lastModifiedHeader
|
||||
FROM \{url} LIMIT 10 OFFSET \{after}
|
||||
""");
|
||||
|
||||
var query = DUCKDB."SELECT url, contentType, httpStatus, body != '', etagHeader, lastModifiedHeader FROM \{url} WHERE 1=1";
|
||||
if (!urlGlob.isBlank())
|
||||
query += DUCKDB." AND url LIKE \{urlGlob.replace('*', '%')}";
|
||||
if (!selectedContentType.equals("ALL"))
|
||||
query += DUCKDB." AND contentType = \{selectedContentType}";
|
||||
if (!selectedHttpStatus.equals("ALL"))
|
||||
query += DUCKDB." AND httpStatus = \{selectedHttpStatus}";
|
||||
query += DUCKDB." LIMIT 10 OFFSET \{after}";
|
||||
|
||||
rs = stmt.executeQuery(query);
|
||||
while (rs.next()) {
|
||||
records.add(new CrawlDataRecordSummary(rs.getString(1), rs.getString(2), rs.getInt(3), rs.getBoolean(4), rs.getString(5), rs.getString(6)));
|
||||
}
|
||||
@ -104,6 +113,9 @@ public class ControlCrawlDataService {
|
||||
|
||||
ret.put("tab", Map.of("storage", true));
|
||||
ret.put("view", Map.of("crawl", true));
|
||||
ret.put("selectedContentType", Map.of(selectedContentType, true));
|
||||
ret.put("selectedHttpStatus", Map.of(selectedHttpStatus, true));
|
||||
ret.put("urlGlob", urlGlob);
|
||||
|
||||
ret.put("pagination", new Pagination(after + 10, after - 10));
|
||||
|
||||
@ -152,9 +164,9 @@ public class ControlCrawlDataService {
|
||||
afterDomain);
|
||||
}
|
||||
|
||||
/**
 * One row of the per-content-type summary.
 *
 * @param contentType content type value, read from the first column of the summary query
 * @param count       count read from the second column of the summary query
 *                    (presumably the number of records with this content type; confirm against the query)
 */
public record SummaryContentType(
        String contentType,
        int count
) {
}
|
||||
/**
 * One row of the per-content-type summary, also used to populate the content type filter.
 *
 * @param contentType content type value, read from the first column of the summary query
 * @param count       count read from the second column of the summary query
 *                    (presumably the number of records with this content type; confirm against the query)
 * @param filtered    {@code true} when this content type equals the currently selected
 *                    {@code contentType} filter value
 */
public record SummaryContentType(
        String contentType,
        int count,
        boolean filtered
) {
}
|
||||
|
||||
/**
 * One row of the per-HTTP-status summary.
 *
 * @param statusCode HTTP status value, read from the first column of the summary query
 * @param count      count read from the second column of the summary query
 *                   (presumably the number of records with this status; confirm against the query)
 */
public record SummaryStatusCode(
        int statusCode,
        int count
) {
}
|
||||
/**
 * One row of the per-HTTP-status summary, also used to populate the HTTP status filter.
 *
 * @param statusCode HTTP status value, read from the first column of the summary query
 * @param count      count read from the second column of the summary query
 *                   (presumably the number of records with this status; confirm against the query)
 * @param filtered   {@code true} when this status equals the currently selected
 *                   {@code httpStatus} filter value
 */
public record SummaryStatusCode(
        int statusCode,
        int count,
        boolean filtered
) {
}
|
||||
public record Pagination(int next, int prev) {
|
||||
public boolean isPrevPage() {
|
||||
return prev >= 0;
|
||||
|
@ -30,26 +30,6 @@
|
||||
<td>
|
||||
<a class="btn btn-primary" href="/nodes/{{node.id}}/storage/{{storage.id}}/transfer?path={{{path}}}">Download Parquet</a>
|
||||
</td>
|
||||
<tr>
|
||||
<th>HTTP Status</th>
|
||||
<th>Count</th>
|
||||
</tr>
|
||||
{{#each byStatusCode}}
|
||||
<tr>
|
||||
<td>{{statusCode}}</td>
|
||||
<td>{{count}}</td>
|
||||
</tr>
|
||||
{{/each}}
|
||||
<tr>
|
||||
<th>Content Type</th>
|
||||
<th>Count</th>
|
||||
</tr>
|
||||
{{#each byContentType}}
|
||||
<tr>
|
||||
<td>{{contentType}}</td>
|
||||
<td>{{count}}</td>
|
||||
</tr>
|
||||
{{/each}}
|
||||
</table>
|
||||
|
||||
<h2>Contents</h2>
|
||||
@ -62,6 +42,33 @@
|
||||
<th>ETag</th>
|
||||
<th>Last Modified</th>
|
||||
</tr>
|
||||
|
||||
<form>
|
||||
<input type="hidden" name="fid" value="{{storage.id}}">
|
||||
<input type="hidden" name="path" value="{{path}}">
|
||||
<tr>
|
||||
<td>
|
||||
<input type="text" class="" id="urlGlob" name="urlGlob" value="{{urlGlob}}">
|
||||
</td>
|
||||
<td>
|
||||
<select name="contentType" id="contentType">
|
||||
<option value="ALL">All</option>
|
||||
{{#each byContentType}} <option value="{{contentType}}" {{#if filtered}}selected{{/if}}>{{contentType}}</option> {{/each}}
|
||||
</select>
|
||||
</td>
|
||||
<td>
|
||||
<select name="httpStatus" id="httpStatus">
|
||||
<option value="ALL">All</option>
|
||||
{{#each byStatusCode}} <option value="{{statusCode}}" {{#if filtered}}selected{{/if}}>{{statusCode}}</option> {{/each}}
|
||||
</select>
|
||||
</td>
|
||||
<td colspan="2"></td>
|
||||
<td>
|
||||
<button type="submit" class="btn btn-primary">Filter</button>
|
||||
</td>
|
||||
</tr>
|
||||
</form>
|
||||
|
||||
{{#each records}}
|
||||
<tr>
|
||||
<td>
|
||||
@ -89,6 +96,7 @@
|
||||
</tr>
|
||||
{{/each}}
|
||||
|
||||
|
||||
<tr>
|
||||
{{#with pagination}}
|
||||
<td>
|
||||
|
Loading…
Reference in New Issue
Block a user