From 23cce0c78aaedb7d984181bc53c99a6db4d728ac Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 27 Sep 2024 13:45:54 +0200 Subject: [PATCH] Add a new function 'Live Capture' for on-demand screenshot capture The screenshots are requested by the site-service, and triggered via the site-info view. --- .../java/nu/marginalia/model/EdgeDomain.java | 5 +- .../discovery/property/ServicePartition.java | 1 - .../browse/DbBrowseDomainsRandom.java | 7 +- .../browse/DbBrowseDomainsSimilarCosine.java | 7 +- .../domains/SimilarDomainsService.java | 8 +- code/functions/live-capture/api/build.gradle | 45 +++++ .../api/livecapture/LiveCaptureClient.java | 36 ++++ .../api/src/main/protobuf/live-capture.proto | 18 ++ code/functions/live-capture/build.gradle | 60 ++++++ .../livecapture/BrowserlessClient.java | 100 ++++++++++ .../livecapture/LiveCaptureGrpcService.java | 186 ++++++++++++++++++ .../livecapture/LivecaptureModule.java | 15 ++ .../livecapture/ScreenshotDbOperations.java | 85 ++++++++ .../livecapture/BrowserlessClientTest.java | 37 ++++ .../search-service/build.gradle | 1 + .../search/svc/SearchBrowseService.java | 2 +- .../search/svc/SearchSiteInfoService.java | 50 ++++- .../assistant-service/build.gradle | 2 + .../marginalia/assistant/AssistantMain.java | 10 +- .../assistant/AssistantService.java | 9 +- .../screenshot/ScreenshotCaptureToolMain.java | 2 +- settings.gradle | 3 + 22 files changed, 663 insertions(+), 26 deletions(-) create mode 100644 code/functions/live-capture/api/build.gradle create mode 100644 code/functions/live-capture/api/java/nu/marginalia/api/livecapture/LiveCaptureClient.java create mode 100644 code/functions/live-capture/api/src/main/protobuf/live-capture.proto create mode 100644 code/functions/live-capture/build.gradle create mode 100644 code/functions/live-capture/java/nu/marginalia/livecapture/BrowserlessClient.java create mode 100644 code/functions/live-capture/java/nu/marginalia/livecapture/LiveCaptureGrpcService.java create mode 100644 code/functions/live-capture/java/nu/marginalia/livecapture/LivecaptureModule.java create mode 100644 code/functions/live-capture/java/nu/marginalia/livecapture/ScreenshotDbOperations.java create mode 100644 code/functions/live-capture/test/nu/marginalia/livecapture/BrowserlessClientTest.java diff --git a/code/common/model/java/nu/marginalia/model/EdgeDomain.java b/code/common/model/java/nu/marginalia/model/EdgeDomain.java index 86c17824..44404ee0 100644 --- a/code/common/model/java/nu/marginalia/model/EdgeDomain.java +++ b/code/common/model/java/nu/marginalia/model/EdgeDomain.java @@ -92,10 +92,13 @@ public class EdgeDomain implements Serializable { - public EdgeUrl toRootUrl() { + public EdgeUrl toRootUrlHttp() { // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http return new EdgeUrl("http", this, null, "/", null); } + public EdgeUrl toRootUrlHttps() { + return new EdgeUrl("https", this, null, "/", null); + } public String toString() { return getAddress(); diff --git a/code/common/service/java/nu/marginalia/service/discovery/property/ServicePartition.java b/code/common/service/java/nu/marginalia/service/discovery/property/ServicePartition.java index 32aa37fb..eb4ce120 100644 --- a/code/common/service/java/nu/marginalia/service/discovery/property/ServicePartition.java +++ b/code/common/service/java/nu/marginalia/service/discovery/property/ServicePartition.java @@ -20,7 +20,6 @@ public sealed interface ServicePartition { public String identifier() { return Integer.toString(node); } - } record None() implements ServicePartition, PartitionTraits.NoGrpc { public String identifier() { return ""; } diff --git a/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsRandom.java b/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsRandom.java index 138230f9..1badbf4d 100644 --- a/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsRandom.java +++ b/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsRandom.java @@ -4,13 +4,14 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; -import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DomainBlacklist; +import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; -import java.util.*; +import java.util.ArrayList; +import java.util.List; @Singleton public class DbBrowseDomainsRandom { @@ -47,7 +48,7 @@ public class DbBrowseDomainsRandom { boolean indexed = rsp.getBoolean("INDEXED"); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, indexed)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrlHttp(), id, 0, indexed)); } } } diff --git a/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java b/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java index f75a87de..62988562 100644 --- a/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java +++ b/code/features-search/random-websites/java/nu/marginalia/browse/DbBrowseDomainsSimilarCosine.java @@ -4,13 +4,14 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.browse.model.BrowseResult; -import nu.marginalia.model.EdgeDomain; import nu.marginalia.db.DomainBlacklist; +import nu.marginalia.model.EdgeDomain; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.sql.SQLException; -import java.util.*; +import java.util.ArrayList; +import java.util.List; @Singleton public class DbBrowseDomainsSimilarCosine { @@ -53,7 +54,7 @@ public class DbBrowseDomainsSimilarCosine { boolean indexed = rsp.getBoolean("INDEXED"); if (!blacklist.isBlacklisted(id)) { - domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness, indexed)); + domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrlHttp(), id, relatedness, indexed)); } } } diff --git a/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java b/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java index 594a258c..4f3728cd 100644 --- a/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java +++ b/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java @@ -9,7 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap; -import nu.marginalia.api.domains.*; +import nu.marginalia.api.domains.RpcSimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; import nu.marginalia.model.EdgeDomain; @@ -21,8 +21,6 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; import java.util.stream.IntStream; public class SimilarDomainsService { @@ -227,7 +225,7 @@ public class SimilarDomainsService { domains.add(RpcSimilarDomain.newBuilder() .setDomainId(id) - .setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString()) + .setUrl(new EdgeDomain(domainNames[idx]).toRootUrlHttp().toString()) .setRelatedness(getRelatedness(domainId, id)) .setRank(domainRanks.get(idx)) .setIndexed(indexedDomains.contains(idx)) @@ -342,7 +340,7 @@ public class SimilarDomainsService { domains.add(RpcSimilarDomain.newBuilder() .setDomainId(id) - .setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString()) + .setUrl(new EdgeDomain(domainNames[idx]).toRootUrlHttp().toString()) .setRelatedness(getRelatedness(domainId, id)) .setRank(domainRanks.get(idx)) .setIndexed(indexedDomains.contains(idx)) diff --git a/code/functions/live-capture/api/build.gradle b/code/functions/live-capture/api/build.gradle new file mode 100644 index 00000000..d5a4e27d --- /dev/null +++ b/code/functions/live-capture/api/build.gradle @@ -0,0 +1,45 @@ +plugins { + id 'java' + + id "com.google.protobuf" version "0.9.4" + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) + } +} + +jar.archiveBaseName = 'live-capture-api' + +apply from: "$rootProject.projectDir/protobuf.gradle" +apply from: "$rootProject.projectDir/srcsets.gradle" + +dependencies { + implementation project(':code:common:model') + implementation project(':code:common:config') + implementation project(':code:common:service') + + implementation libs.bundles.slf4j + + implementation libs.prometheus + implementation libs.notnull + implementation libs.guava + implementation dependencies.create(libs.guice.get()) { + exclude group: 'com.google.guava' + } + implementation libs.gson + implementation libs.bundles.protobuf + implementation libs.guava + libs.bundles.grpc.get().each { + implementation dependencies.create(it) { + exclude group: 'com.google.guava' + } + } + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} diff --git a/code/functions/live-capture/api/java/nu/marginalia/api/livecapture/LiveCaptureClient.java b/code/functions/live-capture/api/java/nu/marginalia/api/livecapture/LiveCaptureClient.java new file mode 100644 index 00000000..e422eb12 --- /dev/null +++ b/code/functions/live-capture/api/java/nu/marginalia/api/livecapture/LiveCaptureClient.java @@ -0,0 +1,36 @@ +package nu.marginalia.api.livecapture; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.api.livecapture.LiveCaptureApiGrpc.LiveCaptureApiBlockingStub; +import nu.marginalia.service.client.GrpcChannelPoolFactory; +import nu.marginalia.service.client.GrpcSingleNodeChannelPool; +import nu.marginalia.service.discovery.property.ServiceKey; +import nu.marginalia.service.discovery.property.ServicePartition; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +@Singleton +public class LiveCaptureClient { + private static final Logger logger = LoggerFactory.getLogger(LiveCaptureClient.class); + + private final GrpcSingleNodeChannelPool channelPool; + + @Inject + public LiveCaptureClient(GrpcChannelPoolFactory factory) { + // The client is only interested in the primary node + var key = ServiceKey.forGrpcApi(LiveCaptureApiGrpc.class, ServicePartition.any()); + this.channelPool = factory.createSingle(key, LiveCaptureApiGrpc::newBlockingStub); + } + + + public void requestScreengrab(int domainId) { + try { + channelPool.call(LiveCaptureApiBlockingStub::requestScreengrab) + .run(RpcDomainId.newBuilder().setDomainId(domainId).build()); + } + catch (Exception e) { + logger.error("API Exception", e); + } + } +} diff --git a/code/functions/live-capture/api/src/main/protobuf/live-capture.proto b/code/functions/live-capture/api/src/main/protobuf/live-capture.proto new file mode 100644 index 00000000..752de691 --- /dev/null +++ b/code/functions/live-capture/api/src/main/protobuf/live-capture.proto @@ -0,0 +1,18 @@ +syntax="proto3"; +package nu.marginalia.api.livecapture; + +option java_package="nu.marginalia.api.livecapture"; +option java_multiple_files=true; + +service LiveCaptureApi { + rpc requestScreengrab(RpcDomainId) returns (Empty) {} +} + +message Void { +} + +message RpcDomainId { + int32 domainId = 1; +} + +message Empty {} \ No newline at end of file diff --git a/code/functions/live-capture/build.gradle b/code/functions/live-capture/build.gradle new file mode 100644 index 00000000..169c7e23 --- /dev/null +++ b/code/functions/live-capture/build.gradle @@ -0,0 +1,60 @@ +plugins { + id 'java' + + id 'application' + id 'jvm-test-suite' +} + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) + } +} + +apply from: "$rootProject.projectDir/srcsets.gradle" + +dependencies { + implementation project(':code:functions:live-capture:api') + + implementation project(':code:common:config') + implementation project(':code:common:service') + implementation project(':code:common:model') + implementation project(':code:common:db') + + implementation libs.bundles.slf4j + implementation libs.commons.lang3 + + implementation libs.prometheus + implementation libs.guava + libs.bundles.grpc.get().each { + implementation dependencies.create(it) { + exclude group: 'com.google.guava' + } + } + + + implementation libs.notnull + implementation libs.guava + implementation dependencies.create(libs.guice.get()) { + exclude group: 'com.google.guava' + } + implementation dependencies.create(libs.spark.get()) { + exclude group: 'org.eclipse.jetty' + } + implementation libs.bundles.jetty + implementation libs.bundles.gson + implementation libs.bundles.mariadb + + + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' + testImplementation project(':code:libraries:test-helpers') + +} diff --git a/code/functions/live-capture/java/nu/marginalia/livecapture/BrowserlessClient.java b/code/functions/live-capture/java/nu/marginalia/livecapture/BrowserlessClient.java new file mode 100644 index 00000000..1d54867a --- /dev/null +++ b/code/functions/live-capture/java/nu/marginalia/livecapture/BrowserlessClient.java @@ -0,0 +1,100 @@ +package nu.marginalia.livecapture; + +import com.google.gson.Gson; +import nu.marginalia.model.gson.GsonFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.net.http.HttpClient; +import java.net.http.HttpRequest; +import java.net.http.HttpResponse; +import java.time.Duration; +import java.util.Map; + +/** Client for local browserless.io API */ +public class BrowserlessClient implements AutoCloseable { + private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class); + + private final HttpClient httpClient = HttpClient.newBuilder() + .version(HttpClient.Version.HTTP_1_1) + .connectTimeout(Duration.ofSeconds(30)) + .build(); + + private final URI browserlessURI; + private final Gson gson = GsonFactory.get(); + + public BrowserlessClient(URI browserlessURI) { + this.browserlessURI = browserlessURI; + } + + public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException { + Map requestData = Map.of( + "url", url, + "gotoOptions", gotoOptions + ); + + var request = HttpRequest.newBuilder() + .uri(browserlessURI.resolve("/content")) + .method("POST", HttpRequest.BodyPublishers.ofString( + gson.toJson(requestData) + )) + .header("Content-type", "application/json") + .build(); + + var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString()); + + if (rsp.statusCode() >= 300) { + logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode()); + return null; + } + + return rsp.body(); + } + + public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions) + throws IOException, InterruptedException { + + Map requestData = Map.of( + "url", url, + "options", screenshotOptions, + "gotoOptions", gotoOptions + ); + + var request = HttpRequest.newBuilder() + .uri(browserlessURI.resolve("/screenshot")) + .method("POST", HttpRequest.BodyPublishers.ofString( + gson.toJson(requestData) + )) + .header("Content-type", "application/json") + .build(); + + var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofByteArray()); + + if (rsp.statusCode() >= 300) { + logger.info("Failed to fetch screenshot for {}, status {}", url, rsp.statusCode()); + } + + return rsp.body(); + + } + + @Override + public void close() throws Exception { + httpClient.shutdownNow(); + } + + public record ScreenshotOptions(boolean fullPage, String type) { + public static ScreenshotOptions defaultValues() { + return new ScreenshotOptions(false, "png"); + } + } + + public record GotoOptions(String waitUntil, long timeout) { + public static GotoOptions defaultValues() { + return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis()); + } + } + +} diff --git a/code/functions/live-capture/java/nu/marginalia/livecapture/LiveCaptureGrpcService.java b/code/functions/live-capture/java/nu/marginalia/livecapture/LiveCaptureGrpcService.java new file mode 100644 index 00000000..34b5052b --- /dev/null +++ b/code/functions/live-capture/java/nu/marginalia/livecapture/LiveCaptureGrpcService.java @@ -0,0 +1,186 @@ +package nu.marginalia.livecapture; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import io.grpc.stub.StreamObserver; +import jakarta.inject.Named; +import nu.marginalia.api.livecapture.Empty; +import nu.marginalia.api.livecapture.LiveCaptureApiGrpc; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.service.server.DiscoverableService; +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.net.URI; +import java.net.URISyntaxException; +import java.sql.Connection; +import java.sql.SQLException; +import java.util.Arrays; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.LinkedBlockingQueue; + +/** GRPC service for on-demand capture of website screenshots */ +public class LiveCaptureGrpcService + extends LiveCaptureApiGrpc.LiveCaptureApiImplBase + implements DiscoverableService +{ + + private static final Logger logger = LoggerFactory.getLogger(LiveCaptureGrpcService.class); + + private final URI browserlessURI; + private final boolean serviceEnabled; + private final LinkedBlockingQueue requestedScreenshots = new LinkedBlockingQueue<>(128); + private final HikariDataSource dataSource; + + record ScheduledScreenshot(int domainId) {} + + // Ensure that the service is only registered if it is enabled + @Override + public boolean shouldRegisterService() { + return serviceEnabled; + } + + @Inject + public LiveCaptureGrpcService(HikariDataSource dataSource, + @Named("browserless-uri") String browserlessAddress, + @Named("browserless-agent-threads") int threads, + ServiceConfiguration serviceConfiguration + ) throws URISyntaxException { + this.dataSource = dataSource; + + if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) { + logger.warn("Live capture service will not run"); + serviceEnabled = false; + browserlessURI = null; // satisfy final + } + else { + browserlessURI = new URI(browserlessAddress); + serviceEnabled = true; + + for (int i = 0; i < threads; i++) { + Thread.ofPlatform().daemon().name("Capture Agent " + i).start(new ScreenshotCaptureAgent()); + } + } + } + + public void requestScreengrab(nu.marginalia.api.livecapture.RpcDomainId request, + StreamObserver responseObserver) + { + if (serviceEnabled) { + try (var conn = dataSource.getConnection()) { + logger.info("Received request for domain {}", request.getDomainId()); + if (ScreenshotDbOperations.isEligibleForScreengrab(conn, request.getDomainId())) { + logger.info("Domain {} is eligible for a screenshot", request.getDomainId()); + // may fail, we don't care about it + requestedScreenshots.offer(new ScheduledScreenshot(request.getDomainId())); + } + else { + logger.info("Domain {} is not eligible for a screenshot", request.getDomainId()); + } + } + catch (SQLException ex) { + logger.error("Failed to check domain eligibility", ex); + } + finally { + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + } + } + else { // service is disabled + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + } + } + + class ScreenshotCaptureAgent implements Runnable { + + // To prevent race conditions, we use this to lock domain ids that are being processed + private static final ConcurrentHashMap domainIdsClaimed = new ConcurrentHashMap<>(); + + @Override + public void run() { + try (BrowserlessClient client = new BrowserlessClient(browserlessURI)) { + while (true) { + capture(client, requestedScreenshots.take()); + } + } catch (InterruptedException e) { + logger.error("Capture agent interrupted", e); + Thread.currentThread().interrupt(); + } catch (Exception e) { + logger.error("Capture agent failed", e); + } + } + + private void capture(BrowserlessClient client, ScheduledScreenshot scheduledScreenshot) { + // Only one agent should capture a screenshot for a domain, so we skip if another agent has claimed it + if (domainIdsClaimed.put(scheduledScreenshot.domainId(), Boolean.TRUE) != null) { + return; + } + + try (var conn = dataSource.getConnection()) { + // Double check if the domain is still eligible for a screenshot + if (!ScreenshotDbOperations.isEligibleForScreengrab(conn, scheduledScreenshot.domainId)) { + return; + } + + var domainNameOpt = ScreenshotDbOperations.getDomainName(conn, scheduledScreenshot.domainId()); + if (domainNameOpt.isEmpty()) { + logger.error("Failed to get domain name for domain {}", scheduledScreenshot.domainId()); + } + else { + EdgeDomain domain = domainNameOpt.get(); + String domainNameStr = domain.toString(); + + if (!isValidDomainForCapture(domain)) { + logger.error("Invalid domain name {}", domainNameStr); + ScreenshotDbOperations.flagDomainAsFetched(conn, domain); + } + else { + grab(client, conn, domain); + } + } + } + catch (SQLException ex) { + logger.error("Failed to check domain eligibility", ex); + } + finally { + // Release the domain ID so that another agent can claim it + // at this point we can assume the database will cover the + // case where the domain is no longer eligible + domainIdsClaimed.remove(scheduledScreenshot.domainId()); + } + } + + private boolean isValidDomainForCapture(EdgeDomain domain) { + String domainNameStr = domain.toString(); + String[] parts = domainNameStr.split("\\."); + + if (parts.length < 2) { + return false; + } + + if (Arrays.stream(parts).allMatch(StringUtils::isNumeric)) { + // IP address + return false; + } + + return true; + } + + private void grab(BrowserlessClient client, Connection conn, EdgeDomain domain) { + try { + logger.info("Capturing {}", domain); + + byte[] pngBytes = client.screenshot(domain.toRootUrlHttps().toString(), + BrowserlessClient.GotoOptions.defaultValues(), + BrowserlessClient.ScreenshotOptions.defaultValues()); + ScreenshotDbOperations.uploadScreenshot(conn, domain, pngBytes); + } catch (Exception e) { + ScreenshotDbOperations.flagDomainAsFetched(conn, domain); + } + } + } + +} diff --git a/code/functions/live-capture/java/nu/marginalia/livecapture/LivecaptureModule.java b/code/functions/live-capture/java/nu/marginalia/livecapture/LivecaptureModule.java new file mode 100644 index 00000000..ff0ec073 --- /dev/null +++ b/code/functions/live-capture/java/nu/marginalia/livecapture/LivecaptureModule.java @@ -0,0 +1,15 @@ +package nu.marginalia.livecapture; + +import com.google.inject.AbstractModule; +import com.google.inject.name.Names; + +public class LivecaptureModule extends AbstractModule { + public void configure() { + bind(String.class) + .annotatedWith(Names.named("browserless-uri")) + .toInstance(System.getProperty("live-capture.browserless-uri", "")); + bind(Integer.class) + .annotatedWith(Names.named("browserless-agent-threads")) + .toInstance(Integer.parseInt(System.getProperty("live-capture.browserless-agent-threads", "4"))); + } +} diff --git a/code/functions/live-capture/java/nu/marginalia/livecapture/ScreenshotDbOperations.java b/code/functions/live-capture/java/nu/marginalia/livecapture/ScreenshotDbOperations.java new file mode 100644 index 00000000..10e88460 --- /dev/null +++ b/code/functions/live-capture/java/nu/marginalia/livecapture/ScreenshotDbOperations.java @@ -0,0 +1,85 @@ +package nu.marginalia.livecapture; + +import nu.marginalia.model.EdgeDomain; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.sql.Connection; +import java.sql.SQLException; +import java.util.Optional; + +public class ScreenshotDbOperations { + + private static final Logger logger = LoggerFactory.getLogger(ScreenshotDbOperations.class); + + public synchronized static void flagDomainAsFetched(Connection conn, EdgeDomain domain) { + try (var stmt = conn.prepareStatement(""" + REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE) + VALUES (?, NOW()) + """)) + { + stmt.setString(1, domain.toString()); + stmt.executeUpdate(); + } catch (SQLException e) { + logger.error("Failed to flag domain as fetched", e); + } + } + + public synchronized static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] pngBytes) { + try (var stmt = conn.prepareStatement(""" + REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA) + VALUES (?,?,?) + """); + var is = new ByteArrayInputStream(pngBytes) + ) { + stmt.setString(1, domain.toString()); + stmt.setString(2, "image/png"); + stmt.setBlob(3, is); + stmt.executeUpdate(); + } catch (SQLException | IOException e) { + logger.error("Failed to upload screenshot", e); + } + + flagDomainAsFetched(conn, domain); + } + + public static boolean isEligibleForScreengrab(Connection conn, int domainId) { + try (var stmt = conn.prepareStatement(""" + SELECT 1 FROM DATA_DOMAIN_HISTORY + INNER JOIN WMSA_prod.EC_DOMAIN ON DATA_DOMAIN_HISTORY.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME + WHERE EC_DOMAIN.ID = ? + AND SCREENSHOT_DATE > DATE_SUB(NOW(), INTERVAL 1 MONTH) + """)) + { + stmt.setInt(1, domainId); + + try (var rs = stmt.executeQuery()) { + return !rs.next(); + } + } catch (SQLException e) { + logger.error("Failed to check eligibility for screengrab", e); + return false; + } + } + + public static Optional getDomainName(Connection conn, int domainId) { + try (var stmt = conn.prepareStatement(""" + SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID = ? + """)) + { + stmt.setInt(1, domainId); + + try (var rs = stmt.executeQuery()) { + if (rs.next()) { + return Optional.of(rs.getString(1)).map(EdgeDomain::new); + } + } + } + catch (SQLException ex) { + logger.error("Failed to get domain name", ex); + } + return Optional.empty(); + } +} diff --git a/code/functions/live-capture/test/nu/marginalia/livecapture/BrowserlessClientTest.java b/code/functions/live-capture/test/nu/marginalia/livecapture/BrowserlessClientTest.java new file mode 100644 index 00000000..f2db0a02 --- /dev/null +++ b/code/functions/live-capture/test/nu/marginalia/livecapture/BrowserlessClientTest.java @@ -0,0 +1,37 @@ +package nu.marginalia.livecapture; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.junit.jupiter.Testcontainers; +import org.testcontainers.utility.DockerImageName; + +import java.net.URI; + +@Testcontainers +public class BrowserlessClientTest { + static GenericContainer container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000); + + @BeforeAll + public static void setup() { + container.start(); + } + + @Test + public void testContent() throws Exception { + try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { + var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues()); + Assertions.assertNotNull(content, "Content should not be null"); + Assertions.assertFalse(content.isBlank(), "Content should not be empty"); + } + } + + @Test + public void testScreenshot() throws Exception { + try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) { + var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues()); + Assertions.assertNotNull(screenshot, "Screenshot should not be null"); + } + } +} diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle index 998b7f26..e2d6cb42 100644 --- a/code/services-application/search-service/build.gradle +++ b/code/services-application/search-service/build.gradle @@ -42,6 +42,7 @@ dependencies { implementation project(':code:libraries:braille-block-punch-cards') implementation project(':code:libraries:term-frequency-dict') + implementation project(':code:functions:live-capture:api') implementation project(':code:functions:math:api') implementation project(':code:functions:domain-info:api') implementation project(':code:functions:search-query:api') diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchBrowseService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchBrowseService.java index 6ba3fa49..11c2e0e8 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchBrowseService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchBrowseService.java @@ -77,7 +77,7 @@ public class SearchBrowseService { if (resultDomain.isEmpty()) continue; - results.add(new BrowseResult(resultDomain.get().toRootUrl(), sd.domainId(), 0, sd.screenshot())); + results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot())); } // shuffle the items for a less repetitive experience shuffle(neighbors); diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java index 8c4bfc62..7bb4f319 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -4,6 +4,7 @@ import com.google.inject.Inject; import nu.marginalia.api.domains.DomainInfoClient; import nu.marginalia.api.domains.model.DomainInformation; import nu.marginalia.api.domains.model.SimilarDomain; +import nu.marginalia.api.livecapture.LiveCaptureClient; import nu.marginalia.db.DbDomainQueries; import nu.marginalia.feedlot.FeedlotClient; import nu.marginalia.feedlot.model.FeedItems; @@ -37,6 +38,7 @@ public class SearchSiteInfoService { private final DbDomainQueries domainQueries; private final MustacheRenderer renderer; private final FeedlotClient feedlotClient; + private final LiveCaptureClient liveCaptureClient; private final ScreenshotService screenshotService; @Inject @@ -46,6 +48,7 @@ public class SearchSiteInfoService { SearchFlagSiteService flagSiteService, DbDomainQueries domainQueries, FeedlotClient feedlotClient, + LiveCaptureClient liveCaptureClient, ScreenshotService screenshotService) throws IOException { this.searchOperator = searchOperator; @@ -56,6 +59,7 @@ public class SearchSiteInfoService { this.renderer = rendererFactory.renderer("search/site-info/site-info"); this.feedlotClient = feedlotClient; + this.liveCaptureClient = liveCaptureClient; this.screenshotService = screenshotService; } @@ -165,7 +169,7 @@ public class SearchSiteInfoService { logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage()); } - return new SiteInfoWithContext(domainName, + var result = new SiteInfoWithContext(domainName, domainId, url, hasScreenshot, @@ -175,6 +179,46 @@ public class SearchSiteInfoService { feedItems, sampleResults ); + + requestMissingScreenshots(result); + + return result; + } + + /** Request missing screenshots for the given site info */ + private void requestMissingScreenshots(SiteInfoWithContext result) { + int requests = 0; + if (!result.hasScreenshot()) { + liveCaptureClient.requestScreengrab((int) result.domainId()); + requests++; + } + + if (result.similar() != null) { + for (var similar : result.similar()) { + if (similar.screenshot()) { + continue; + } + if (++requests > 5) { + break; + } + + liveCaptureClient.requestScreengrab(similar.domainId()); + } + } + + if (result.linking() != null) { + for (var linking : result.linking()) { + if (linking.screenshot()) { + continue; + } + if (++requests > 5) { + break; + } + + liveCaptureClient.requestScreengrab(linking.domainId()); + } + } + } private T waitForFuture(Future future, Supplier fallback) { @@ -233,7 +277,7 @@ public class SearchSiteInfoService { public record SiteInfoWithContext(Map view, Map domainState, String domain, - long domainId, + int domainId, String siteUrl, boolean hasScreenshot, DomainInformation domainInformation, @@ -243,7 +287,7 @@ public class SearchSiteInfoService { List samples ) { public SiteInfoWithContext(String domain, - long domainId, + int domainId, String siteUrl, boolean hasScreenshot, DomainInformation domainInformation, diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index 1dd2cfd6..46000906 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -25,6 +25,8 @@ apply from: "$rootProject.projectDir/docker.gradle" dependencies { implementation project(':third-party:symspell') + implementation project(':code:functions:live-capture') + implementation project(':code:functions:live-capture:api') implementation project(':code:functions:math') implementation project(':code:functions:math:api') implementation project(':code:functions:domain-info') diff --git a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java index 8d515052..ddafcad2 100644 --- a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java +++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java @@ -3,13 +3,14 @@ package nu.marginalia.assistant; import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; +import nu.marginalia.livecapture.LivecaptureModule; import nu.marginalia.service.MainClass; -import nu.marginalia.service.discovery.ServiceRegistryIf; -import nu.marginalia.service.module.ServiceConfiguration; -import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; -import nu.marginalia.service.module.ServiceConfigurationModule; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.service.module.ServiceConfigurationModule; +import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.server.Initialization; public class AssistantMain extends MainClass { @@ -25,6 +26,7 @@ public class AssistantMain extends MainClass { Injector injector = Guice.createInjector( new AssistantModule(), + new LivecaptureModule(), new ServiceConfigurationModule(ServiceId.Assistant), new ServiceDiscoveryModule(), new DatabaseModule(false) diff --git a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantService.java b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantService.java index c048ac24..9bff6c38 100644 --- a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantService.java +++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantService.java @@ -6,10 +6,12 @@ import lombok.SneakyThrows; import nu.marginalia.assistant.suggest.Suggestions; import nu.marginalia.functions.domains.DomainInfoGrpcService; import nu.marginalia.functions.math.MathGrpcService; +import nu.marginalia.livecapture.LiveCaptureGrpcService; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.service.discovery.property.ServicePartition; -import nu.marginalia.service.server.*; +import nu.marginalia.service.server.BaseServiceParams; +import nu.marginalia.service.server.Service; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import spark.Request; @@ -28,12 +30,11 @@ public class AssistantService extends Service { public AssistantService(BaseServiceParams params, ScreenshotService screenshotService, DomainInfoGrpcService domainInfoGrpcService, + LiveCaptureGrpcService liveCaptureGrpcService, MathGrpcService mathGrpcService, Suggestions suggestions) { - super(params, - ServicePartition.any(), - List.of(domainInfoGrpcService, mathGrpcService)); + super(params, ServicePartition.any(), List.of(domainInfoGrpcService, mathGrpcService, liveCaptureGrpcService)); this.suggestions = suggestions; diff --git a/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java b/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java index 8432b80e..56734448 100644 --- a/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java +++ b/code/tools/screenshot-capture-tool/java/nu/marginalia/screenshot/ScreenshotCaptureToolMain.java @@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain { private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) { try { Map requestData = Map.of( - "url", domain.toRootUrl().toString(), + "url", domain.toRootUrlHttps().toString(), "options", Map.of("fullPage", false, "type", "png"), diff --git a/settings.gradle b/settings.gradle index 9d4810e5..7c87f7b9 100644 --- a/settings.gradle +++ b/settings.gradle @@ -25,6 +25,9 @@ include 'code:functions:link-graph:api' include 'code:functions:search-query' include 'code:functions:search-query:api' +include 'code:functions:live-capture' +include 'code:functions:live-capture:api' + include 'code:execution' include 'code:execution:api'