Add a new function 'Live Capture' for on-demand screenshot capture

The screenshots are requested by the site-service, and triggered via the site-info view.
This commit is contained in:
Viktor Lofgren 2024-09-27 13:45:54 +02:00
parent 1bd29a586c
commit 23cce0c78a
22 changed files with 663 additions and 26 deletions

View File

@ -92,10 +92,13 @@ public class EdgeDomain implements Serializable {
public EdgeUrl toRootUrl() { public EdgeUrl toRootUrlHttp() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
return new EdgeUrl("http", this, null, "/", null); return new EdgeUrl("http", this, null, "/", null);
} }
/** Builds the root URL (path "/") for this domain using the "https" protocol. */
public EdgeUrl toRootUrlHttps() {
return new EdgeUrl("https", this, null, "/", null);
}
public String toString() { public String toString() {
return getAddress(); return getAddress();

View File

@ -20,7 +20,6 @@ public sealed interface ServicePartition {
public String identifier() { public String identifier() {
return Integer.toString(node); return Integer.toString(node);
} }
} }
record None() implements ServicePartition, PartitionTraits.NoGrpc { record None() implements ServicePartition, PartitionTraits.NoGrpc {
public String identifier() { return ""; } public String identifier() { return ""; }

View File

@ -4,13 +4,14 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist; import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.*; import java.util.ArrayList;
import java.util.List;
@Singleton @Singleton
public class DbBrowseDomainsRandom { public class DbBrowseDomainsRandom {
@ -47,7 +48,7 @@ public class DbBrowseDomainsRandom {
boolean indexed = rsp.getBoolean("INDEXED"); boolean indexed = rsp.getBoolean("INDEXED");
if (!blacklist.isBlacklisted(id)) { if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, indexed)); domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrlHttp(), id, 0, indexed));
} }
} }
} }

View File

@ -4,13 +4,14 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.browse.model.BrowseResult; import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.db.DomainBlacklist; import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.*; import java.util.ArrayList;
import java.util.List;
@Singleton @Singleton
public class DbBrowseDomainsSimilarCosine { public class DbBrowseDomainsSimilarCosine {
@ -53,7 +54,7 @@ public class DbBrowseDomainsSimilarCosine {
boolean indexed = rsp.getBoolean("INDEXED"); boolean indexed = rsp.getBoolean("INDEXED");
if (!blacklist.isBlacklisted(id)) { if (!blacklist.isBlacklisted(id)) {
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness, indexed)); domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrlHttp(), id, relatedness, indexed));
} }
} }
} }

View File

@ -9,7 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap;
import gnu.trove.set.TIntSet; import gnu.trove.set.TIntSet;
import gnu.trove.set.hash.TIntHashSet; import gnu.trove.set.hash.TIntHashSet;
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap; import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
import nu.marginalia.api.domains.*; import nu.marginalia.api.domains.RpcSimilarDomain;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
@ -21,8 +21,6 @@ import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.stream.IntStream; import java.util.stream.IntStream;
public class SimilarDomainsService { public class SimilarDomainsService {
@ -227,7 +225,7 @@ public class SimilarDomainsService {
domains.add(RpcSimilarDomain.newBuilder() domains.add(RpcSimilarDomain.newBuilder()
.setDomainId(id) .setDomainId(id)
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString()) .setUrl(new EdgeDomain(domainNames[idx]).toRootUrlHttp().toString())
.setRelatedness(getRelatedness(domainId, id)) .setRelatedness(getRelatedness(domainId, id))
.setRank(domainRanks.get(idx)) .setRank(domainRanks.get(idx))
.setIndexed(indexedDomains.contains(idx)) .setIndexed(indexedDomains.contains(idx))
@ -342,7 +340,7 @@ public class SimilarDomainsService {
domains.add(RpcSimilarDomain.newBuilder() domains.add(RpcSimilarDomain.newBuilder()
.setDomainId(id) .setDomainId(id)
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString()) .setUrl(new EdgeDomain(domainNames[idx]).toRootUrlHttp().toString())
.setRelatedness(getRelatedness(domainId, id)) .setRelatedness(getRelatedness(domainId, id))
.setRank(domainRanks.get(idx)) .setRank(domainRanks.get(idx))
.setIndexed(indexedDomains.contains(idx)) .setIndexed(indexedDomains.contains(idx))

View File

@ -0,0 +1,45 @@
// Build script for the live-capture API module (protobuf definitions and gRPC client).
plugins {
    id 'java'

    id "com.google.protobuf" version "0.9.4"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

jar.archiveBaseName = 'live-capture-api'

apply from: "$rootProject.projectDir/protobuf.gradle"
apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    implementation libs.notnull
    // guava is declared once here and excluded from the transitive
    // dependencies of guice and grpc below
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.bundles.protobuf
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

View File

@ -0,0 +1,36 @@
package nu.marginalia.api.livecapture;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc.LiveCaptureApiBlockingStub;
import nu.marginalia.service.client.GrpcChannelPoolFactory;
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Singleton
public class LiveCaptureClient {
private static final Logger logger = LoggerFactory.getLogger(LiveCaptureClient.class);
private final GrpcSingleNodeChannelPool<LiveCaptureApiBlockingStub> channelPool;
@Inject
public LiveCaptureClient(GrpcChannelPoolFactory factory) {
// The client is only interested in the primary node
var key = ServiceKey.forGrpcApi(LiveCaptureApiGrpc.class, ServicePartition.any());
this.channelPool = factory.createSingle(key, LiveCaptureApiGrpc::newBlockingStub);
}
public void requestScreengrab(int domainId) {
try {
channelPool.call(LiveCaptureApiBlockingStub::requestScreengrab)
.run(RpcDomainId.newBuilder().setDomainId(domainId).build());
}
catch (Exception e) {
logger.error("API Exception", e);
}
}
}

View File

@ -0,0 +1,18 @@
syntax="proto3";
package nu.marginalia.api.livecapture;
option java_package="nu.marginalia.api.livecapture";
option java_multiple_files=true;
// Service for requesting on-demand screenshot captures.
service LiveCaptureApi {
// Requests a screen grab for the domain identified by the given id; the reply carries no data.
rpc requestScreengrab(RpcDomainId) returns (Empty) {}
}
// NOTE(review): not referenced by LiveCaptureApi in this file and looks redundant
// with Empty -- confirm there are no other users before removing.
message Void {
}
// Request message carrying the numeric id of a domain.
message RpcDomainId {
int32 domainId = 1;
}
// Reply message without any fields, returned by requestScreengrab.
message Empty {}

View File

@ -0,0 +1,60 @@
// Build script for the live-capture function (gRPC service and browserless client).
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:functions:live-capture:api')

    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    implementation libs.bundles.slf4j
    implementation libs.commons.lang3

    implementation libs.prometheus
    // guava is declared once here and excluded from the transitive
    // dependencies of grpc and guice below
    implementation libs.guava
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    implementation libs.notnull
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    // spark's own jetty is excluded; jetty comes from the jetty bundle instead
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }
    implementation libs.bundles.jetty
    implementation libs.bundles.gson
    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')
}

View File

@ -0,0 +1,100 @@
package nu.marginalia.livecapture;
import com.google.gson.Gson;
import nu.marginalia.model.gson.GsonFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.Map;
/** Client for local browserless.io API.
 * <p>
 * Requests are sent as JSON over HTTP/1.1 POST to the endpoints resolved
 * against the configured base URI.
 */
public class BrowserlessClient implements AutoCloseable {
    private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);

    private final HttpClient httpClient = HttpClient.newBuilder()
            .version(HttpClient.Version.HTTP_1_1)
            .connectTimeout(Duration.ofSeconds(30))
            .build();

    private final URI browserlessURI;
    private final Gson gson = GsonFactory.get();

    /**
     * @param browserlessURI base URI of the browserless instance; endpoint paths are resolved against it
     */
    public BrowserlessClient(URI browserlessURI) {
        this.browserlessURI = browserlessURI;
    }

    /** Fetches the content of the given URL via the {@code /content} endpoint.
     *
     * @param url         the URL to load
     * @param gotoOptions navigation options sent as {@code gotoOptions}
     * @return the response body, or {@code null} if the response status was 300 or above
     * @throws IOException          if the HTTP exchange fails
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
        Map<String, Object> requestData = Map.of(
                "url", url,
                "gotoOptions", gotoOptions
        );

        var rsp = httpClient.send(jsonPost("/content", requestData), HttpResponse.BodyHandlers.ofString());

        if (rsp.statusCode() >= 300) {
            logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
            return null;
        }

        return rsp.body();
    }

    /** Captures a screenshot of the given URL via the {@code /screenshot} endpoint.
     *
     * @param url               the URL to capture
     * @param gotoOptions       navigation options sent as {@code gotoOptions}
     * @param screenshotOptions screenshot options sent as {@code options}
     * @return the raw response body of a successful capture
     * @throws IOException          if the HTTP exchange fails, or the response status is 300 or above
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
            throws IOException, InterruptedException {

        Map<String, Object> requestData = Map.of(
                "url", url,
                "options", screenshotOptions,
                "gotoOptions", gotoOptions
        );

        var rsp = httpClient.send(jsonPost("/screenshot", requestData), HttpResponse.BodyHandlers.ofByteArray());

        if (rsp.statusCode() >= 300) {
            logger.info("Failed to fetch screenshot for {}, status {}", url, rsp.statusCode());

            // The body of a failed response is an error payload, not image data;
            // it must never be handed back to the caller as a screenshot
            throw new IOException("Screenshot request for " + url + " failed with status " + rsp.statusCode());
        }

        return rsp.body();
    }

    /** Builds a POST request carrying the given data as a JSON body to the given endpoint path */
    private HttpRequest jsonPost(String path, Map<String, Object> requestData) {
        return HttpRequest.newBuilder()
                .uri(browserlessURI.resolve(path))
                .method("POST", HttpRequest.BodyPublishers.ofString(
                        gson.toJson(requestData)
                ))
                .header("Content-type", "application/json")
                .build();
    }

    /** Shuts down the underlying HTTP client immediately */
    @Override
    public void close() throws Exception {
        httpClient.shutdownNow();
    }

    /** Options serialized as the {@code options} object of a screenshot request */
    public record ScreenshotOptions(boolean fullPage, String type) {
        /** Non-full-page capture of type "png" */
        public static ScreenshotOptions defaultValues() {
            return new ScreenshotOptions(false, "png");
        }
    }

    /** Options serialized as the {@code gotoOptions} object of a request */
    public record GotoOptions(String waitUntil, long timeout) {
        /** Waits for "networkidle2" with a timeout of 10 seconds, expressed in milliseconds */
        public static GotoOptions defaultValues() {
            return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
        }
    }
}

View File

@ -0,0 +1,186 @@
package nu.marginalia.livecapture;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import io.grpc.stub.StreamObserver;
import jakarta.inject.Named;
import nu.marginalia.api.livecapture.Empty;
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.server.DiscoverableService;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
/** GRPC service for on-demand capture of website screenshots.
 * <p>
 * Incoming requests are checked for eligibility and placed on a bounded queue,
 * which is drained by a number of daemon "Capture Agent" threads that take
 * the screenshots through a {@link BrowserlessClient}.
 * <p>
 * The service is disabled when no browserless address is configured,
 * or when the service configuration reports a node greater than 1.
 */
public class LiveCaptureGrpcService
        extends LiveCaptureApiGrpc.LiveCaptureApiImplBase
        implements DiscoverableService
{
    private static final Logger logger = LoggerFactory.getLogger(LiveCaptureGrpcService.class);

    private final URI browserlessURI;
    private final boolean serviceEnabled;

    /** Pending capture requests; bounded, so requests are dropped when the agents can't keep up */
    private final LinkedBlockingQueue<ScheduledScreenshot> requestedScreenshots = new LinkedBlockingQueue<>(128);
    private final HikariDataSource dataSource;

    record ScheduledScreenshot(int domainId) {}

    // Ensure that the service is only registered if it is enabled
    @Override
    public boolean shouldRegisterService() {
        return serviceEnabled;
    }

    /**
     * @param dataSource           source of database connections
     * @param browserlessAddress   address of the browserless instance; an empty value disables the service
     * @param threads              number of capture agent threads to start when the service is enabled
     * @param serviceConfiguration configuration of the hosting service; the node number is consulted
     * @throws URISyntaxException if the browserless address is not a valid URI
     */
    @Inject
    public LiveCaptureGrpcService(HikariDataSource dataSource,
                                  @Named("browserless-uri") String browserlessAddress,
                                  @Named("browserless-agent-threads") int threads,
                                  ServiceConfiguration serviceConfiguration
    ) throws URISyntaxException {
        this.dataSource = dataSource;

        if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
            logger.warn("Live capture service will not run");
            serviceEnabled = false;
            browserlessURI = null; // satisfy final
        }
        else {
            browserlessURI = new URI(browserlessAddress);
            serviceEnabled = true;

            for (int i = 0; i < threads; i++) {
                Thread.ofPlatform().daemon().name("Capture Agent " + i).start(new ScreenshotCaptureAgent());
            }
        }
    }

    /** Schedules a screenshot for the requested domain if the service is enabled and the
     * domain is eligible.  An empty response is always sent, regardless of the outcome.
     */
    @Override
    public void requestScreengrab(nu.marginalia.api.livecapture.RpcDomainId request,
                                  StreamObserver<Empty> responseObserver)
    {
        try {
            if (serviceEnabled) {
                enqueueIfEligible(request.getDomainId());
            }
        }
        finally {
            responseObserver.onNext(Empty.getDefaultInstance());
            responseObserver.onCompleted();
        }
    }

    /** Puts the domain on the capture queue if the database considers it eligible for a screenshot */
    private void enqueueIfEligible(int domainId) {
        try (var conn = dataSource.getConnection()) {
            logger.info("Received request for domain {}", domainId);

            if (ScreenshotDbOperations.isEligibleForScreengrab(conn, domainId)) {
                logger.info("Domain {} is eligible for a screenshot", domainId);

                // The offer fails when the queue is full; dropping the request is acceptable
                if (!requestedScreenshots.offer(new ScheduledScreenshot(domainId))) {
                    logger.debug("Capture queue is full, dropping request for domain {}", domainId);
                }
            }
            else {
                logger.info("Domain {} is not eligible for a screenshot", domainId);
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to check domain eligibility", ex);
        }
    }

    /** Worker that takes scheduled screenshots off the queue and captures them one at a time */
    class ScreenshotCaptureAgent implements Runnable {

        // To prevent race conditions, we use this to lock domain ids that are being processed
        private static final ConcurrentHashMap<Integer, Boolean> domainIdsClaimed = new ConcurrentHashMap<>();

        @Override
        public void run() {
            try (BrowserlessClient client = new BrowserlessClient(browserlessURI)) {
                while (true) {
                    ScheduledScreenshot next = requestedScreenshots.take();

                    try {
                        capture(client, next);
                    }
                    catch (RuntimeException e) {
                        // A single bad domain must not take the agent thread down with it
                        logger.error("Capture failed for domain {}", next.domainId(), e);
                    }
                }
            } catch (InterruptedException e) {
                logger.error("Capture agent interrupted", e);
                Thread.currentThread().interrupt();
            } catch (Exception e) {
                logger.error("Capture agent failed", e);
            }
        }

        /** Captures a screenshot for the scheduled domain, unless another agent is already
         * working on it, or it is no longer eligible. */
        private void capture(BrowserlessClient client, ScheduledScreenshot scheduledScreenshot) {
            final int domainId = scheduledScreenshot.domainId();

            // Only one agent should capture a screenshot for a domain, so we skip if another agent has claimed it
            if (domainIdsClaimed.putIfAbsent(domainId, Boolean.TRUE) != null) {
                return;
            }

            try (var conn = dataSource.getConnection()) {
                // Double check if the domain is still eligible for a screenshot
                if (!ScreenshotDbOperations.isEligibleForScreengrab(conn, domainId)) {
                    return;
                }

                var domainNameOpt = ScreenshotDbOperations.getDomainName(conn, domainId);
                if (domainNameOpt.isEmpty()) {
                    logger.error("Failed to get domain name for domain {}", domainId);
                    return;
                }

                EdgeDomain domain = domainNameOpt.get();

                if (!isValidDomainForCapture(domain)) {
                    logger.error("Invalid domain name {}", domain.toString());
                    // Flag the domain so it is not requested over and over
                    ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
                }
                else {
                    grab(client, conn, domain);
                }
            }
            catch (SQLException ex) {
                logger.error("Failed to check domain eligibility", ex);
            }
            finally {
                // Release the domain ID so that another agent can claim it
                // at this point we can assume the database will cover the
                // case where the domain is no longer eligible
                domainIdsClaimed.remove(domainId);
            }
        }

        /** A domain is accepted for capture if it has at least two dot-separated parts,
         * and is not made up of numeric parts only (i.e. looks like an IP address). */
        private boolean isValidDomainForCapture(EdgeDomain domain) {
            String[] parts = domain.toString().split("\\.");

            if (parts.length < 2) {
                return false;
            }

            // All-numeric parts means an IP address
            return !Arrays.stream(parts).allMatch(StringUtils::isNumeric);
        }

        /** Takes a screenshot of the domain's https root URL and uploads it.  On failure,
         * the domain is flagged as fetched without a screenshot being stored. */
        private void grab(BrowserlessClient client, Connection conn, EdgeDomain domain) {
            try {
                logger.info("Capturing {}", domain);

                byte[] pngBytes = client.screenshot(domain.toRootUrlHttps().toString(),
                        BrowserlessClient.GotoOptions.defaultValues(),
                        BrowserlessClient.ScreenshotOptions.defaultValues());

                ScreenshotDbOperations.uploadScreenshot(conn, domain, pngBytes);
            }
            catch (InterruptedException e) {
                // Restore the interrupt flag so that the agent loop terminates on its next take();
                // the capture was not completed, so the domain is left unflagged
                Thread.currentThread().interrupt();
                logger.warn("Interrupted while capturing {}", domain);
            }
            catch (Exception e) {
                logger.warn("Failed to capture {}", domain, e);
                ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
            }
        }
    }
}

View File

@ -0,0 +1,15 @@
package nu.marginalia.livecapture;
import com.google.inject.AbstractModule;
import com.google.inject.name.Names;
/** Guice module that exposes the live capture settings, read from JVM system properties,
 * as named bindings. */
public class LivecaptureModule extends AbstractModule {

    public void configure() {
        // Address of the browserless instance; defaults to the empty string
        final String browserlessUri = System.getProperty("live-capture.browserless-uri", "");
        bind(String.class)
                .annotatedWith(Names.named("browserless-uri"))
                .toInstance(browserlessUri);

        // Number of capture agent threads; defaults to 4
        final Integer agentThreads = Integer.parseInt(
                System.getProperty("live-capture.browserless-agent-threads", "4"));
        bind(Integer.class)
                .annotatedWith(Names.named("browserless-agent-threads"))
                .toInstance(agentThreads);
    }
}

View File

@ -0,0 +1,85 @@
package nu.marginalia.livecapture;
import nu.marginalia.model.EdgeDomain;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Optional;
/** Database operations backing the live capture of screenshots.
 * <p>
 * SQL errors are logged and not propagated by any of the operations.
 */
public class ScreenshotDbOperations {
    private static final Logger logger = LoggerFactory.getLogger(ScreenshotDbOperations.class);

    /** Records the current time as the screenshot date of the domain in DATA_DOMAIN_HISTORY.
     *
     * @param conn   connection to run the statement on
     * @param domain domain to flag
     */
    public synchronized static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
        try (var stmt = conn.prepareStatement("""
                REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE)
                VALUES (?, NOW())
                """))
        {
            stmt.setString(1, domain.toString());
            stmt.executeUpdate();
        } catch (SQLException e) {
            logger.error("Failed to flag domain as fetched", e);
        }
    }

    /** Stores the screenshot for the domain in DATA_DOMAIN_SCREENSHOT with content type
     * "image/png", and flags the domain as fetched.
     * <p>
     * The domain is flagged as fetched even if the screenshot is missing or could not be stored.
     *
     * @param conn     connection to run the statements on
     * @param domain   domain the screenshot belongs to
     * @param pngBytes image data; if null or empty, nothing is stored
     */
    public synchronized static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] pngBytes) {
        if (pngBytes == null || pngBytes.length == 0) {
            // There is no image to store; replacing an existing row with an empty blob would only do harm
            logger.warn("No screenshot data for {}, not storing anything", domain);
            flagDomainAsFetched(conn, domain);
            return;
        }

        try (var stmt = conn.prepareStatement("""
                REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA)
                VALUES (?,?,?)
                """);
             var is = new ByteArrayInputStream(pngBytes)
        ) {
            stmt.setString(1, domain.toString());
            stmt.setString(2, "image/png");
            stmt.setBlob(3, is);
            stmt.executeUpdate();
        } catch (SQLException | IOException e) {
            logger.error("Failed to upload screenshot", e);
        }

        flagDomainAsFetched(conn, domain);
    }

    /** Checks whether a screenshot may be taken of the domain with the given id.
     * <p>
     * A domain is eligible when DATA_DOMAIN_HISTORY holds no screenshot date
     * for it within the last month.
     *
     * @param conn     connection to run the query on
     * @param domainId id of the domain in EC_DOMAIN
     * @return true if the domain is eligible; false if it is not, or if the query failed
     */
    public static boolean isEligibleForScreengrab(Connection conn, int domainId) {
        // EC_DOMAIN is left unqualified, like in getDomainName(), so that the
        // query does not depend on a hard-coded schema name
        try (var stmt = conn.prepareStatement("""
                SELECT 1 FROM DATA_DOMAIN_HISTORY
                INNER JOIN EC_DOMAIN ON DATA_DOMAIN_HISTORY.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
                WHERE EC_DOMAIN.ID = ?
                AND SCREENSHOT_DATE > DATE_SUB(NOW(), INTERVAL 1 MONTH)
                """))
        {
            stmt.setInt(1, domainId);

            try (var rs = stmt.executeQuery()) {
                return !rs.next();
            }
        } catch (SQLException e) {
            logger.error("Failed to check eligibility for screengrab", e);
            return false;
        }
    }

    /** Looks up the name of the domain with the given id in EC_DOMAIN.
     *
     * @param conn     connection to run the query on
     * @param domainId id of the domain
     * @return the domain, or empty if there is no such domain, its name is null, or the query failed
     */
    public static Optional<EdgeDomain> getDomainName(Connection conn, int domainId) {
        try (var stmt = conn.prepareStatement("""
                SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID = ?
                """))
        {
            stmt.setInt(1, domainId);

            try (var rs = stmt.executeQuery()) {
                if (rs.next()) {
                    // getString() returns null for SQL NULL, which Optional.of() would reject
                    return Optional.ofNullable(rs.getString(1)).map(EdgeDomain::new);
                }
            }
        }
        catch (SQLException ex) {
            logger.error("Failed to get domain name", ex);
        }

        return Optional.empty();
    }
}

View File

@ -0,0 +1,37 @@
package nu.marginalia.livecapture;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import org.testcontainers.containers.GenericContainer;
import org.testcontainers.junit.jupiter.Testcontainers;
import org.testcontainers.utility.DockerImageName;
import java.net.URI;
/** Exercises {@link BrowserlessClient} against a browserless/chrome container. */
@Testcontainers
public class BrowserlessClientTest {
    static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);

    @BeforeAll
    public static void setup() {
        container.start();
    }

    /** Address of the container's mapped browserless port */
    private static URI browserlessUri() {
        return URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000));
    }

    @Test
    public void testContent() throws Exception {
        try (var client = new BrowserlessClient(browserlessUri())) {
            String html = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());

            Assertions.assertNotNull(html, "Content should not be null");
            Assertions.assertFalse(html.isBlank(), "Content should not be empty");
        }
    }

    @Test
    public void testScreenshot() throws Exception {
        try (var client = new BrowserlessClient(browserlessUri())) {
            byte[] image = client.screenshot(
                    "https://www.marginalia.nu/",
                    BrowserlessClient.GotoOptions.defaultValues(),
                    BrowserlessClient.ScreenshotOptions.defaultValues());

            Assertions.assertNotNull(image, "Screenshot should not be null");
        }
    }
}

View File

@ -42,6 +42,7 @@ dependencies {
implementation project(':code:libraries:braille-block-punch-cards') implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:math:api') implementation project(':code:functions:math:api')
implementation project(':code:functions:domain-info:api') implementation project(':code:functions:domain-info:api')
implementation project(':code:functions:search-query:api') implementation project(':code:functions:search-query:api')

View File

@ -77,7 +77,7 @@ public class SearchBrowseService {
if (resultDomain.isEmpty()) if (resultDomain.isEmpty())
continue; continue;
results.add(new BrowseResult(resultDomain.get().toRootUrl(), sd.domainId(), 0, sd.screenshot())); results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
} }
// shuffle the items for a less repetitive experience // shuffle the items for a less repetitive experience
shuffle(neighbors); shuffle(neighbors);

View File

@ -4,6 +4,7 @@ import com.google.inject.Inject;
import nu.marginalia.api.domains.DomainInfoClient; import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation; import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.livecapture.LiveCaptureClient;
import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.feedlot.FeedlotClient; import nu.marginalia.feedlot.FeedlotClient;
import nu.marginalia.feedlot.model.FeedItems; import nu.marginalia.feedlot.model.FeedItems;
@ -37,6 +38,7 @@ public class SearchSiteInfoService {
private final DbDomainQueries domainQueries; private final DbDomainQueries domainQueries;
private final MustacheRenderer<Object> renderer; private final MustacheRenderer<Object> renderer;
private final FeedlotClient feedlotClient; private final FeedlotClient feedlotClient;
private final LiveCaptureClient liveCaptureClient;
private final ScreenshotService screenshotService; private final ScreenshotService screenshotService;
@Inject @Inject
@ -46,6 +48,7 @@ public class SearchSiteInfoService {
SearchFlagSiteService flagSiteService, SearchFlagSiteService flagSiteService,
DbDomainQueries domainQueries, DbDomainQueries domainQueries,
FeedlotClient feedlotClient, FeedlotClient feedlotClient,
LiveCaptureClient liveCaptureClient,
ScreenshotService screenshotService) throws IOException ScreenshotService screenshotService) throws IOException
{ {
this.searchOperator = searchOperator; this.searchOperator = searchOperator;
@ -56,6 +59,7 @@ public class SearchSiteInfoService {
this.renderer = rendererFactory.renderer("search/site-info/site-info"); this.renderer = rendererFactory.renderer("search/site-info/site-info");
this.feedlotClient = feedlotClient; this.feedlotClient = feedlotClient;
this.liveCaptureClient = liveCaptureClient;
this.screenshotService = screenshotService; this.screenshotService = screenshotService;
} }
@ -165,7 +169,7 @@ public class SearchSiteInfoService {
logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage()); logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage());
} }
return new SiteInfoWithContext(domainName, var result = new SiteInfoWithContext(domainName,
domainId, domainId,
url, url,
hasScreenshot, hasScreenshot,
@ -175,6 +179,46 @@ public class SearchSiteInfoService {
feedItems, feedItems,
sampleResults sampleResults
); );
requestMissingScreenshots(result);
return result;
}
/** Request missing screenshots for the given site info.
 * <p>
 * A capture is requested for the site itself if it has no screenshot, and then
 * for the similar and linking domains that lack one.  A single counter is shared
 * between all three steps, so at most five requests are sent per call.
 */
private void requestMissingScreenshots(SiteInfoWithContext result) {
int requests = 0;
if (!result.hasScreenshot()) {
// NOTE(review): the cast looks redundant if domainId() is declared as int -- confirm
liveCaptureClient.requestScreengrab((int) result.domainId());
requests++;
}
if (result.similar() != null) {
for (var similar : result.similar()) {
// Domains that already have a screenshot don't count against the limit
if (similar.screenshot()) {
continue;
}
if (++requests > 5) {
break;
}
liveCaptureClient.requestScreengrab(similar.domainId());
}
}
if (result.linking() != null) {
for (var linking : result.linking()) {
// Domains that already have a screenshot don't count against the limit
if (linking.screenshot()) {
continue;
}
// The counter carries over from the loop above, so this breaks at once if the limit was already hit
if (++requests > 5) {
break;
}
liveCaptureClient.requestScreengrab(linking.domainId());
}
}
} }
private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) { private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) {
@ -233,7 +277,7 @@ public class SearchSiteInfoService {
public record SiteInfoWithContext(Map<String, Boolean> view, public record SiteInfoWithContext(Map<String, Boolean> view,
Map<String, Boolean> domainState, Map<String, Boolean> domainState,
String domain, String domain,
long domainId, int domainId,
String siteUrl, String siteUrl,
boolean hasScreenshot, boolean hasScreenshot,
DomainInformation domainInformation, DomainInformation domainInformation,
@ -243,7 +287,7 @@ public class SearchSiteInfoService {
List<UrlDetails> samples List<UrlDetails> samples
) { ) {
public SiteInfoWithContext(String domain, public SiteInfoWithContext(String domain,
long domainId, int domainId,
String siteUrl, String siteUrl,
boolean hasScreenshot, boolean hasScreenshot,
DomainInformation domainInformation, DomainInformation domainInformation,

View File

@ -25,6 +25,8 @@ apply from: "$rootProject.projectDir/docker.gradle"
dependencies { dependencies {
implementation project(':third-party:symspell') implementation project(':third-party:symspell')
implementation project(':code:functions:live-capture')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:math') implementation project(':code:functions:math')
implementation project(':code:functions:math:api') implementation project(':code:functions:math:api')
implementation project(':code:functions:domain-info') implementation project(':code:functions:domain-info')

View File

@ -3,13 +3,14 @@ package nu.marginalia.assistant;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import nu.marginalia.livecapture.LivecaptureModule;
import nu.marginalia.service.MainClass; import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId; import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfigurationModule; import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.server.Initialization; import nu.marginalia.service.server.Initialization;
public class AssistantMain extends MainClass { public class AssistantMain extends MainClass {
@ -25,6 +26,7 @@ public class AssistantMain extends MainClass {
Injector injector = Guice.createInjector( Injector injector = Guice.createInjector(
new AssistantModule(), new AssistantModule(),
new LivecaptureModule(),
new ServiceConfigurationModule(ServiceId.Assistant), new ServiceConfigurationModule(ServiceId.Assistant),
new ServiceDiscoveryModule(), new ServiceDiscoveryModule(),
new DatabaseModule(false) new DatabaseModule(false)

View File

@ -6,10 +6,12 @@ import lombok.SneakyThrows;
import nu.marginalia.assistant.suggest.Suggestions; import nu.marginalia.assistant.suggest.Suggestions;
import nu.marginalia.functions.domains.DomainInfoGrpcService; import nu.marginalia.functions.domains.DomainInfoGrpcService;
import nu.marginalia.functions.math.MathGrpcService; import nu.marginalia.functions.math.MathGrpcService;
import nu.marginalia.livecapture.LiveCaptureGrpcService;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.screenshot.ScreenshotService; import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.server.*; import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import spark.Request; import spark.Request;
@ -28,12 +30,11 @@ public class AssistantService extends Service {
public AssistantService(BaseServiceParams params, public AssistantService(BaseServiceParams params,
ScreenshotService screenshotService, ScreenshotService screenshotService,
DomainInfoGrpcService domainInfoGrpcService, DomainInfoGrpcService domainInfoGrpcService,
LiveCaptureGrpcService liveCaptureGrpcService,
MathGrpcService mathGrpcService, MathGrpcService mathGrpcService,
Suggestions suggestions) Suggestions suggestions)
{ {
super(params, super(params, ServicePartition.any(), List.of(domainInfoGrpcService, mathGrpcService, liveCaptureGrpcService));
ServicePartition.any(),
List.of(domainInfoGrpcService, mathGrpcService));
this.suggestions = suggestions; this.suggestions = suggestions;

View File

@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) { private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
try { try {
Map<String, Object> requestData = Map.of( Map<String, Object> requestData = Map.of(
"url", domain.toRootUrl().toString(), "url", domain.toRootUrlHttps().toString(),
"options", "options",
Map.of("fullPage", false, Map.of("fullPage", false,
"type", "png"), "type", "png"),

View File

@ -25,6 +25,9 @@ include 'code:functions:link-graph:api'
include 'code:functions:search-query' include 'code:functions:search-query'
include 'code:functions:search-query:api' include 'code:functions:search-query:api'
include 'code:functions:live-capture'
include 'code:functions:live-capture:api'
include 'code:execution' include 'code:execution'
include 'code:execution:api' include 'code:execution:api'