mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 20:48:59 +00:00
Add a new function 'Live Capture' for on-demand screenshot capture
The screenshots are requested by the site-service, and triggered via the site-info view.
This commit is contained in:
parent
1bd29a586c
commit
23cce0c78a
@ -92,10 +92,13 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
|
||||
|
||||
public EdgeUrl toRootUrl() {
|
||||
public EdgeUrl toRootUrlHttp() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
}
|
||||
public EdgeUrl toRootUrlHttps() {
|
||||
return new EdgeUrl("https", this, null, "/", null);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return getAddress();
|
||||
|
@ -20,7 +20,6 @@ public sealed interface ServicePartition {
|
||||
public String identifier() {
|
||||
return Integer.toString(node);
|
||||
}
|
||||
|
||||
}
|
||||
record None() implements ServicePartition, PartitionTraits.NoGrpc {
|
||||
public String identifier() { return ""; }
|
||||
|
@ -4,13 +4,14 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
public class DbBrowseDomainsRandom {
|
||||
@ -47,7 +48,7 @@ public class DbBrowseDomainsRandom {
|
||||
boolean indexed = rsp.getBoolean("INDEXED");
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, 0, indexed));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrlHttp(), id, 0, indexed));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,13 +4,14 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
public class DbBrowseDomainsSimilarCosine {
|
||||
@ -53,7 +54,7 @@ public class DbBrowseDomainsSimilarCosine {
|
||||
boolean indexed = rsp.getBoolean("INDEXED");
|
||||
|
||||
if (!blacklist.isBlacklisted(id)) {
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrl(), id, relatedness, indexed));
|
||||
domains.add(new BrowseResult(new EdgeDomain(domain).toRootUrlHttp(), id, relatedness, indexed));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,7 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap;
|
||||
import gnu.trove.set.TIntSet;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap;
|
||||
import nu.marginalia.api.domains.*;
|
||||
import nu.marginalia.api.domains.RpcSimilarDomain;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -21,8 +21,6 @@ import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ScheduledExecutorService;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
public class SimilarDomainsService {
|
||||
@ -227,7 +225,7 @@ public class SimilarDomainsService {
|
||||
|
||||
domains.add(RpcSimilarDomain.newBuilder()
|
||||
.setDomainId(id)
|
||||
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString())
|
||||
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrlHttp().toString())
|
||||
.setRelatedness(getRelatedness(domainId, id))
|
||||
.setRank(domainRanks.get(idx))
|
||||
.setIndexed(indexedDomains.contains(idx))
|
||||
@ -342,7 +340,7 @@ public class SimilarDomainsService {
|
||||
|
||||
domains.add(RpcSimilarDomain.newBuilder()
|
||||
.setDomainId(id)
|
||||
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrl().toString())
|
||||
.setUrl(new EdgeDomain(domainNames[idx]).toRootUrlHttp().toString())
|
||||
.setRelatedness(getRelatedness(domainId, id))
|
||||
.setRank(domainRanks.get(idx))
|
||||
.setIndexed(indexedDomains.contains(idx))
|
||||
|
45
code/functions/live-capture/api/build.gradle
Normal file
45
code/functions/live-capture/api/build.gradle
Normal file
@ -0,0 +1,45 @@
|
||||
// Build script for the live-capture API module (protobuf/gRPC client stubs).
plugins {
    id 'java'

    id "com.google.protobuf" version "0.9.4"
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

jar.archiveBaseName = 'live-capture-api'

apply from: "$rootProject.projectDir/protobuf.gradle"
apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:common:model')
    implementation project(':code:common:config')
    implementation project(':code:common:service')

    implementation libs.bundles.slf4j

    implementation libs.prometheus
    implementation libs.notnull
    // guava is declared once here; it is excluded from guice and grpc below
    implementation libs.guava
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.gson
    implementation libs.bundles.protobuf
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

}
|
@ -0,0 +1,36 @@
|
||||
package nu.marginalia.api.livecapture;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc.LiveCaptureApiBlockingStub;
|
||||
import nu.marginalia.service.client.GrpcChannelPoolFactory;
|
||||
import nu.marginalia.service.client.GrpcSingleNodeChannelPool;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@Singleton
|
||||
public class LiveCaptureClient {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LiveCaptureClient.class);
|
||||
|
||||
private final GrpcSingleNodeChannelPool<LiveCaptureApiBlockingStub> channelPool;
|
||||
|
||||
@Inject
|
||||
public LiveCaptureClient(GrpcChannelPoolFactory factory) {
|
||||
// The client is only interested in the primary node
|
||||
var key = ServiceKey.forGrpcApi(LiveCaptureApiGrpc.class, ServicePartition.any());
|
||||
this.channelPool = factory.createSingle(key, LiveCaptureApiGrpc::newBlockingStub);
|
||||
}
|
||||
|
||||
|
||||
public void requestScreengrab(int domainId) {
|
||||
try {
|
||||
channelPool.call(LiveCaptureApiBlockingStub::requestScreengrab)
|
||||
.run(RpcDomainId.newBuilder().setDomainId(domainId).build());
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("API Exception", e);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
syntax = "proto3";
package nu.marginalia.api.livecapture;

option java_package = "nu.marginalia.api.livecapture";
option java_multiple_files = true;

// API for requesting on-demand screenshot capture
service LiveCaptureApi {
    // Request a screengrab of the domain with the given id; the reply carries no data
    rpc requestScreengrab(RpcDomainId) returns (Empty) {}
}

message Void {
}

// Identifies a domain by its numeric id
message RpcDomainId {
    int32 domainId = 1;
}

message Empty {}
|
60
code/functions/live-capture/build.gradle
Normal file
60
code/functions/live-capture/build.gradle
Normal file
@ -0,0 +1,60 @@
|
||||
// Build script for the live-capture function module (gRPC service + browserless client).
plugins {
    id 'java'

    id 'application'
    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:functions:live-capture:api')

    implementation project(':code:common:config')
    implementation project(':code:common:service')
    implementation project(':code:common:model')
    implementation project(':code:common:db')

    implementation libs.bundles.slf4j
    implementation libs.commons.lang3

    implementation libs.prometheus
    // guava is declared once here; it is excluded from grpc and guice below
    implementation libs.guava
    libs.bundles.grpc.get().each {
        implementation dependencies.create(it) {
            exclude group: 'com.google.guava'
        }
    }

    implementation libs.notnull
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }
    implementation libs.bundles.jetty
    implementation libs.bundles.gson
    implementation libs.bundles.mariadb

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')

}
|
@ -0,0 +1,100 @@
|
||||
package nu.marginalia.livecapture;
|
||||
|
||||
import com.google.gson.Gson;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpClient;
|
||||
import java.net.http.HttpRequest;
|
||||
import java.net.http.HttpResponse;
|
||||
import java.time.Duration;
|
||||
import java.util.Map;
|
||||
|
||||
/** Client for local browserless.io API */
|
||||
public class BrowserlessClient implements AutoCloseable {
|
||||
private static final Logger logger = LoggerFactory.getLogger(BrowserlessClient.class);
|
||||
|
||||
private final HttpClient httpClient = HttpClient.newBuilder()
|
||||
.version(HttpClient.Version.HTTP_1_1)
|
||||
.connectTimeout(Duration.ofSeconds(30))
|
||||
.build();
|
||||
|
||||
private final URI browserlessURI;
|
||||
private final Gson gson = GsonFactory.get();
|
||||
|
||||
public BrowserlessClient(URI browserlessURI) {
|
||||
this.browserlessURI = browserlessURI;
|
||||
}
|
||||
|
||||
public String content(String url, GotoOptions gotoOptions) throws IOException, InterruptedException {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"gotoOptions", gotoOptions
|
||||
);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(browserlessURI.resolve("/content"))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
.header("Content-type", "application/json")
|
||||
.build();
|
||||
|
||||
var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||
|
||||
if (rsp.statusCode() >= 300) {
|
||||
logger.info("Failed to fetch content for {}, status {}", url, rsp.statusCode());
|
||||
return null;
|
||||
}
|
||||
|
||||
return rsp.body();
|
||||
}
|
||||
|
||||
public byte[] screenshot(String url, GotoOptions gotoOptions, ScreenshotOptions screenshotOptions)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", url,
|
||||
"options", screenshotOptions,
|
||||
"gotoOptions", gotoOptions
|
||||
);
|
||||
|
||||
var request = HttpRequest.newBuilder()
|
||||
.uri(browserlessURI.resolve("/screenshot"))
|
||||
.method("POST", HttpRequest.BodyPublishers.ofString(
|
||||
gson.toJson(requestData)
|
||||
))
|
||||
.header("Content-type", "application/json")
|
||||
.build();
|
||||
|
||||
var rsp = httpClient.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
||||
|
||||
if (rsp.statusCode() >= 300) {
|
||||
logger.info("Failed to fetch screenshot for {}, status {}", url, rsp.statusCode());
|
||||
}
|
||||
|
||||
return rsp.body();
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws Exception {
|
||||
httpClient.shutdownNow();
|
||||
}
|
||||
|
||||
public record ScreenshotOptions(boolean fullPage, String type) {
|
||||
public static ScreenshotOptions defaultValues() {
|
||||
return new ScreenshotOptions(false, "png");
|
||||
}
|
||||
}
|
||||
|
||||
public record GotoOptions(String waitUntil, long timeout) {
|
||||
public static GotoOptions defaultValues() {
|
||||
return new GotoOptions("networkidle2", Duration.ofSeconds(10).toMillis());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,186 @@
|
||||
package nu.marginalia.livecapture;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import jakarta.inject.Named;
|
||||
import nu.marginalia.api.livecapture.Empty;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureApiGrpc;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.DiscoverableService;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
/** GRPC service for on-demand capture of website screenshots */
|
||||
public class LiveCaptureGrpcService
|
||||
extends LiveCaptureApiGrpc.LiveCaptureApiImplBase
|
||||
implements DiscoverableService
|
||||
{
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(LiveCaptureGrpcService.class);
|
||||
|
||||
private final URI browserlessURI;
|
||||
private final boolean serviceEnabled;
|
||||
private final LinkedBlockingQueue<ScheduledScreenshot> requestedScreenshots = new LinkedBlockingQueue<>(128);
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
record ScheduledScreenshot(int domainId) {}
|
||||
|
||||
// Ensure that the service is only registered if it is enabled
|
||||
@Override
|
||||
public boolean shouldRegisterService() {
|
||||
return serviceEnabled;
|
||||
}
|
||||
|
||||
@Inject
|
||||
public LiveCaptureGrpcService(HikariDataSource dataSource,
|
||||
@Named("browserless-uri") String browserlessAddress,
|
||||
@Named("browserless-agent-threads") int threads,
|
||||
ServiceConfiguration serviceConfiguration
|
||||
) throws URISyntaxException {
|
||||
this.dataSource = dataSource;
|
||||
|
||||
if (StringUtils.isEmpty(browserlessAddress) || serviceConfiguration.node() > 1) {
|
||||
logger.warn("Live capture service will not run");
|
||||
serviceEnabled = false;
|
||||
browserlessURI = null; // satisfy final
|
||||
}
|
||||
else {
|
||||
browserlessURI = new URI(browserlessAddress);
|
||||
serviceEnabled = true;
|
||||
|
||||
for (int i = 0; i < threads; i++) {
|
||||
Thread.ofPlatform().daemon().name("Capture Agent " + i).start(new ScreenshotCaptureAgent());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void requestScreengrab(nu.marginalia.api.livecapture.RpcDomainId request,
|
||||
StreamObserver<Empty> responseObserver)
|
||||
{
|
||||
if (serviceEnabled) {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
logger.info("Received request for domain {}", request.getDomainId());
|
||||
if (ScreenshotDbOperations.isEligibleForScreengrab(conn, request.getDomainId())) {
|
||||
logger.info("Domain {} is eligible for a screenshot", request.getDomainId());
|
||||
// may fail, we don't care about it
|
||||
requestedScreenshots.offer(new ScheduledScreenshot(request.getDomainId()));
|
||||
}
|
||||
else {
|
||||
logger.info("Domain {} is not eligible for a screenshot", request.getDomainId());
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to check domain eligibility", ex);
|
||||
}
|
||||
finally {
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
}
|
||||
else { // service is disabled
|
||||
responseObserver.onNext(Empty.getDefaultInstance());
|
||||
responseObserver.onCompleted();
|
||||
}
|
||||
}
|
||||
|
||||
class ScreenshotCaptureAgent implements Runnable {
|
||||
|
||||
// To prevent race conditions, we use this to lock domain ids that are being processed
|
||||
private static final ConcurrentHashMap<Integer, Boolean> domainIdsClaimed = new ConcurrentHashMap<>();
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
try (BrowserlessClient client = new BrowserlessClient(browserlessURI)) {
|
||||
while (true) {
|
||||
capture(client, requestedScreenshots.take());
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("Capture agent interrupted", e);
|
||||
Thread.currentThread().interrupt();
|
||||
} catch (Exception e) {
|
||||
logger.error("Capture agent failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void capture(BrowserlessClient client, ScheduledScreenshot scheduledScreenshot) {
|
||||
// Only one agent should capture a screenshot for a domain, so we skip if another agent has claimed it
|
||||
if (domainIdsClaimed.put(scheduledScreenshot.domainId(), Boolean.TRUE) != null) {
|
||||
return;
|
||||
}
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
// Double check if the domain is still eligible for a screenshot
|
||||
if (!ScreenshotDbOperations.isEligibleForScreengrab(conn, scheduledScreenshot.domainId)) {
|
||||
return;
|
||||
}
|
||||
|
||||
var domainNameOpt = ScreenshotDbOperations.getDomainName(conn, scheduledScreenshot.domainId());
|
||||
if (domainNameOpt.isEmpty()) {
|
||||
logger.error("Failed to get domain name for domain {}", scheduledScreenshot.domainId());
|
||||
}
|
||||
else {
|
||||
EdgeDomain domain = domainNameOpt.get();
|
||||
String domainNameStr = domain.toString();
|
||||
|
||||
if (!isValidDomainForCapture(domain)) {
|
||||
logger.error("Invalid domain name {}", domainNameStr);
|
||||
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
||||
}
|
||||
else {
|
||||
grab(client, conn, domain);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to check domain eligibility", ex);
|
||||
}
|
||||
finally {
|
||||
// Release the domain ID so that another agent can claim it
|
||||
// at this point we can assume the database will cover the
|
||||
// case where the domain is no longer eligible
|
||||
domainIdsClaimed.remove(scheduledScreenshot.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
private boolean isValidDomainForCapture(EdgeDomain domain) {
|
||||
String domainNameStr = domain.toString();
|
||||
String[] parts = domainNameStr.split("\\.");
|
||||
|
||||
if (parts.length < 2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (Arrays.stream(parts).allMatch(StringUtils::isNumeric)) {
|
||||
// IP address
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private void grab(BrowserlessClient client, Connection conn, EdgeDomain domain) {
|
||||
try {
|
||||
logger.info("Capturing {}", domain);
|
||||
|
||||
byte[] pngBytes = client.screenshot(domain.toRootUrlHttps().toString(),
|
||||
BrowserlessClient.GotoOptions.defaultValues(),
|
||||
BrowserlessClient.ScreenshotOptions.defaultValues());
|
||||
ScreenshotDbOperations.uploadScreenshot(conn, domain, pngBytes);
|
||||
} catch (Exception e) {
|
||||
ScreenshotDbOperations.flagDomainAsFetched(conn, domain);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
package nu.marginalia.livecapture;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.name.Names;
|
||||
|
||||
public class LivecaptureModule extends AbstractModule {
|
||||
public void configure() {
|
||||
bind(String.class)
|
||||
.annotatedWith(Names.named("browserless-uri"))
|
||||
.toInstance(System.getProperty("live-capture.browserless-uri", ""));
|
||||
bind(Integer.class)
|
||||
.annotatedWith(Names.named("browserless-agent-threads"))
|
||||
.toInstance(Integer.parseInt(System.getProperty("live-capture.browserless-agent-threads", "4")));
|
||||
}
|
||||
}
|
@ -0,0 +1,85 @@
|
||||
package nu.marginalia.livecapture;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Optional;
|
||||
|
||||
public class ScreenshotDbOperations {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ScreenshotDbOperations.class);
|
||||
|
||||
public synchronized static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE)
|
||||
VALUES (?, NOW())
|
||||
"""))
|
||||
{
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.executeUpdate();
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to flag domain as fetched", e);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] pngBytes) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA)
|
||||
VALUES (?,?,?)
|
||||
""");
|
||||
var is = new ByteArrayInputStream(pngBytes)
|
||||
) {
|
||||
stmt.setString(1, domain.toString());
|
||||
stmt.setString(2, "image/png");
|
||||
stmt.setBlob(3, is);
|
||||
stmt.executeUpdate();
|
||||
} catch (SQLException | IOException e) {
|
||||
logger.error("Failed to upload screenshot", e);
|
||||
}
|
||||
|
||||
flagDomainAsFetched(conn, domain);
|
||||
}
|
||||
|
||||
public static boolean isEligibleForScreengrab(Connection conn, int domainId) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
SELECT 1 FROM DATA_DOMAIN_HISTORY
|
||||
INNER JOIN WMSA_prod.EC_DOMAIN ON DATA_DOMAIN_HISTORY.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID = ?
|
||||
AND SCREENSHOT_DATE > DATE_SUB(NOW(), INTERVAL 1 MONTH)
|
||||
"""))
|
||||
{
|
||||
stmt.setInt(1, domainId);
|
||||
|
||||
try (var rs = stmt.executeQuery()) {
|
||||
return !rs.next();
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to check eligibility for screengrab", e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static Optional<EdgeDomain> getDomainName(Connection conn, int domainId) {
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_NAME FROM EC_DOMAIN WHERE ID = ?
|
||||
"""))
|
||||
{
|
||||
stmt.setInt(1, domainId);
|
||||
|
||||
try (var rs = stmt.executeQuery()) {
|
||||
if (rs.next()) {
|
||||
return Optional.of(rs.getString(1)).map(EdgeDomain::new);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to get domain name", ex);
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
@ -0,0 +1,37 @@
|
||||
package nu.marginalia.livecapture;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.testcontainers.containers.GenericContainer;
|
||||
import org.testcontainers.junit.jupiter.Testcontainers;
|
||||
import org.testcontainers.utility.DockerImageName;
|
||||
|
||||
import java.net.URI;
|
||||
|
||||
@Testcontainers
|
||||
public class BrowserlessClientTest {
|
||||
static GenericContainer<?> container = new GenericContainer<>(DockerImageName.parse("browserless/chrome")).withExposedPorts(3000);
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() {
|
||||
container.start();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testContent() throws Exception {
|
||||
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
|
||||
var content = client.content("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues());
|
||||
Assertions.assertNotNull(content, "Content should not be null");
|
||||
Assertions.assertFalse(content.isBlank(), "Content should not be empty");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testScreenshot() throws Exception {
|
||||
try (var client = new BrowserlessClient(URI.create("http://" + container.getHost() + ":" + container.getMappedPort(3000)))) {
|
||||
var screenshot = client.screenshot("https://www.marginalia.nu/", BrowserlessClient.GotoOptions.defaultValues(), BrowserlessClient.ScreenshotOptions.defaultValues());
|
||||
Assertions.assertNotNull(screenshot, "Screenshot should not be null");
|
||||
}
|
||||
}
|
||||
}
|
@ -42,6 +42,7 @@ dependencies {
|
||||
implementation project(':code:libraries:braille-block-punch-cards')
|
||||
implementation project(':code:libraries:term-frequency-dict')
|
||||
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:math:api')
|
||||
implementation project(':code:functions:domain-info:api')
|
||||
implementation project(':code:functions:search-query:api')
|
||||
|
@ -77,7 +77,7 @@ public class SearchBrowseService {
|
||||
if (resultDomain.isEmpty())
|
||||
continue;
|
||||
|
||||
results.add(new BrowseResult(resultDomain.get().toRootUrl(), sd.domainId(), 0, sd.screenshot()));
|
||||
results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
|
||||
}
|
||||
// shuffle the items for a less repetitive experience
|
||||
shuffle(neighbors);
|
||||
|
@ -4,6 +4,7 @@ import com.google.inject.Inject;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.feedlot.FeedlotClient;
|
||||
import nu.marginalia.feedlot.model.FeedItems;
|
||||
@ -37,6 +38,7 @@ public class SearchSiteInfoService {
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final FeedlotClient feedlotClient;
|
||||
private final LiveCaptureClient liveCaptureClient;
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
@Inject
|
||||
@ -46,6 +48,7 @@ public class SearchSiteInfoService {
|
||||
SearchFlagSiteService flagSiteService,
|
||||
DbDomainQueries domainQueries,
|
||||
FeedlotClient feedlotClient,
|
||||
LiveCaptureClient liveCaptureClient,
|
||||
ScreenshotService screenshotService) throws IOException
|
||||
{
|
||||
this.searchOperator = searchOperator;
|
||||
@ -56,6 +59,7 @@ public class SearchSiteInfoService {
|
||||
this.renderer = rendererFactory.renderer("search/site-info/site-info");
|
||||
|
||||
this.feedlotClient = feedlotClient;
|
||||
this.liveCaptureClient = liveCaptureClient;
|
||||
this.screenshotService = screenshotService;
|
||||
}
|
||||
|
||||
@ -165,7 +169,7 @@ public class SearchSiteInfoService {
|
||||
logger.debug("Failed to get feed items for {}: {}", domainName, e.getMessage());
|
||||
}
|
||||
|
||||
return new SiteInfoWithContext(domainName,
|
||||
var result = new SiteInfoWithContext(domainName,
|
||||
domainId,
|
||||
url,
|
||||
hasScreenshot,
|
||||
@ -175,6 +179,46 @@ public class SearchSiteInfoService {
|
||||
feedItems,
|
||||
sampleResults
|
||||
);
|
||||
|
||||
requestMissingScreenshots(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Request missing screenshots for the given site info */
|
||||
private void requestMissingScreenshots(SiteInfoWithContext result) {
|
||||
int requests = 0;
|
||||
if (!result.hasScreenshot()) {
|
||||
liveCaptureClient.requestScreengrab((int) result.domainId());
|
||||
requests++;
|
||||
}
|
||||
|
||||
if (result.similar() != null) {
|
||||
for (var similar : result.similar()) {
|
||||
if (similar.screenshot()) {
|
||||
continue;
|
||||
}
|
||||
if (++requests > 5) {
|
||||
break;
|
||||
}
|
||||
|
||||
liveCaptureClient.requestScreengrab(similar.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
if (result.linking() != null) {
|
||||
for (var linking : result.linking()) {
|
||||
if (linking.screenshot()) {
|
||||
continue;
|
||||
}
|
||||
if (++requests > 5) {
|
||||
break;
|
||||
}
|
||||
|
||||
liveCaptureClient.requestScreengrab(linking.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) {
|
||||
@ -233,7 +277,7 @@ public class SearchSiteInfoService {
|
||||
public record SiteInfoWithContext(Map<String, Boolean> view,
|
||||
Map<String, Boolean> domainState,
|
||||
String domain,
|
||||
long domainId,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
DomainInformation domainInformation,
|
||||
@ -243,7 +287,7 @@ public class SearchSiteInfoService {
|
||||
List<UrlDetails> samples
|
||||
) {
|
||||
public SiteInfoWithContext(String domain,
|
||||
long domainId,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
DomainInformation domainInformation,
|
||||
|
@ -25,6 +25,8 @@ apply from: "$rootProject.projectDir/docker.gradle"
|
||||
dependencies {
|
||||
implementation project(':third-party:symspell')
|
||||
|
||||
implementation project(':code:functions:live-capture')
|
||||
implementation project(':code:functions:live-capture:api')
|
||||
implementation project(':code:functions:math')
|
||||
implementation project(':code:functions:math:api')
|
||||
implementation project(':code:functions:domain-info')
|
||||
|
@ -3,13 +3,14 @@ package nu.marginalia.assistant;
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.livecapture.LivecaptureModule;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.server.Initialization;
|
||||
|
||||
public class AssistantMain extends MainClass {
|
||||
@ -25,6 +26,7 @@ public class AssistantMain extends MainClass {
|
||||
|
||||
Injector injector = Guice.createInjector(
|
||||
new AssistantModule(),
|
||||
new LivecaptureModule(),
|
||||
new ServiceConfigurationModule(ServiceId.Assistant),
|
||||
new ServiceDiscoveryModule(),
|
||||
new DatabaseModule(false)
|
||||
|
@ -6,10 +6,12 @@ import lombok.SneakyThrows;
|
||||
import nu.marginalia.assistant.suggest.Suggestions;
|
||||
import nu.marginalia.functions.domains.DomainInfoGrpcService;
|
||||
import nu.marginalia.functions.math.MathGrpcService;
|
||||
import nu.marginalia.livecapture.LiveCaptureGrpcService;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.server.*;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
@ -28,12 +30,11 @@ public class AssistantService extends Service {
|
||||
public AssistantService(BaseServiceParams params,
|
||||
ScreenshotService screenshotService,
|
||||
DomainInfoGrpcService domainInfoGrpcService,
|
||||
LiveCaptureGrpcService liveCaptureGrpcService,
|
||||
MathGrpcService mathGrpcService,
|
||||
Suggestions suggestions)
|
||||
{
|
||||
super(params,
|
||||
ServicePartition.any(),
|
||||
List.of(domainInfoGrpcService, mathGrpcService));
|
||||
super(params, ServicePartition.any(), List.of(domainInfoGrpcService, mathGrpcService, liveCaptureGrpcService));
|
||||
|
||||
this.suggestions = suggestions;
|
||||
|
||||
|
@ -96,7 +96,7 @@ public class ScreenshotCaptureToolMain {
|
||||
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
|
||||
try {
|
||||
Map<String, Object> requestData = Map.of(
|
||||
"url", domain.toRootUrl().toString(),
|
||||
"url", domain.toRootUrlHttps().toString(),
|
||||
"options",
|
||||
Map.of("fullPage", false,
|
||||
"type", "png"),
|
||||
|
@ -25,6 +25,9 @@ include 'code:functions:link-graph:api'
|
||||
include 'code:functions:search-query'
|
||||
include 'code:functions:search-query:api'
|
||||
|
||||
include 'code:functions:live-capture'
|
||||
include 'code:functions:live-capture:api'
|
||||
|
||||
include 'code:execution'
|
||||
include 'code:execution:api'
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user