2023-03-06 17:32:13 +00:00
|
|
|
package nu.marginalia.screenshot;
|
2022-08-04 19:14:17 +00:00
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
import com.google.gson.Gson;
|
2022-08-04 19:14:17 +00:00
|
|
|
import com.zaxxer.hikari.HikariDataSource;
|
2023-03-04 12:19:01 +00:00
|
|
|
import nu.marginalia.model.EdgeDomain;
|
2023-11-01 15:38:55 +00:00
|
|
|
import nu.marginalia.model.gson.GsonFactory;
|
2023-03-04 12:19:01 +00:00
|
|
|
import nu.marginalia.service.module.DatabaseModule;
|
2023-04-16 06:55:32 +00:00
|
|
|
import org.openqa.selenium.chrome.ChromeDriverService;
|
2022-08-04 19:14:17 +00:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
import java.io.IOException;
|
2023-04-17 16:04:22 +00:00
|
|
|
import java.net.URI;
|
|
|
|
import java.net.http.HttpClient;
|
|
|
|
import java.net.http.HttpRequest;
|
|
|
|
import java.net.http.HttpResponse;
|
2022-08-04 19:14:17 +00:00
|
|
|
import java.sql.Connection;
|
|
|
|
import java.sql.SQLException;
|
|
|
|
import java.time.Duration;
|
2023-11-01 15:38:55 +00:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Collections;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.concurrent.TimeUnit;
|
2023-04-16 06:55:32 +00:00
|
|
|
|
2022-08-04 19:14:17 +00:00
|
|
|
public class ScreenshotCaptureToolMain {
|
|
|
|
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(ScreenshotCaptureToolMain.class);
|
|
|
|
|
|
|
|
public static void main(String[] args) {
|
2024-01-11 11:40:03 +00:00
|
|
|
DatabaseModule databaseModule = new DatabaseModule(false);
|
2022-08-04 19:14:17 +00:00
|
|
|
var ds = databaseModule.provideConnection();
|
|
|
|
|
2023-04-16 06:55:32 +00:00
|
|
|
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
|
|
|
|
|
2023-04-17 16:04:22 +00:00
|
|
|
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
|
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
HttpClient httpClient = HttpClient.newBuilder()
|
|
|
|
.version(HttpClient.Version.HTTP_1_1)
|
|
|
|
.connectTimeout(Duration.ofSeconds(30))
|
|
|
|
.build()
|
|
|
|
;
|
2022-08-04 19:14:17 +00:00
|
|
|
|
|
|
|
try (Connection conn = ds.getConnection()) {
|
2023-11-01 15:38:55 +00:00
|
|
|
for (var domain : crawlQueue) {
|
2022-08-04 19:14:17 +00:00
|
|
|
logger.info("Fetching {}", domain);
|
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
byte[] webpBytes = fetchDomain(httpClient, domain);
|
|
|
|
if (webpBytes != null) {
|
|
|
|
uploadScreenshot(conn, domain, webpBytes);
|
2023-04-17 16:04:22 +00:00
|
|
|
} else {
|
|
|
|
flagDomainAsFetched(conn, domain);
|
|
|
|
}
|
2022-08-04 19:14:17 +00:00
|
|
|
}
|
2023-04-17 16:04:22 +00:00
|
|
|
|
2022-08-04 19:14:17 +00:00
|
|
|
} catch (SQLException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private static void flagDomainAsFetched(Connection conn, EdgeDomain domain) {
|
2023-11-01 15:38:55 +00:00
|
|
|
try (var stmt = conn.prepareStatement("""
|
|
|
|
REPLACE INTO DATA_DOMAIN_HISTORY(DOMAIN_NAME, SCREENSHOT_DATE)
|
|
|
|
VALUES (?, NOW())
|
|
|
|
"""))
|
|
|
|
{
|
2022-08-04 19:14:17 +00:00
|
|
|
stmt.setString(1, domain.toString());
|
|
|
|
stmt.executeUpdate();
|
|
|
|
} catch (SQLException e) {
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
private static void uploadScreenshot(Connection conn, EdgeDomain domain, byte[] webpBytes) {
|
|
|
|
try (var stmt = conn.prepareStatement("""
|
|
|
|
REPLACE INTO DATA_DOMAIN_SCREENSHOT(DOMAIN_NAME, CONTENT_TYPE, DATA)
|
|
|
|
VALUES (?,?,?)
|
|
|
|
""");
|
|
|
|
var is = new ByteArrayInputStream(webpBytes)
|
2022-08-04 19:14:17 +00:00
|
|
|
) {
|
|
|
|
stmt.setString(1, domain.toString());
|
2023-11-01 15:38:55 +00:00
|
|
|
stmt.setString(2, "image/png");
|
2022-08-04 19:14:17 +00:00
|
|
|
stmt.setBlob(3, is);
|
|
|
|
stmt.executeUpdate();
|
|
|
|
} catch (SQLException | IOException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
|
|
|
|
flagDomainAsFetched(conn, domain);
|
|
|
|
}
|
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
private static Gson gson = GsonFactory.get();
|
2023-04-17 16:04:22 +00:00
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
private static byte[] fetchDomain(HttpClient client, EdgeDomain domain) {
|
2022-08-04 19:14:17 +00:00
|
|
|
try {
|
2023-11-01 15:38:55 +00:00
|
|
|
Map<String, Object> requestData = Map.of(
|
|
|
|
"url", domain.toRootUrl().toString(),
|
|
|
|
"options",
|
|
|
|
Map.of("fullPage", false,
|
|
|
|
"type", "png"),
|
|
|
|
"gotoOptions", Map.of(
|
|
|
|
"waitUntil", "networkidle2",
|
|
|
|
"timeout", TimeUnit.SECONDS.toMillis(10)
|
|
|
|
)
|
|
|
|
);
|
2022-08-04 19:14:17 +00:00
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
var request = HttpRequest.newBuilder()
|
|
|
|
.uri(new URI("http://browserless:3000/screenshot"))
|
|
|
|
.method("POST", HttpRequest.BodyPublishers.ofString(
|
|
|
|
gson.toJson(requestData)
|
|
|
|
))
|
|
|
|
.header("Content-type", "application/json")
|
|
|
|
.build();
|
|
|
|
var rsp = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
|
2023-04-16 06:55:32 +00:00
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
if (rsp.statusCode() >= 300) {
|
|
|
|
return null;
|
2023-04-16 06:55:32 +00:00
|
|
|
}
|
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
byte[] image = rsp.body();
|
|
|
|
if (image.length < 3500) {
|
|
|
|
logger.warn("Skipping {} due to size ({})", domain, image.length);
|
2023-04-17 16:04:22 +00:00
|
|
|
return null;
|
2022-08-04 19:14:17 +00:00
|
|
|
}
|
|
|
|
|
2023-11-01 15:38:55 +00:00
|
|
|
return image;
|
2022-08-04 19:14:17 +00:00
|
|
|
}
|
|
|
|
catch (Exception ex) {
|
2023-11-01 15:38:55 +00:00
|
|
|
logger.warn("Exception in screenshotting " + domain, ex);
|
2023-04-17 16:04:22 +00:00
|
|
|
return null;
|
2022-08-04 19:14:17 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private static List<EdgeDomain> fetchCrawlQueue(HikariDataSource ds, int queueSize) {
|
2023-11-01 15:38:55 +00:00
|
|
|
|
2022-08-04 19:14:17 +00:00
|
|
|
List<EdgeDomain> ret = new ArrayList<>(queueSize);
|
|
|
|
|
|
|
|
try (var conn = ds.getConnection(); var stmt = conn.createStatement()) {
|
|
|
|
var rsp = stmt.executeQuery(
|
|
|
|
"""
|
|
|
|
SELECT EC_DOMAIN.DOMAIN_NAME FROM EC_DOMAIN
|
|
|
|
LEFT JOIN DATA_DOMAIN_HISTORY ON EC_DOMAIN.DOMAIN_NAME=DATA_DOMAIN_HISTORY.DOMAIN_NAME
|
|
|
|
ORDER BY SCREENSHOT_DATE IS NULL DESC, SCREENSHOT_DATE, INDEXED DESC
|
2023-11-01 15:38:55 +00:00
|
|
|
LIMIT
|
2022-08-04 19:14:17 +00:00
|
|
|
""" + queueSize);
|
|
|
|
while (rsp.next()) {
|
|
|
|
ret.add(new EdgeDomain(rsp.getString(1)));
|
|
|
|
}
|
|
|
|
}
|
2023-11-01 15:38:55 +00:00
|
|
|
catch (Exception ex) {
|
|
|
|
logger.warn("Exception in fetching queue", ex);
|
2022-08-04 19:14:17 +00:00
|
|
|
return Collections.emptyList();
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|