mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
Improving screenshots capture bot.
This commit is contained in:
parent
fbbaf584ba
commit
4d298cd5fa
@ -117,7 +117,7 @@ public class ScreenshotService {
|
|||||||
style="font-size:32px;fill:#000000;font-family:monospace;"
|
style="font-size:32px;fill:#000000;font-family:monospace;"
|
||||||
x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
|
x="320" y="240" dominant-baseline="middle" text-anchor="middle">%s</text>
|
||||||
</g>
|
</g>
|
||||||
</svg>\n
|
</svg>
|
||||||
""".formatted(name);
|
""".formatted(name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,12 +16,18 @@ import org.slf4j.LoggerFactory;
|
|||||||
import javax.imageio.ImageIO;
|
import javax.imageio.ImageIO;
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.URI;
|
||||||
|
import java.net.http.HttpClient;
|
||||||
|
import java.net.http.HttpRequest;
|
||||||
|
import java.net.http.HttpResponse;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.Connection;
|
import java.sql.Connection;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.openqa.selenium.support.ui.ExpectedCondition;
|
import org.openqa.selenium.support.ui.ExpectedCondition;
|
||||||
import org.openqa.selenium.support.ui.WebDriverWait;
|
import org.openqa.selenium.support.ui.WebDriverWait;
|
||||||
@ -38,17 +44,34 @@ public class ScreenshotCaptureToolMain {
|
|||||||
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
|
System.setProperty(ChromeDriverService.CHROME_DRIVER_SILENT_OUTPUT_PROPERTY, "true");
|
||||||
|
|
||||||
ChromeDriver driver = initChromeDriver();
|
ChromeDriver driver = initChromeDriver();
|
||||||
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 100);
|
List<EdgeDomain> crawlQueue = fetchCrawlQueue(ds, 1000);
|
||||||
|
|
||||||
|
HttpClient httpClient = HttpClient.newHttpClient();
|
||||||
|
|
||||||
try (Connection conn = ds.getConnection()) {
|
try (Connection conn = ds.getConnection()) {
|
||||||
for (var domain : crawlQueue) {
|
|
||||||
|
|
||||||
|
logger.info("Probing domains");
|
||||||
|
var ret = crawlQueue.parallelStream().collect(Collectors.partitioningBy(domain -> probeUrl(httpClient, domain)));
|
||||||
|
|
||||||
|
var badDomains = ret.getOrDefault(Boolean.FALSE, Collections.emptyList());
|
||||||
|
var goodDomains = ret.getOrDefault(Boolean.TRUE, Collections.emptyList());
|
||||||
|
|
||||||
|
logger.info("Result: {} good domains, {} bad domains", goodDomains.size(), badDomains.size());
|
||||||
|
|
||||||
|
badDomains.forEach(domain -> flagDomainAsFetched(conn, domain));
|
||||||
|
|
||||||
|
for (var domain : goodDomains) {
|
||||||
logger.info("Fetching {}", domain);
|
logger.info("Fetching {}", domain);
|
||||||
|
|
||||||
fetchDomain(driver, domain)
|
var filePath = fetchDomain(driver, domain);
|
||||||
.ifPresentOrElse(
|
if (filePath != null) {
|
||||||
(path) -> uploadScreenshot(conn, domain, path),
|
uploadScreenshot(conn, domain, filePath);
|
||||||
() -> flagDomainAsFetched(conn, domain));
|
} else {
|
||||||
|
flagDomainAsFetched(conn, domain);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} catch (SQLException e) {
|
} catch (SQLException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
} finally {
|
} finally {
|
||||||
@ -110,7 +133,24 @@ public class ScreenshotCaptureToolMain {
|
|||||||
flagDomainAsFetched(conn, domain);
|
flagDomainAsFetched(conn, domain);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Optional<Path> fetchDomain(ChromeDriver driver, EdgeDomain domain) {
|
private static boolean probeUrl(HttpClient httpClient, EdgeDomain domain) {
|
||||||
|
try {
|
||||||
|
var request = HttpRequest.newBuilder()
|
||||||
|
.uri(new URI(domain.toRootUrl().toString()))
|
||||||
|
.timeout(Duration.ofSeconds(5))
|
||||||
|
.method("HEAD", HttpRequest.BodyPublishers.noBody())
|
||||||
|
.header("user-agent", "search.marginialia.nu")
|
||||||
|
.build();
|
||||||
|
|
||||||
|
var response = httpClient.send(request, HttpResponse.BodyHandlers.ofString());
|
||||||
|
|
||||||
|
return response.statusCode() < 400;
|
||||||
|
} catch (Exception ex) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
private static Path fetchDomain(ChromeDriver driver, EdgeDomain domain) {
|
||||||
try {
|
try {
|
||||||
driver.get(domain.toRootUrl().toString());
|
driver.get(domain.toRootUrl().toString());
|
||||||
|
|
||||||
@ -140,16 +180,16 @@ public class ScreenshotCaptureToolMain {
|
|||||||
ImageIO.write(img, "webp", destPath.toFile());
|
ImageIO.write(img, "webp", destPath.toFile());
|
||||||
|
|
||||||
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
|
// If the screenshot is very small by size, it's very likely not particularly interesting to look at
|
||||||
if (Files.size(destPath) < 2500) {
|
if (Files.size(destPath) < 3500) {
|
||||||
Files.delete(destPath);
|
Files.delete(destPath);
|
||||||
return Optional.empty();
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return Optional.of(destPath);
|
return destPath;
|
||||||
}
|
}
|
||||||
catch (Exception ex) {
|
catch (Exception ex) {
|
||||||
ex.printStackTrace();
|
ex.printStackTrace();
|
||||||
return Optional.empty();
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user