mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 04:58:59 +00:00
(crawler) Add favicon data to domain state db in its own table
This commit is contained in:
parent
8862100f7e
commit
98a340a0d1
@ -60,6 +60,8 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** A favicon as kept in the favicon table: the content type string together with the raw icon bytes.
 *  NOTE(review): imageData is an array, so the record's generated equals/hashCode compare it by reference
 *  rather than by content -- compare the bytes explicitly when checking equality. */
public record FaviconRecord(String contentType, byte[] imageData) {}
|
||||||
|
|
||||||
public DomainStateDb(Path filename) throws SQLException {
|
public DomainStateDb(Path filename) throws SQLException {
|
||||||
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||||
connection = DriverManager.getConnection(sqliteDbString);
|
connection = DriverManager.getConnection(sqliteDbString);
|
||||||
@ -74,7 +76,13 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
feedUrl TEXT
|
feedUrl TEXT
|
||||||
)
|
)
|
||||||
""");
|
""");
|
||||||
|
stmt.executeUpdate("""
|
||||||
|
CREATE TABLE IF NOT EXISTS favicon (
|
||||||
|
domain TEXT PRIMARY KEY,
|
||||||
|
contentType TEXT NOT NULL,
|
||||||
|
icon BLOB NOT NULL
|
||||||
|
)
|
||||||
|
""");
|
||||||
stmt.execute("PRAGMA journal_mode=WAL");
|
stmt.execute("PRAGMA journal_mode=WAL");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -85,6 +93,41 @@ public class DomainStateDb implements AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
||||||
|
try (var stmt = connection.prepareStatement("""
|
||||||
|
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
||||||
|
VALUES(?, ?, ?)
|
||||||
|
""")) {
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
stmt.setString(2, faviconRecord.contentType);
|
||||||
|
stmt.setBytes(3, faviconRecord.imageData);
|
||||||
|
stmt.executeUpdate();
|
||||||
|
}
|
||||||
|
catch (SQLException ex) {
|
||||||
|
logger.error("Failed to insert favicon", ex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public Optional<FaviconRecord> getIcon(String domain) {
|
||||||
|
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
||||||
|
stmt.setString(1, domain);
|
||||||
|
var rs = stmt.executeQuery();
|
||||||
|
|
||||||
|
if (rs.next()) {
|
||||||
|
return Optional.of(
|
||||||
|
new FaviconRecord(
|
||||||
|
rs.getString("contentType"),
|
||||||
|
rs.getBytes("icon")
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
} catch (SQLException e) {
|
||||||
|
logger.error("Failed to retrieve favicon", e);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
public void save(SummaryRecord record) {
|
public void save(SummaryRecord record) {
|
||||||
try (var stmt = connection.prepareStatement("""
|
try (var stmt = connection.prepareStatement("""
|
||||||
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||||
|
@ -19,7 +19,6 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||||
import nu.marginalia.model.body.HttpFetchResult;
|
import nu.marginalia.model.body.HttpFetchResult;
|
||||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||||
import org.jsoup.Jsoup;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -273,7 +272,16 @@ public class CrawlerRetreiver implements AutoCloseable {
|
|||||||
feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
|
feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
|
||||||
|
|
||||||
// Grab the favicon if it exists
|
// Grab the favicon if it exists
|
||||||
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
|
||||||
|
if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||||
|
String contentType = iconResult.header("Content-Type");
|
||||||
|
byte[] iconData = iconResult.getBodyBytes();
|
||||||
|
|
||||||
|
domainStateDb.saveIcon(
|
||||||
|
domain,
|
||||||
|
new DomainStateDb.FaviconRecord(contentType, iconData)
|
||||||
|
);
|
||||||
|
}
|
||||||
timer.waitFetchDelay(0);
|
timer.waitFetchDelay(0);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -12,6 +12,7 @@ import java.io.InputStream;
|
|||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
import java.net.http.HttpHeaders;
|
import java.net.http.HttpHeaders;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
||||||
@ -58,7 +59,7 @@ public sealed interface HttpFetchResult {
|
|||||||
int statusCode,
|
int statusCode,
|
||||||
HttpHeaders headers,
|
HttpHeaders headers,
|
||||||
String ipAddress,
|
String ipAddress,
|
||||||
byte[] bytesRaw,
|
byte[] bytesRaw, // raw data for the entire response including headers
|
||||||
int bytesStart,
|
int bytesStart,
|
||||||
int bytesLength
|
int bytesLength
|
||||||
) implements HttpFetchResult {
|
) implements HttpFetchResult {
|
||||||
@ -75,6 +76,12 @@ public sealed interface HttpFetchResult {
|
|||||||
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
|
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Copy the byte range corresponding to the payload of the response,
|
||||||
|
Warning: Copies the data, use getInputStream() for zero copy access */
|
||||||
|
public byte[] getBodyBytes() {
|
||||||
|
return Arrays.copyOfRange(bytesRaw, bytesStart, bytesStart + bytesLength);
|
||||||
|
}
|
||||||
|
|
||||||
public Optional<Document> parseDocument() {
|
public Optional<Document> parseDocument() {
|
||||||
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
|
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
|
||||||
if (contentType.is("text/html")) {
|
if (contentType.is("text/html")) {
|
||||||
|
@ -10,7 +10,7 @@ import java.nio.file.Path;
|
|||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.time.Instant;
|
import java.time.Instant;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
class DomainStateDbTest {
|
class DomainStateDbTest {
|
||||||
|
|
||||||
@ -26,7 +26,7 @@ class DomainStateDbTest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSunnyDay() throws SQLException {
|
public void testSummaryRecord() throws SQLException {
|
||||||
try (var db = new DomainStateDb(tempFile)) {
|
try (var db = new DomainStateDb(tempFile)) {
|
||||||
var allFields = new DomainStateDb.SummaryRecord(
|
var allFields = new DomainStateDb.SummaryRecord(
|
||||||
"all.marginalia.nu",
|
"all.marginalia.nu",
|
||||||
@ -63,4 +63,21 @@ class DomainStateDbTest {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFavicon() throws SQLException {
|
||||||
|
try (var db = new DomainStateDb(tempFile)) {
|
||||||
|
db.saveIcon("www.marginalia.nu", new DomainStateDb.FaviconRecord("text/plain", "hello world".getBytes()));
|
||||||
|
|
||||||
|
var maybeData = db.getIcon("www.marginalia.nu");
|
||||||
|
assertTrue(maybeData.isPresent());
|
||||||
|
var actualData = maybeData.get();
|
||||||
|
|
||||||
|
assertEquals("text/plain", actualData.contentType());
|
||||||
|
assertArrayEquals("hello world".getBytes(), actualData.imageData());
|
||||||
|
|
||||||
|
maybeData = db.getIcon("foobar");
|
||||||
|
assertTrue(maybeData.isEmpty());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user