mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 20:48:59 +00:00
(crawler) Add favicon data to domain state db in its own table
This commit is contained in:
parent
8862100f7e
commit
98a340a0d1
@ -60,6 +60,8 @@ public class DomainStateDb implements AutoCloseable {
|
||||
|
||||
}
|
||||
|
||||
/**
 * A favicon as stored in the domain state db.
 *
 * @param contentType value kept in the {@code contentType} column of the favicon table
 * @param imageData   raw bytes kept in the {@code icon} column of the favicon table
 */
public record FaviconRecord(
        String contentType,
        byte[] imageData
) {
}
|
||||
|
||||
public DomainStateDb(Path filename) throws SQLException {
|
||||
String sqliteDbString = "jdbc:sqlite:" + filename.toString();
|
||||
connection = DriverManager.getConnection(sqliteDbString);
|
||||
@ -74,7 +76,13 @@ public class DomainStateDb implements AutoCloseable {
|
||||
feedUrl TEXT
|
||||
)
|
||||
""");
|
||||
|
||||
stmt.executeUpdate("""
|
||||
CREATE TABLE IF NOT EXISTS favicon (
|
||||
domain TEXT PRIMARY KEY,
|
||||
contentType TEXT NOT NULL,
|
||||
icon BLOB NOT NULL
|
||||
)
|
||||
""");
|
||||
stmt.execute("PRAGMA journal_mode=WAL");
|
||||
}
|
||||
}
|
||||
@ -85,6 +93,41 @@ public class DomainStateDb implements AutoCloseable {
|
||||
}
|
||||
|
||||
|
||||
public void saveIcon(String domain, FaviconRecord faviconRecord) {
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO favicon (domain, contentType, icon)
|
||||
VALUES(?, ?, ?)
|
||||
""")) {
|
||||
stmt.setString(1, domain);
|
||||
stmt.setString(2, faviconRecord.contentType);
|
||||
stmt.setBytes(3, faviconRecord.imageData);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.error("Failed to insert favicon", ex);
|
||||
}
|
||||
}
|
||||
|
||||
public Optional<FaviconRecord> getIcon(String domain) {
|
||||
try (var stmt = connection.prepareStatement("SELECT contentType, icon FROM favicon WHERE DOMAIN = ?")) {
|
||||
stmt.setString(1, domain);
|
||||
var rs = stmt.executeQuery();
|
||||
|
||||
if (rs.next()) {
|
||||
return Optional.of(
|
||||
new FaviconRecord(
|
||||
rs.getString("contentType"),
|
||||
rs.getBytes("icon")
|
||||
)
|
||||
);
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
logger.error("Failed to retrieve favicon", e);
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
public void save(SummaryRecord record) {
|
||||
try (var stmt = connection.prepareStatement("""
|
||||
INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl)
|
||||
|
@ -19,7 +19,6 @@ import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.body.DocumentBodyExtractor;
|
||||
import nu.marginalia.model.body.HttpFetchResult;
|
||||
import nu.marginalia.model.crawldata.CrawlerDomainStatus;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -273,7 +272,16 @@ public class CrawlerRetreiver implements AutoCloseable {
|
||||
feedLink.ifPresent(s -> fetcher.fetchSitemapUrls(s, timer));
|
||||
|
||||
// Grab the favicon if it exists
|
||||
fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty());
|
||||
|
||||
if (fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()) instanceof HttpFetchResult.ResultOk iconResult) {
|
||||
String contentType = iconResult.header("Content-Type");
|
||||
byte[] iconData = iconResult.getBodyBytes();
|
||||
|
||||
domainStateDb.saveIcon(
|
||||
domain,
|
||||
new DomainStateDb.FaviconRecord(contentType, iconData)
|
||||
);
|
||||
}
|
||||
timer.waitFetchDelay(0);
|
||||
|
||||
}
|
||||
|
@ -12,6 +12,7 @@ import java.io.InputStream;
|
||||
import java.net.InetAddress;
|
||||
import java.net.URI;
|
||||
import java.net.http.HttpHeaders;
|
||||
import java.util.Arrays;
|
||||
import java.util.Optional;
|
||||
|
||||
/* FIXME: This interface has a very unfortunate name that is not very descriptive.
|
||||
@ -58,7 +59,7 @@ public sealed interface HttpFetchResult {
|
||||
int statusCode,
|
||||
HttpHeaders headers,
|
||||
String ipAddress,
|
||||
byte[] bytesRaw,
|
||||
byte[] bytesRaw, // raw data for the entire response including headers
|
||||
int bytesStart,
|
||||
int bytesLength
|
||||
) implements HttpFetchResult {
|
||||
@ -75,6 +76,12 @@ public sealed interface HttpFetchResult {
|
||||
return new ByteArrayInputStream(bytesRaw, bytesStart, bytesLength);
|
||||
}
|
||||
|
||||
/** Copy the byte range corresponding to the payload of the response,
|
||||
Warning: Copies the data, use getInputStream() for zero copy access */
|
||||
public byte[] getBodyBytes() {
|
||||
return Arrays.copyOfRange(bytesRaw, bytesStart, bytesStart + bytesLength);
|
||||
}
|
||||
|
||||
public Optional<Document> parseDocument() {
|
||||
return DocumentBodyExtractor.asString(this).flatMapOpt((contentType, body) -> {
|
||||
if (contentType.is("text/html")) {
|
||||
|
@ -10,7 +10,7 @@ import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.time.Instant;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class DomainStateDbTest {
|
||||
|
||||
@ -26,7 +26,7 @@ class DomainStateDbTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSunnyDay() throws SQLException {
|
||||
public void testSummaryRecord() throws SQLException {
|
||||
try (var db = new DomainStateDb(tempFile)) {
|
||||
var allFields = new DomainStateDb.SummaryRecord(
|
||||
"all.marginalia.nu",
|
||||
@ -63,4 +63,21 @@ class DomainStateDbTest {
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFavicon() throws SQLException {
|
||||
try (var db = new DomainStateDb(tempFile)) {
|
||||
db.saveIcon("www.marginalia.nu", new DomainStateDb.FaviconRecord("text/plain", "hello world".getBytes()));
|
||||
|
||||
var maybeData = db.getIcon("www.marginalia.nu");
|
||||
assertTrue(maybeData.isPresent());
|
||||
var actualData = maybeData.get();
|
||||
|
||||
assertEquals("text/plain", actualData.contentType());
|
||||
assertArrayEquals("hello world".getBytes(), actualData.imageData());
|
||||
|
||||
maybeData = db.getIcon("foobar");
|
||||
assertTrue(maybeData.isEmpty());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user