diff --git a/ROADMAP.md b/ROADMAP.md index fc35e394..d41c1e1a 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -8,20 +8,10 @@ be implemented as well. Major goals: * Reach 1 billion pages indexed -* Improve technical ability of indexing and search. Although this area has improved a bit, the - search engine is still not very good at dealing with longer queries. -## Proper Position Index (COMPLETED 2024-09) -The search engine uses a fixed width bit mask to indicate word positions. It has the benefit -of being very fast to evaluate and works well for what it is, but is inaccurate and has the -drawback of making support for quoted search terms inaccurate and largely reliant on indexing -word n-grams known beforehand. This limits the ability to interpret longer queries. - -The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions -list, as is the civilized way of doing this. - -Completed with PR https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99 +* Improve technical ability of indexing and search. ~~Although this area has improved a bit, the + search engine is still not very good at dealing with longer queries.~~ (As of PR [#129](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/129), this has improved significantly. There is still more work to be done ) ## Hybridize crawler w/ Common Crawl data @@ -37,10 +27,15 @@ Retaining the ability to independently crawl the web is still strongly desirable ## Safe Search -The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable -to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) ) +The search engine has a bit of a problem showing spicy content mixed in with the results. It would be desirable to have a way to filter this out. It's likely something like a URL blacklist (e.g. [UT1](https://dsi.ut-capitole.fr/blacklists/index_en.php) ) combined with naive bayesian filter would go a long way, or something more sophisticated...? +## Web Design Overhaul + +The design is kinda clunky and hard to maintain, and needlessly outdated-looking. + +In progress: PR [#127](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/127) -- demo available at https://test.marginalia.nu/ + ## Additional Language Support It would be desirable if the search engine supported more languages than English. This is partially about @@ -49,15 +44,6 @@ associated with each language added, at least a models file or two, as well as s It would be very helpful to find a speaker of a large language other than English to help in the fine tuning. -## Finalize RSS support (COMPLETED 2024-11) - -Marginalia has experimental RSS preview support for a few domains. This works well and -it should be extended to all domains. It would also be interesting to offer search of the -RSS data itself, or use the RSS set to feed a special live index that updates faster than the -main dataset. - -Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) - ## Support for binary formats like PDF The crawler needs to be modified to retain them, and the conversion logic needs to parse them. @@ -74,5 +60,27 @@ This looks like a good idea that wouldn't just help clean up the search filters website, but might be cheap enough we might go as far as to offer a number of ad-hoc custom search filter for any API consumer. 
-I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, -which is quite ad-hoc, but instead to work together to find some new common description language for this. +I've talked to the stract dev and he does not think it's a good idea to mimic their optics language, which is quite ad-hoc, but instead to work together to find some new common description language for this. + +# Completed + +## Proper Position Index (COMPLETED 2024-09) + +The search engine uses a fixed width bit mask to indicate word positions. It has the benefit +of being very fast to evaluate and works well for what it is, but is inaccurate and has the +drawback of making support for quoted search terms inaccurate and largely reliant on indexing +word n-grams known beforehand. This limits the ability to interpret longer queries. + +The positions mask should be supplemented or replaced with a more accurate (e.g.) gamma coded positions +list, as is the civilized way of doing this. + +Completed with PR [#99](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/99) + +## Finalize RSS support (COMPLETED 2024-11) + +Marginalia has experimental RSS preview support for a few domains. This works well and +it should be extended to all domains. It would also be interesting to offer search of the +RSS data itself, or use the RSS set to feed a special live index that updates faster than the +main dataset. + +Completed with PR [#122](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/122) and PR [#125](https://github.com/MarginaliaSearch/MarginaliaSearch/pull/125) diff --git a/code/functions/domain-info/api/src/main/protobuf/domain-info.proto b/code/functions/domain-info/api/src/main/protobuf/domain-info.proto index 8be12d11..a4beeacd 100644 --- a/code/functions/domain-info/api/src/main/protobuf/domain-info.proto +++ b/code/functions/domain-info/api/src/main/protobuf/domain-info.proto @@ -101,6 +101,7 @@ message RpcSimilarDomain { bool active = 6; bool screenshot = 7; LINK_TYPE linkType = 8; + bool feed = 9; enum LINK_TYPE { BACKWARD = 0; diff --git a/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java b/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java index 38d07d0e..cc342530 100644 --- a/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java +++ b/code/functions/domain-info/java/nu/marginalia/functions/domains/SimilarDomainsService.java @@ -9,6 +9,7 @@ import gnu.trove.map.hash.TIntIntHashMap; import gnu.trove.set.TIntSet; import gnu.trove.set.hash.TIntHashSet; import it.unimi.dsi.fastutil.ints.Int2DoubleArrayMap; +import nu.marginalia.WmsaHome; import nu.marginalia.api.domains.RpcSimilarDomain; import nu.marginalia.api.domains.model.SimilarDomain; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; @@ -17,10 +18,14 @@ import org.roaringbitmap.RoaringBitmap; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.file.Path; +import java.sql.DriverManager; import java.sql.ResultSet; import java.sql.SQLException; import java.util.ArrayList; +import java.util.HashSet; import java.util.List; +import java.util.Set; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -32,12 +37,13 @@ public class SimilarDomainsService { private final HikariDataSource dataSource; private final AggregateLinkGraphClient linkGraphClient; - private volatile TIntIntHashMap domainIdToIdx 
= new TIntIntHashMap(100_000); + private final TIntIntHashMap domainIdToIdx = new TIntIntHashMap(100_000); private volatile int[] domainIdxToId; public volatile Int2DoubleArrayMap[] relatedDomains; public volatile TIntList[] domainNeighbors = null; public volatile RoaringBitmap screenshotDomains = null; + public volatile RoaringBitmap feedDomains = null; public volatile RoaringBitmap activeDomains = null; public volatile RoaringBitmap indexedDomains = null; public volatile TIntDoubleHashMap domainRanks = null; @@ -82,6 +88,7 @@ public class SimilarDomainsService { domainNames = new String[domainIdToIdx.size()]; domainNeighbors = new TIntList[domainIdToIdx.size()]; screenshotDomains = new RoaringBitmap(); + feedDomains = new RoaringBitmap(); activeDomains = new RoaringBitmap(); indexedDomains = new RoaringBitmap(); relatedDomains = new Int2DoubleArrayMap[domainIdToIdx.size()]; @@ -145,10 +152,12 @@ public class SimilarDomainsService { activeDomains.add(idx); } - updateScreenshotInfo(); - logger.info("Loaded {} domains", domainRanks.size()); isReady = true; + + // We can defer these as they only populate a roaringbitmap, and will degrade gracefully when not complete + updateScreenshotInfo(); + updateFeedInfo(); } } catch (SQLException throwables) { @@ -156,6 +165,41 @@ public class SimilarDomainsService { } } + private void updateFeedInfo() { + Set feedsDomainNames = new HashSet<>(500_000); + Path readerDbPath = WmsaHome.getDataPath().resolve("feeds.db").toAbsolutePath(); + String dbUrl = "jdbc:sqlite:" + readerDbPath; + + logger.info("Opening feed db at " + dbUrl); + + try (var conn = DriverManager.getConnection(dbUrl); + var stmt = conn.createStatement()) { + var rs = stmt.executeQuery(""" + select + json_extract(feed, '$.domain') as domain + from feed + """); + while (rs.next()) { + feedsDomainNames.add(rs.getString(1)); + } + } + catch (SQLException ex) { + // + } + + for (int idx = 0; idx < domainNames.length; idx++) { + String name = domainNames[idx]; + if (name == null) { + continue; + } + + if (feedsDomainNames.contains(name)) { + feedDomains.add(idx); + } + } + + } + private void updateScreenshotInfo() { try (var connection = dataSource.getConnection()) { try (var stmt = connection.createStatement()) { @@ -254,6 +298,7 @@ public class SimilarDomainsService { .setIndexed(indexedDomains.contains(idx)) .setActive(activeDomains.contains(idx)) .setScreenshot(screenshotDomains.contains(idx)) + .setFeed(feedDomains.contains(idx)) .setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name())) .build()); @@ -369,6 +414,7 @@ public class SimilarDomainsService { .setIndexed(indexedDomains.contains(idx)) .setActive(activeDomains.contains(idx)) .setScreenshot(screenshotDomains.contains(idx)) + .setFeed(feedDomains.contains(idx)) .setLinkType(RpcSimilarDomain.LINK_TYPE.valueOf(linkType.name())) .build()); diff --git a/code/functions/live-capture/build.gradle b/code/functions/live-capture/build.gradle index 40e33e69..714684c9 100644 --- a/code/functions/live-capture/build.gradle +++ b/code/functions/live-capture/build.gradle @@ -24,6 +24,7 @@ dependencies { implementation project(':code:libraries:message-queue') implementation project(':code:execution:api') + implementation project(':code:processes:crawling-process:ft-content-type') implementation libs.jsoup implementation libs.rssreader diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java index 0fb87e3c..530b81d6 100644 --- 
a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDb.java @@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedDefinition; import nu.marginalia.rss.model.FeedItems; import nu.marginalia.service.module.ServiceConfiguration; import org.jetbrains.annotations.NotNull; +import org.jetbrains.annotations.Nullable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -127,6 +128,26 @@ public class FeedDb { return FeedItems.none(); } + + @Nullable + public String getEtag(EdgeDomain domain) { + if (!feedDbEnabled) { + throw new IllegalStateException("Feed database is disabled on this node"); + } + + // Capture the current reader to avoid concurrency issues + FeedDbReader reader = this.reader; + try { + if (reader != null) { + return reader.getEtag(domain); + } + } + catch (Exception e) { + logger.error("Error getting etag for " + domain, e); + } + return null; + } + public Optional getFeedAsJson(String domain) { if (!feedDbEnabled) { throw new IllegalStateException("Feed database is disabled on this node"); @@ -214,7 +235,7 @@ public class FeedDb { public Instant getFetchTime() { if (!Files.exists(readerDbPath)) { - return Instant.ofEpochMilli(0); + return Instant.EPOCH; } try { @@ -224,7 +245,23 @@ public class FeedDb { } catch (IOException ex) { logger.error("Failed to read the creatiom time of {}", readerDbPath); - return Instant.ofEpochMilli(0); + return Instant.EPOCH; } } + + public boolean hasData() { + if (!feedDbEnabled) { + throw new IllegalStateException("Feed database is disabled on this node"); + } + + // Capture the current reader to avoid concurrency issues + FeedDbReader reader = this.reader; + + if (reader != null) { + return reader.hasData(); + } + + return false; + } + } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java index 9bb02acf..af4c5aa0 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbReader.java @@ -8,6 +8,7 @@ import nu.marginalia.rss.model.FeedItems; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nullable; import java.nio.file.Path; import java.sql.Connection; import java.sql.DriverManager; @@ -32,6 +33,7 @@ public class FeedDbReader implements AutoCloseable { try (var stmt = connection.createStatement()) { stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)"); stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)"); + stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)"); } } @@ -106,6 +108,22 @@ public class FeedDbReader implements AutoCloseable { return FeedItems.none(); } + @Nullable + public String getEtag(EdgeDomain domain) { + try (var stmt = connection.prepareStatement("SELECT etag FROM etags WHERE DOMAIN = ?")) { + stmt.setString(1, domain.toString()); + var rs = stmt.executeQuery(); + + if (rs.next()) { + return rs.getString(1); + } + } catch (SQLException e) { + logger.error("Error getting etag for " + domain, e); + } + + return null; + } + private FeedItems deserialize(String string) { return gson.fromJson(string, FeedItems.class); } @@ -141,4 +159,18 @@ public class FeedDbReader implements AutoCloseable { } + public boolean hasData() { + try (var stmt = connection.prepareStatement("SELECT 1 FROM feed LIMIT 1")) { 
+ var rs = stmt.executeQuery(); + if (rs.next()) { + return rs.getBoolean(1); + } + else { + return false; + } + } + catch (SQLException ex) { + return false; + } + } } diff --git a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java index bbd6354e..0ca561e2 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/db/FeedDbWriter.java @@ -20,6 +20,7 @@ public class FeedDbWriter implements AutoCloseable { private final Connection connection; private final PreparedStatement insertFeedStmt; private final PreparedStatement insertErrorStmt; + private final PreparedStatement insertEtagStmt; private final Path dbPath; private volatile boolean closed = false; @@ -34,10 +35,12 @@ public class FeedDbWriter implements AutoCloseable { try (var stmt = connection.createStatement()) { stmt.executeUpdate("CREATE TABLE IF NOT EXISTS feed (domain TEXT PRIMARY KEY, feed JSON)"); stmt.executeUpdate("CREATE TABLE IF NOT EXISTS errors (domain TEXT PRIMARY KEY, cnt INT DEFAULT 0)"); + stmt.executeUpdate("CREATE TABLE IF NOT EXISTS etags (domain TEXT PRIMARY KEY, etag TEXT)"); } insertFeedStmt = connection.prepareStatement("INSERT INTO feed (domain, feed) VALUES (?, ?)"); insertErrorStmt = connection.prepareStatement("INSERT INTO errors (domain, cnt) VALUES (?, ?)"); + insertEtagStmt = connection.prepareStatement("INSERT INTO etags (domain, etag) VALUES (?, ?)"); } public Path getDbPath() { @@ -56,6 +59,20 @@ public class FeedDbWriter implements AutoCloseable { } } + public synchronized void saveEtag(String domain, String etag) { + if (etag == null || etag.isBlank()) + return; + + try { + insertEtagStmt.setString(1, domain.toLowerCase()); + insertEtagStmt.setString(2, etag); + insertEtagStmt.executeUpdate(); + } + catch (SQLException e) { + logger.error("Error saving etag for " + domain, e); + } + } + public synchronized void setErrorCount(String domain, int count) { try { insertErrorStmt.setString(1, domain); diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java index 396fee1e..f6c971a2 100644 --- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java +++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java @@ -5,6 +5,8 @@ import com.apptasticsoftware.rssreader.RssReader; import com.google.inject.Inject; import com.opencsv.CSVReader; import nu.marginalia.WmsaHome; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.executor.client.ExecutorClient; import nu.marginalia.model.EdgeDomain; import nu.marginalia.nodecfg.NodeConfigurationService; @@ -32,9 +34,7 @@ import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.nio.charset.StandardCharsets; import java.sql.SQLException; -import java.time.Duration; -import java.time.LocalDateTime; -import java.time.ZonedDateTime; +import java.time.*; import java.time.format.DateTimeFormatter; import java.util.*; import java.util.concurrent.Executors; @@ -59,7 +59,6 @@ public class FeedFetcherService { private final DomainLocks domainLocks = new DomainLocks(); private volatile boolean updating; - private boolean deterministic = false; @Inject public FeedFetcherService(FeedDb feedDb, @@ -91,11 +90,6 @@ public class FeedFetcherService { 
REFRESH }; - /** Disable random-based heuristics. This is meant for testing */ - public void setDeterministic() { - this.deterministic = true; - } - public void updateFeeds(UpdateMode updateMode) throws IOException { if (updating) // Prevent concurrent updates { @@ -135,37 +129,37 @@ public class FeedFetcherService { for (var feed : definitions) { executor.submitQuietly(() -> { try { - var oldData = feedDb.getFeed(new EdgeDomain(feed.domain())); + EdgeDomain domain = new EdgeDomain(feed.domain()); + var oldData = feedDb.getFeed(domain); - // If we have existing data, we might skip updating it with a probability that increases with time, - // this is to avoid hammering the feeds that are updated very rarely and save some time and resources - // on our end + @Nullable + String ifModifiedSinceDate = switch(updateMode) { + case REFRESH -> getIfModifiedSinceDate(feedDb); + case CLEAN -> null; + }; - /* Disable for now: - - if (!oldData.isEmpty()) { - Duration duration = feed.durationSinceUpdated(); - long daysSinceUpdate = duration.toDays(); - - - if (deterministic || (daysSinceUpdate > 2 && ThreadLocalRandom.current() - .nextInt(1, 1 + (int) Math.min(10, daysSinceUpdate) / 2) > 1)) { - // Skip updating this feed, just write the old data back instead - writer.saveFeed(oldData); - return; - } - } - */ + @Nullable + String ifNoneMatchTag = switch (updateMode) { + case REFRESH -> feedDb.getEtag(domain); + case CLEAN -> null; + }; FetchResult feedData; try (DomainLocks.DomainLock domainLock = domainLocks.lockDomain(new EdgeDomain(feed.domain()))) { - feedData = fetchFeedData(feed, client); + feedData = fetchFeedData(feed, client, ifModifiedSinceDate, ifNoneMatchTag); } catch (Exception ex) { feedData = new FetchResult.TransientError(); } switch (feedData) { - case FetchResult.Success(String value) -> writer.saveFeed(parseFeed(value, feed)); + case FetchResult.Success(String value, String etag) -> { + writer.saveEtag(feed.domain(), etag); + writer.saveFeed(parseFeed(value, feed)); + } + case FetchResult.NotModified() -> { + writer.saveEtag(feed.domain(), ifNoneMatchTag); + writer.saveFeed(oldData); + } case FetchResult.TransientError() -> { int errorCount = errorCounts.getOrDefault(feed.domain().toLowerCase(), 0); writer.setErrorCount(feed.domain().toLowerCase(), ++errorCount); @@ -212,30 +206,73 @@ public class FeedFetcherService { } } - private FetchResult fetchFeedData(FeedDefinition feed, HttpClient client) { + @Nullable + static String getIfModifiedSinceDate(FeedDb feedDb) { + + // If the db is fresh, we don't send If-Modified-Since + if (!feedDb.hasData()) + return null; + + Instant cutoffInstant = feedDb.getFetchTime(); + + // If we're unable to establish fetch time, we don't send If-Modified-Since + if (cutoffInstant == Instant.EPOCH) + return null; + + return cutoffInstant.atZone(ZoneId.of("GMT")).format(DateTimeFormatter.RFC_1123_DATE_TIME); + } + + private FetchResult fetchFeedData(FeedDefinition feed, + HttpClient client, + @Nullable String ifModifiedSinceDate, + @Nullable String ifNoneMatchTag) + { try { URI uri = new URI(feed.feedUrl()); - HttpRequest getRequest = HttpRequest.newBuilder() + HttpRequest.Builder requestBuilder = HttpRequest.newBuilder() .GET() .uri(uri) .header("User-Agent", WmsaHome.getUserAgent().uaIdentifier()) + .header("Accept-Encoding", "gzip") .header("Accept", "text/*, */*;q=0.9") .timeout(Duration.ofSeconds(15)) - .build(); + ; + + if (ifModifiedSinceDate != null) { + requestBuilder.header("If-Modified-Since", ifModifiedSinceDate); + } + + if (ifNoneMatchTag != 
null) {
+                requestBuilder.header("If-None-Match", ifNoneMatchTag);
+            }
+
+            HttpRequest getRequest = requestBuilder.build();
 
             for (int i = 0; i < 3; i++) {
-                var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofString());
-                if (429 == rs.statusCode()) {
+                HttpResponse<byte[]> rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
+
+                if (rs.statusCode() == 429) { // Too Many Requests
                     int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
                     Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
-                } else if (200 == rs.statusCode()) {
-                    return new FetchResult.Success(rs.body());
-                } else if (404 == rs.statusCode()) {
-                    return new FetchResult.PermanentError(); // never try again
-                } else {
-                    return new FetchResult.TransientError(); // we try again in a few days
+                    continue;
                 }
+
+                String newEtagValue = rs.headers().firstValue("ETag").orElse("");
+
+                return switch (rs.statusCode()) {
+                    case 200 -> {
+                        byte[] responseData = getResponseData(rs);
+
+                        String contentType = rs.headers().firstValue("Content-Type").orElse("");
+                        String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
+
+                        yield new FetchResult.Success(bodyText, newEtagValue);
+                    }
+                    case 304 -> new FetchResult.NotModified(); // via If-Modified-Since semantics
+                    case 404 -> new FetchResult.PermanentError(); // never try again
+                    default -> new FetchResult.TransientError(); // we try again later
+                };
             }
         }
         catch (Exception ex) {
@@ -245,8 +282,22 @@
         return new FetchResult.TransientError();
     }
 
+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     public sealed interface FetchResult {
-        record Success(String value) implements FetchResult {}
+        record Success(String value, String etag) implements FetchResult {}
+        record NotModified() implements FetchResult {}
         record TransientError() implements FetchResult {}
         record PermanentError() implements FetchResult {}
     }
@@ -316,6 +367,8 @@
     public FeedItems parseFeed(String feedData, FeedDefinition definition) {
         try {
+            feedData = sanitizeEntities(feedData);
+
             List<Item> rawItems = rssReader.read(
                     // Massage the data to maximize the possibility of the flaky XML parser consuming it
                     new BOMInputStream(new ByteArrayInputStream(feedData.trim().getBytes(StandardCharsets.UTF_8)), false)
@@ -342,6 +395,33 @@
         }
     }
 
+    private static final Map<String, String> HTML_ENTITIES = Map.of(
+            "&raquo;", "»",
+            "&laquo;", "«",
+            "&mdash;", "--",
+            "&ndash;", "-",
+            "&rsquo;", "'",
+            "&lsquo;", "'",
+            "&quot;", "\"",
+            "&nbsp;", ""
+    );
+
+    /** The XML parser will blow up if you insert HTML entities in the feed XML,
+     * which is unfortunately relatively common.  Replace them as far as is possible
+     * with their corresponding characters
+     */
+    static String sanitizeEntities(String feedData) {
+        String result = feedData;
+        for (Map.Entry<String, String> entry : HTML_ENTITIES.entrySet()) {
+            result = result.replace(entry.getKey(), entry.getValue());
+        }
+
+        // Handle lone ampersands not part of a recognized XML entity
+        result = result.replaceAll("&(?!(amp|lt|gt|apos|quot);)", "&amp;");
+
+        return result;
+    }
+
     /** Decide whether to keep URI fragments in the feed items.
      *

     * We keep fragments if there are multiple different fragments in the items.
diff --git a/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java b/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java
index 88fb07cf..d5bf025e 100644
--- a/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java
+++ b/code/functions/live-capture/test/nu/marginalia/rss/svc/FeedFetcherServiceTest.java
@@ -96,10 +96,31 @@ class FeedFetcherServiceTest extends AbstractModule {
             feedDb.switchDb(writer);
         }
 
-        feedFetcherService.setDeterministic();
         feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
 
-        Assertions.assertFalse(feedDb.getFeed(new EdgeDomain("www.marginalia.nu")).isEmpty());
+        var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
+        System.out.println(result);
+        Assertions.assertFalse(result.isEmpty());
+    }
+
+    @Tag("flaky")
+    @Test
+    public void testFetchRepeatedly() throws Exception {
+        try (var writer = feedDb.createWriter()) {
+            writer.saveFeed(new FeedItems("www.marginalia.nu", "https://www.marginalia.nu/log/index.xml", "", List.of()));
+            feedDb.switchDb(writer);
+        }
+
+        feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
+        Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
+        feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
+        Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
+        feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
+        Assertions.assertNotNull(feedDb.getEtag(new EdgeDomain("www.marginalia.nu")));
+
+        var result = feedDb.getFeed(new EdgeDomain("www.marginalia.nu"));
+        System.out.println(result);
+        Assertions.assertFalse(result.isEmpty());
     }
 
     @Tag("flaky")
@@ -110,7 +131,6 @@
             feedDb.switchDb(writer);
         }
 
-        feedFetcherService.setDeterministic();
         feedFetcherService.updateFeeds(FeedFetcherService.UpdateMode.REFRESH);
 
         // We forget the feed on a 404 error
diff --git a/code/functions/live-capture/test/nu/marginalia/rss/svc/TestXmlSanitization.java b/code/functions/live-capture/test/nu/marginalia/rss/svc/TestXmlSanitization.java
new file mode 100644
index 00000000..8cdc5cd4
--- /dev/null
+++ b/code/functions/live-capture/test/nu/marginalia/rss/svc/TestXmlSanitization.java
@@ -0,0 +1,30 @@
+package nu.marginalia.rss.svc;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class TestXmlSanitization {
+
+    @Test
+    public void testPreservedEntities() {
+        Assertions.assertEquals("&amp;", FeedFetcherService.sanitizeEntities("&amp;"));
+        Assertions.assertEquals("&lt;", FeedFetcherService.sanitizeEntities("&lt;"));
+        Assertions.assertEquals("&gt;", FeedFetcherService.sanitizeEntities("&gt;"));
+        Assertions.assertEquals("&apos;", FeedFetcherService.sanitizeEntities("&apos;"));
+    }
+
+    @Test
+    public void testStrayAmpersand() {
+        Assertions.assertEquals("Bed &amp; Breakfast", FeedFetcherService.sanitizeEntities("Bed & Breakfast"));
+    }
+
+    @Test
+    public void testTranslatedHtmlEntity() {
+        Assertions.assertEquals("Foo -- Bar", FeedFetcherService.sanitizeEntities("Foo &mdash; Bar"));
+    }
+
+    @Test
+    public void testTranslatedHtmlEntityQuot() {
+        Assertions.assertEquals("\"Bob\"", FeedFetcherService.sanitizeEntities("&quot;Bob&quot;"));
+    }
+}
diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index b8d1f062..c36f410e 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -25,6 +25,7 @@ public class QueryExpansion { this::joinDashes, this::splitWordNum, this::joinTerms, + this::categoryKeywords, this::ngramAll ); @@ -98,6 +99,24 @@ public class QueryExpansion { } } + // Category keyword substitution, e.g. guitar wiki -> guitar generator:wiki + public void categoryKeywords(QWordGraph graph) { + + for (var qw : graph) { + + // Ensure we only perform the substitution on the last word in the query + if (!graph.getNextOriginal(qw).getFirst().isEnd()) { + continue; + } + + switch (qw.word()) { + case "recipe", "recipes" -> graph.addVariant(qw, "category:food"); + case "forum" -> graph.addVariant(qw, "generator:forum"); + case "wiki" -> graph.addVariant(qw, "generator:wiki"); + } + } + } + // Turn 'lawn chair' into 'lawnchair' public void joinTerms(QWordGraph graph) { QWord prev = null; diff --git a/code/functions/search-query/java/nu/marginalia/util/language/EnglishDictionary.java b/code/functions/search-query/java/nu/marginalia/util/language/EnglishDictionary.java deleted file mode 100644 index 56e90701..00000000 --- a/code/functions/search-query/java/nu/marginalia/util/language/EnglishDictionary.java +++ /dev/null @@ -1,165 +0,0 @@ -package nu.marginalia.util.language; - -import com.google.inject.Inject; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.InputStreamReader; -import java.util.*; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -public class EnglishDictionary { - private final Set englishWords = new HashSet<>(); - private final TermFrequencyDict tfDict; - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public EnglishDictionary(TermFrequencyDict tfDict) { - this.tfDict = tfDict; - try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-words"), - "Could not load word frequency table"); - var br = new BufferedReader(new InputStreamReader(resource)) - ) { - for (;;) { - String s = br.readLine(); - if (s == null) { - break; - } - englishWords.add(s.toLowerCase()); - } - } - catch (Exception ex) { - throw new RuntimeException(ex); - } - } - - public boolean isWord(String word) { - return englishWords.contains(word); - } - - private static final Pattern ingPattern = Pattern.compile(".*(\\w)\\1ing$"); - - public Collection getWordVariants(String s) { - var variants = findWordVariants(s); - - var ret = variants.stream() - .filter(var -> tfDict.getTermFreq(var) > 100) - .collect(Collectors.toList()); - - if (s.equals("recipe") || s.equals("recipes")) { - ret.add("category:food"); - } - - return ret; - } - - - public Collection findWordVariants(String s) { - int sl = s.length(); - - if (sl < 2) { - return Collections.emptyList(); - } - if (s.endsWith("s")) { - String a = s.substring(0, sl-1); - String b = s + "es"; - if (isWord(a) && isWord(b)) { - return List.of(a, b); - } - else if (isWord(a)) { - return List.of(a); - } - else if (isWord(b)) { - return List.of(b); - } - } - if (s.endsWith("sm")) { - String a = s.substring(0, sl-1)+"t"; - String b = s.substring(0, sl-1)+"ts"; - if (isWord(a) && 
isWord(b)) { - return List.of(a, b); - } - else if (isWord(a)) { - return List.of(a); - } - else if (isWord(b)) { - return List.of(b); - } - } - if (s.endsWith("st")) { - String a = s.substring(0, sl-1)+"m"; - String b = s + "s"; - if (isWord(a) && isWord(b)) { - return List.of(a, b); - } - else if (isWord(a)) { - return List.of(a); - } - else if (isWord(b)) { - return List.of(b); - } - } - else if (ingPattern.matcher(s).matches() && sl > 4) { // humming, clapping - var a = s.substring(0, sl-4); - var b = s.substring(0, sl-3) + "ed"; - - if (isWord(a) && isWord(b)) { - return List.of(a, b); - } - else if (isWord(a)) { - return List.of(a); - } - else if (isWord(b)) { - return List.of(b); - } - } - else { - String a = s + "s"; - String b = ingForm(s); - String c = s + "ed"; - - if (isWord(a) && isWord(b) && isWord(c)) { - return List.of(a, b, c); - } - else if (isWord(a) && isWord(b)) { - return List.of(a, b); - } - else if (isWord(b) && isWord(c)) { - return List.of(b, c); - } - else if (isWord(a) && isWord(c)) { - return List.of(a, c); - } - else if (isWord(a)) { - return List.of(a); - } - else if (isWord(b)) { - return List.of(b); - } - else if (isWord(c)) { - return List.of(c); - } - } - - return Collections.emptyList(); - } - - public String ingForm(String s) { - if (s.endsWith("t") && !s.endsWith("tt")) { - return s + "ting"; - } - if (s.endsWith("n") && !s.endsWith("nn")) { - return s + "ning"; - } - if (s.endsWith("m") && !s.endsWith("mm")) { - return s + "ming"; - } - if (s.endsWith("r") && !s.endsWith("rr")) { - return s + "ring"; - } - return s + "ing"; - } -} diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 0adb2f56..b94bf77d 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -12,6 +12,7 @@ import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; +import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -207,6 +208,17 @@ public class QueryFactoryTest { System.out.println(subquery); } + @Test + public void testExpansion9() { + var subquery = parseAndGetSpecs("pie recipe"); + + Assertions.assertTrue(subquery.query.compiledQuery.contains(" category:food ")); + + subquery = parseAndGetSpecs("recipe pie"); + + Assertions.assertFalse(subquery.query.compiledQuery.contains(" category:food ")); + } + @Test public void testParsing() { var subquery = parseAndGetSpecs("strlen()"); diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java index dcb802b7..2cad98fc 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeWriterTest.java @@ -85,7 +85,7 @@ class BTreeWriterTest { public void testWriteEntrySize2() throws IOException { BTreeContext ctx = new BTreeContext(4, 2, BTreeBlockSize.BS_64); - var tempFile = Files.createTempFile(Path.of("/tmp"), "tst", "dat"); + var tempFile = Files.createTempFile("tst", "dat"); int[] data = generateItems32(64); diff --git 
a/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java b/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java index 900464c5..21f7ab35 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/ProcessedDocumentDetails.java @@ -25,12 +25,11 @@ public class ProcessedDocumentDetails { public List linksInternal; public List linksExternal; - public List feedLinks; public DocumentMetadata metadata; public GeneratorType generator; public String toString() { - return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", feedLinks=" + this.feedLinks + ", metadata=" + this.metadata + ", generator=" + this.generator + ")"; + return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", metadata=" + this.metadata + ", generator=" + this.generator + ")"; } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java index 3f706caa..c543e0a8 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/LinkProcessor.java @@ -34,7 +34,6 @@ public class LinkProcessor { ret.linksExternal = new ArrayList<>(); ret.linksInternal = new ArrayList<>(); - ret.feedLinks = new ArrayList<>(); } public Set getSeenUrls() { @@ -72,19 +71,6 @@ public class LinkProcessor { } } - /** Accepts a link as a feed link */ - public void acceptFeed(EdgeUrl link) { - if (!isLinkPermitted(link)) { - return; - } - - if (!seenUrls.add(link)) { - return; - } - - ret.feedLinks.add(link); - } - private boolean isLinkPermitted(EdgeUrl link) { if (!permittedSchemas.contains(link.proto.toLowerCase())) { return false; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 94056f03..bc51e472 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -294,11 +294,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin for (var meta : doc.select("meta[http-equiv=refresh]")) { linkParser.parseMetaRedirect(baseUrl, meta).ifPresent(lp::accept); } - for (var link : doc.select("link[rel=alternate]")) { - feedExtractor - .getFeedFromAlternateTag(baseUrl, link) - .ifPresent(lp::acceptFeed); - } 
words.addAllSyntheticTerms(FileLinks.createFileLinkKeywords(lp, domain)); words.addAllSyntheticTerms(FileLinks.createFileEndingKeywords(doc)); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index 2007a5ed..23f444a9 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -125,7 +125,6 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP /* These are assumed to be populated */ ret.linksInternal = new ArrayList<>(); ret.linksExternal = new ArrayList<>(); - ret.feedLinks = new ArrayList<>(); return new DetailsWithWords(ret, words); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index a0026949..432c0b75 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -166,7 +166,6 @@ public class StackexchangeSideloader implements SideloadSource { ret.details.length = 128; ret.details.standard = HtmlStandard.HTML5; - ret.details.feedLinks = List.of(); ret.details.linksExternal = List.of(); ret.details.linksInternal = List.of(); ret.state = UrlIndexingState.OK; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 06331612..2fef520d 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -178,7 +178,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter public void writeDomainData(ProcessedDomain domain) throws IOException { DomainMetadata metadata = DomainMetadata.from(domain); - List feeds = getFeedUrls(domain); domainWriter.write( new SlopDomainRecord( @@ -188,25 +187,11 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter metadata.visited(), Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(""), Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(""), - domain.ip, - feeds + domain.ip ) ); } - private List getFeedUrls(ProcessedDomain domain) { - var documents = domain.documents; - if (documents == null) - return List.of(); - - return documents.stream().map(doc -> doc.details) - .filter(Objects::nonNull) - .flatMap(dets -> dets.feedLinks.stream()) - .distinct() - .map(EdgeUrl::toString) - .toList(); - } - public void close() throws IOException { domainWriter.close(); documentWriter.close(); diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java index 820d0c7f..6f76c756 100644 --- 
a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java @@ -1,7 +1,6 @@ package nu.marginalia.model.processed; import nu.marginalia.slop.SlopTable; -import nu.marginalia.slop.column.array.ObjectArrayColumn; import nu.marginalia.slop.column.primitive.IntColumn; import nu.marginalia.slop.column.string.EnumColumn; import nu.marginalia.slop.column.string.TxtStringColumn; @@ -10,7 +9,6 @@ import nu.marginalia.slop.desc.StorageType; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; -import java.util.List; import java.util.function.Consumer; public record SlopDomainRecord( @@ -20,8 +18,7 @@ public record SlopDomainRecord( int visitedUrls, String state, String redirectDomain, - String ip, - List rssFeeds) + String ip) { public record DomainWithIpProjection( @@ -38,9 +35,6 @@ public record SlopDomainRecord( private static final IntColumn goodUrlsColumn = new IntColumn("goodUrls", StorageType.PLAIN); private static final IntColumn visitedUrlsColumn = new IntColumn("visitedUrls", StorageType.PLAIN); - private static final ObjectArrayColumn rssFeedsColumn = new TxtStringColumn("rssFeeds", StandardCharsets.UTF_8, StorageType.GZIP).asArray(); - - public static class DomainNameReader extends SlopTable { private final TxtStringColumn.Reader domainsReader; @@ -101,8 +95,6 @@ public record SlopDomainRecord( private final IntColumn.Reader goodUrlsReader; private final IntColumn.Reader visitedUrlsReader; - private final ObjectArrayColumn.Reader rssFeedsReader; - public Reader(SlopTable.Ref ref) throws IOException { super(ref); @@ -114,8 +106,6 @@ public record SlopDomainRecord( knownUrlsReader = knownUrlsColumn.open(this); goodUrlsReader = goodUrlsColumn.open(this); visitedUrlsReader = visitedUrlsColumn.open(this); - - rssFeedsReader = rssFeedsColumn.open(this); } public Reader(Path baseDir, int page) throws IOException { @@ -140,8 +130,7 @@ public record SlopDomainRecord( visitedUrlsReader.get(), statesReader.get(), redirectReader.get(), - ipReader.get(), - rssFeedsReader.get() + ipReader.get() ); } } @@ -156,8 +145,6 @@ public record SlopDomainRecord( private final IntColumn.Writer goodUrlsWriter; private final IntColumn.Writer visitedUrlsWriter; - private final ObjectArrayColumn.Writer rssFeedsWriter; - public Writer(Path baseDir, int page) throws IOException { super(baseDir, page); @@ -169,8 +156,6 @@ public record SlopDomainRecord( knownUrlsWriter = knownUrlsColumn.create(this); goodUrlsWriter = goodUrlsColumn.create(this); visitedUrlsWriter = visitedUrlsColumn.create(this); - - rssFeedsWriter = rssFeedsColumn.create(this); } public void write(SlopDomainRecord record) throws IOException { @@ -182,8 +167,6 @@ public record SlopDomainRecord( knownUrlsWriter.put(record.knownUrls()); goodUrlsWriter.put(record.goodUrls()); visitedUrlsWriter.put(record.visitedUrls()); - - rssFeedsWriter.put(record.rssFeeds()); } } } diff --git a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java index f4d7e0f0..6dc12cc0 100644 --- a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java +++ b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java @@ -9,7 +9,6 @@ import org.junit.jupiter.api.Test; 
import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -35,8 +34,7 @@ public class SlopDomainRecordTest { 1, 2, 3, "state", "redirectDomain", - "192.168.0.1", - List.of("rss1", "rss2") + "192.168.0.1" ); try (var writer = new SlopDomainRecord.Writer(testDir, 0)) { diff --git a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java index e32bdbf6..9f4fe2f9 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java @@ -7,6 +7,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.crawl.CrawlerMain; +import nu.marginalia.crawl.DomainStateDb; import nu.marginalia.crawl.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.fetcher.warc.WarcRecorder; @@ -46,6 +47,7 @@ public class CrawlingThenConvertingIntegrationTest { private Path fileName; private Path fileName2; + private Path dbTempFile; @BeforeAll public static void setUpAll() { @@ -63,16 +65,18 @@ public class CrawlingThenConvertingIntegrationTest { httpFetcher = new HttpFetcherImpl(WmsaHome.getUserAgent().uaString()); this.fileName = Files.createTempFile("crawling-then-converting", ".warc.gz"); this.fileName2 = Files.createTempFile("crawling-then-converting", ".warc.gz"); + this.dbTempFile = Files.createTempFile("domains", "db"); } @AfterEach public void tearDown() throws IOException { Files.deleteIfExists(fileName); Files.deleteIfExists(fileName2); + Files.deleteIfExists(dbTempFile); } @Test - public void testInvalidDomain() throws IOException { + public void testInvalidDomain() throws Exception { // Attempt to fetch an invalid domain var specs = new CrawlerMain.CrawlSpecRecord("invalid.invalid.invalid", 10); @@ -88,7 +92,7 @@ public class CrawlingThenConvertingIntegrationTest { } @Test - public void testRedirectingDomain() throws IOException { + public void testRedirectingDomain() throws Exception { // Attempt to fetch an invalid domain var specs = new CrawlerMain.CrawlSpecRecord("memex.marginalia.nu", 10); @@ -107,7 +111,7 @@ public class CrawlingThenConvertingIntegrationTest { } @Test - public void testBlockedDomain() throws IOException { + public void testBlockedDomain() throws Exception { // Attempt to fetch an invalid domain var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 10); @@ -124,7 +128,7 @@ public class CrawlingThenConvertingIntegrationTest { } @Test - public void crawlSunnyDay() throws IOException { + public void crawlSunnyDay() throws Exception { var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10); CrawledDomain domain = crawl(specs); @@ -157,7 +161,7 @@ public class CrawlingThenConvertingIntegrationTest { @Test - public void crawlContentTypes() throws IOException { + public void crawlContentTypes() throws Exception { var specs = new CrawlerMain.CrawlSpecRecord("www.marginalia.nu", 10, List.of( "https://www.marginalia.nu/sanic.png", @@ -195,7 +199,7 @@ public class CrawlingThenConvertingIntegrationTest { @Test - public void 
crawlRobotsTxt() throws IOException { + public void crawlRobotsTxt() throws Exception { var specs = new CrawlerMain.CrawlSpecRecord("search.marginalia.nu", 5, List.of("https://search.marginalia.nu/search?q=hello+world") ); @@ -235,15 +239,17 @@ public class CrawlingThenConvertingIntegrationTest { return null; // unreachable } } - private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws IOException { + private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs) throws Exception { return crawl(specs, domain -> true); } - private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate domainBlacklist) throws IOException { + private CrawledDomain crawl(CrawlerMain.CrawlSpecRecord specs, Predicate domainBlacklist) throws Exception { List data = new ArrayList<>(); - try (var recorder = new WarcRecorder(fileName)) { - new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).crawlDomain(); + try (var recorder = new WarcRecorder(fileName); + var db = new DomainStateDb(dbTempFile)) + { + new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, db, recorder).crawlDomain(); } CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain(), diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index e955f86c..fa7579f3 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -46,6 +46,8 @@ dependencies { implementation libs.notnull implementation libs.guava + implementation libs.sqlite + implementation dependencies.create(libs.guice.get()) { exclude group: 'com.google.guava' } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index e7fbe4f9..01204b24 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -241,6 +241,7 @@ public class CrawlerMain extends ProcessMainClass { // Set up the work log and the warc archiver so we can keep track of what we've done try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler.log")); + DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db")); WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir); AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(domainsToCrawl) ) { @@ -258,6 +259,7 @@ public class CrawlerMain extends ProcessMainClass { anchorTagsSource, outputDir, warcArchiver, + domainStateDb, workLog); if (pendingCrawlTasks.putIfAbsent(crawlSpec.domain(), task) == null) { @@ -299,11 +301,12 @@ public class CrawlerMain extends ProcessMainClass { heartbeat.start(); try (WorkLog workLog = new WorkLog(outputDir.resolve("crawler-" + targetDomainName.replace('/', '-') + ".log")); + DomainStateDb domainStateDb = new DomainStateDb(outputDir.resolve("domainstate.db")); WarcArchiverIf warcArchiver = warcArchiverFactory.get(outputDir); AnchorTagsSource anchorTagsSource = anchorTagsSourceFactory.create(List.of(new EdgeDomain(targetDomainName))) ) { var spec = new CrawlSpecRecord(targetDomainName, 1000, List.of()); - var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, workLog); + var task = new CrawlTask(spec, anchorTagsSource, outputDir, warcArchiver, domainStateDb, workLog); task.run(); } catch (Exception ex) { @@ -324,18 +327,21 @@ public class CrawlerMain extends ProcessMainClass { private 
final AnchorTagsSource anchorTagsSource; private final Path outputDir; private final WarcArchiverIf warcArchiver; + private final DomainStateDb domainStateDb; private final WorkLog workLog; CrawlTask(CrawlSpecRecord specification, AnchorTagsSource anchorTagsSource, Path outputDir, WarcArchiverIf warcArchiver, + DomainStateDb domainStateDb, WorkLog workLog) { this.specification = specification; this.anchorTagsSource = anchorTagsSource; this.outputDir = outputDir; this.warcArchiver = warcArchiver; + this.domainStateDb = domainStateDb; this.workLog = workLog; this.domain = specification.domain(); @@ -359,7 +365,7 @@ public class CrawlerMain extends ProcessMainClass { } try (var warcRecorder = new WarcRecorder(newWarcFile); // write to a temp file for now - var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, warcRecorder); + var retriever = new CrawlerRetreiver(fetcher, domainProber, specification, domainStateDb, warcRecorder); CrawlDataReference reference = getReference(); ) { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java new file mode 100644 index 00000000..0824c3fe --- /dev/null +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/DomainStateDb.java @@ -0,0 +1,127 @@ +package nu.marginalia.crawl; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.SQLException; +import java.time.Instant; +import java.util.Optional; + +/** Supplemental sqlite database for storing the summary of a crawl. + * One database exists per crawl data set. + * */ +public class DomainStateDb implements AutoCloseable { + + private static final Logger logger = LoggerFactory.getLogger(DomainStateDb.class); + + private final Connection connection; + + public record SummaryRecord( + String domainName, + Instant lastUpdated, + String state, + @Nullable String stateDesc, + @Nullable String feedUrl + ) + { + public static SummaryRecord forSuccess(String domainName) { + return new SummaryRecord(domainName, Instant.now(), "OK", null, null); + } + + public static SummaryRecord forSuccess(String domainName, String feedUrl) { + return new SummaryRecord(domainName, Instant.now(), "OK", null, feedUrl); + } + + public static SummaryRecord forError(String domainName, String state, String stateDesc) { + return new SummaryRecord(domainName, Instant.now(), state, stateDesc, null); + } + + public boolean equals(Object other) { + if (other == this) { + return true; + } + if (!(other instanceof SummaryRecord(String name, Instant updated, String state1, String desc, String url))) { + return false; + } + return domainName.equals(name) && + lastUpdated.toEpochMilli() == updated.toEpochMilli() && + state.equals(state1) && + (stateDesc == null ? desc == null : stateDesc.equals(desc)) && + (feedUrl == null ? 
url == null : feedUrl.equals(url)); + } + + public int hashCode() { + return domainName.hashCode() + Long.hashCode(lastUpdated.toEpochMilli()); + } + + } + + public DomainStateDb(Path filename) throws SQLException { + String sqliteDbString = "jdbc:sqlite:" + filename.toString(); + connection = DriverManager.getConnection(sqliteDbString); + + try (var stmt = connection.createStatement()) { + stmt.executeUpdate(""" + CREATE TABLE IF NOT EXISTS summary ( + domain TEXT PRIMARY KEY, + lastUpdatedEpochMs LONG NOT NULL, + state TEXT NOT NULL, + stateDesc TEXT, + feedUrl TEXT + ) + """); + + stmt.execute("PRAGMA journal_mode=WAL"); + } + } + + @Override + public void close() throws SQLException { + connection.close(); + } + + + public void save(SummaryRecord record) { + try (var stmt = connection.prepareStatement(""" + INSERT OR REPLACE INTO summary (domain, lastUpdatedEpochMs, state, stateDesc, feedUrl) + VALUES (?, ?, ?, ?, ?) + """)) { + stmt.setString(1, record.domainName()); + stmt.setLong(2, record.lastUpdated().toEpochMilli()); + stmt.setString(3, record.state()); + stmt.setString(4, record.stateDesc()); + stmt.setString(5, record.feedUrl()); + stmt.executeUpdate(); + } catch (SQLException e) { + logger.error("Failed to insert summary record", e); + } + } + + public Optional get(String domainName) { + try (var stmt = connection.prepareStatement(""" + SELECT domain, lastUpdatedEpochMs, state, stateDesc, feedUrl + FROM summary + WHERE domain = ? + """)) { + stmt.setString(1, domainName); + var rs = stmt.executeQuery(); + if (rs.next()) { + return Optional.of(new SummaryRecord( + rs.getString("domain"), + Instant.ofEpochMilli(rs.getLong("lastUpdatedEpochMs")), + rs.getString("state"), + rs.getString("stateDesc"), + rs.getString("feedUrl") + )); + } + } catch (SQLException e) { + logger.error("Failed to get summary record", e); + } + + return Optional.empty(); + } +} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java index c8cddc4e..29a2b101 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/ContentTags.java @@ -20,34 +20,11 @@ public record ContentTags(String etag, String lastMod) { public void paint(Request.Builder getBuilder) { if (etag != null) { - getBuilder.addHeader("If-None-Match", ifNoneMatch()); + getBuilder.addHeader("If-None-Match", etag); } if (lastMod != null) { - getBuilder.addHeader("If-Modified-Since", ifModifiedSince()); + getBuilder.addHeader("If-Modified-Since", lastMod); } } - - private String ifNoneMatch() { - // Remove the W/ prefix if it exists - - //'W/' (case-sensitive) indicates that a weak validator is used. Weak etags are - // easy to generate, but are far less useful for comparisons. Strong validators - // are ideal for comparisons but can be very difficult to generate efficiently. - // Weak ETag values of two representations of the same resources might be semantically - // equivalent, but not byte-for-byte identical. This means weak etags prevent caching - // when byte range requests are used, but strong etags mean range requests can - // still be cached. 
- // - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag - - if (null != etag && etag.startsWith("W/")) { - return etag.substring(2); - } else { - return etag; - } - } - - private String ifModifiedSince() { - return lastMod; - } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java index 62560d83..5ac9cf21 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/HttpFetcherImpl.java @@ -139,7 +139,7 @@ public class HttpFetcherImpl implements HttpFetcher { public ContentTypeProbeResult probeContentType(EdgeUrl url, WarcRecorder warcRecorder, ContentTags tags) throws RateLimitException { - if (tags.isEmpty()) { + if (tags.isEmpty() && contentTypeLogic.isUrlLikeBinary(url)) { var headBuilder = new Request.Builder().head() .addHeader("User-agent", userAgentString) .addHeader("Accept-Encoding", "gzip") diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java index 2c099e30..06ba3719 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java @@ -34,8 +34,9 @@ import java.util.*; public class WarcRecorder implements AutoCloseable { /** Maximum time we'll wait on a single request */ static final int MAX_TIME = 30_000; - /** Maximum (decompressed) size we'll fetch */ - static final int MAX_SIZE = 1024 * 1024 * 10; + + /** Maximum (decompressed) size we'll save */ + static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024); private final WarcWriter writer; private final Path warcFile; diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ace2059b..195088bb 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -4,6 +4,7 @@ import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.contenttype.ContentType; import nu.marginalia.crawl.CrawlerMain; +import nu.marginalia.crawl.DomainStateDb; import nu.marginalia.crawl.fetcher.ContentTags; import nu.marginalia.crawl.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; @@ -16,7 +17,9 @@ import nu.marginalia.ip_blocklist.UrlBlocklist; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.HttpFetchResult; +import nu.marginalia.model.crawldata.CrawlerDomainStatus; import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -46,6 +49,7 @@ public class CrawlerRetreiver implements AutoCloseable { private final DomainProber domainProber; private final DomainCrawlFrontier crawlFrontier; + private final DomainStateDb domainStateDb; private final WarcRecorder warcRecorder; private final CrawlerRevisitor crawlerRevisitor; @@ -55,8 +59,10 @@ public class CrawlerRetreiver implements 
AutoCloseable { public CrawlerRetreiver(HttpFetcher fetcher, DomainProber domainProber, CrawlerMain.CrawlSpecRecord specs, + DomainStateDb domainStateDb, WarcRecorder warcRecorder) { + this.domainStateDb = domainStateDb; this.warcRecorder = warcRecorder; this.fetcher = fetcher; this.domainProber = domainProber; @@ -90,8 +96,21 @@ public class CrawlerRetreiver implements AutoCloseable { try { // Do an initial domain probe to determine the root URL EdgeUrl rootUrl; - if (probeRootUrl() instanceof HttpFetcher.DomainProbeResult.Ok ok) rootUrl = ok.probedUrl(); - else return 1; + + var probeResult = probeRootUrl(); + switch (probeResult) { + case HttpFetcher.DomainProbeResult.Ok(EdgeUrl probedUrl) -> { + rootUrl = probedUrl; // Good track + } + case HttpFetcher.DomainProbeResult.Redirect(EdgeDomain domain1) -> { + domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, "Redirect", domain1.toString())); + return 1; + } + case HttpFetcher.DomainProbeResult.Error(CrawlerDomainStatus status, String desc) -> { + domainStateDb.save(DomainStateDb.SummaryRecord.forError(domain, status.toString(), desc)); + return 1; + } + } // Sleep after the initial probe, we don't have access to the robots.txt yet // so we don't know the crawl delay @@ -114,7 +133,8 @@ public class CrawlerRetreiver implements AutoCloseable { delayTimer.waitFetchDelay(0); // initial delay after robots.txt - sniffRootDocument(rootUrl, delayTimer); + DomainStateDb.SummaryRecord summaryRecord = sniffRootDocument(rootUrl, delayTimer); + domainStateDb.save(summaryRecord); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified if (crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer) > 0) { @@ -196,7 +216,9 @@ public class CrawlerRetreiver implements AutoCloseable { return domainProbeResult; } - private void sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) { + private DomainStateDb.SummaryRecord sniffRootDocument(EdgeUrl rootUrl, CrawlDelayTimer timer) { + Optional feedLink = Optional.empty(); + try { var url = rootUrl.withPathAndParam("/", null); @@ -204,11 +226,11 @@ public class CrawlerRetreiver implements AutoCloseable { timer.waitFetchDelay(0); if (!(result instanceof HttpFetchResult.ResultOk ok)) - return; + return DomainStateDb.SummaryRecord.forSuccess(domain); var optDoc = ok.parseDocument(); if (optDoc.isEmpty()) - return; + return DomainStateDb.SummaryRecord.forSuccess(domain); // Sniff the software based on the sample document var doc = optDoc.get(); @@ -216,7 +238,6 @@ public class CrawlerRetreiver implements AutoCloseable { crawlFrontier.enqueueLinksFromDocument(url, doc); EdgeUrl faviconUrl = url.withPathAndParam("/favicon.ico", null); - Optional sitemapUrl = Optional.empty(); for (var link : doc.getElementsByTag("link")) { String rel = link.attr("rel"); @@ -232,23 +253,33 @@ public class CrawlerRetreiver implements AutoCloseable { // Grab the RSS/Atom as a sitemap if it exists if (rel.equalsIgnoreCase("alternate") - && (type.equalsIgnoreCase("application/atom+xml") || type.equalsIgnoreCase("application/atomsvc+xml"))) { + && (type.equalsIgnoreCase("application/atom+xml") + || type.equalsIgnoreCase("application/atomsvc+xml") + || type.equalsIgnoreCase("application/rss+xml") + )) { String href = link.attr("href"); - sitemapUrl = linkParser.parseLink(url, href) - .filter(crawlFrontier::isSameDomain); + feedLink = linkParser.parseLink(url, href) + .filter(crawlFrontier::isSameDomain) + .map(EdgeUrl::toString); } } - // Download the sitemap if available 
exists - if (sitemapUrl.isPresent()) { - sitemapFetcher.downloadSitemaps(List.of(sitemapUrl.get())); + + if (feedLink.isEmpty()) { + feedLink = guessFeedUrl(timer); + } + + // Download the sitemap if available + if (feedLink.isPresent()) { + sitemapFetcher.downloadSitemaps(List.of(feedLink.get())); timer.waitFetchDelay(0); } // Grab the favicon if it exists fetchWithRetry(faviconUrl, timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()); timer.waitFetchDelay(0); + } catch (Exception ex) { logger.error("Error configuring link filter", ex); @@ -256,6 +287,74 @@ public class CrawlerRetreiver implements AutoCloseable { finally { crawlFrontier.addVisited(rootUrl); } + + if (feedLink.isPresent()) { + return DomainStateDb.SummaryRecord.forSuccess(domain, feedLink.get()); + } + else { + return DomainStateDb.SummaryRecord.forSuccess(domain); + } + } + + private final List likelyFeedEndpoints = List.of( + "rss.xml", + "atom.xml", + "feed.xml", + "index.xml", + "feed", + "rss", + "atom", + "feeds", + "blog/feed", + "blog/rss" + ); + + private Optional guessFeedUrl(CrawlDelayTimer timer) throws InterruptedException { + var oldDomainStateRecord = domainStateDb.get(domain); + + // If we are already aware of an old feed URL, then we can just revalidate it + if (oldDomainStateRecord.isPresent()) { + var oldRecord = oldDomainStateRecord.get(); + if (oldRecord.feedUrl() != null && validateFeedUrl(oldRecord.feedUrl(), timer)) { + return Optional.of(oldRecord.feedUrl()); + } + } + + for (String endpoint : likelyFeedEndpoints) { + String url = "https://" + domain + "/" + endpoint; + if (validateFeedUrl(url, timer)) { + return Optional.of(url); + } + } + + return Optional.empty(); + } + + private boolean validateFeedUrl(String url, CrawlDelayTimer timer) throws InterruptedException { + var parsedOpt = EdgeUrl.parse(url); + if (parsedOpt.isEmpty()) + return false; + + HttpFetchResult result = fetchWithRetry(parsedOpt.get(), timer, HttpFetcher.ProbeType.DISABLED, ContentTags.empty()); + timer.waitFetchDelay(0); + + if (!(result instanceof HttpFetchResult.ResultOk ok)) { + return false; + } + + // Extract the beginning of the + Optional bodyOpt = DocumentBodyExtractor.asString(ok).getBody(); + if (bodyOpt.isEmpty()) + return false; + String body = bodyOpt.get(); + body = body.substring(0, Math.min(128, body.length())).toLowerCase(); + + if (body.contains(" sitemaps = robotsRules.getSitemaps(); + List urls = robotsRules.getSitemaps(); - List urls = new ArrayList<>(sitemaps.size()); - if (!sitemaps.isEmpty()) { - for (var url : sitemaps) { - EdgeUrl.parse(url).ifPresent(urls::add); - } - } - else { - urls.add(rootUrl.withPathAndParam("/sitemap.xml", null)); + if (urls.isEmpty()) { + urls = List.of(rootUrl.withPathAndParam("/sitemap.xml", null).toString()); } downloadSitemaps(urls); } - public void downloadSitemaps(List urls) { + public void downloadSitemaps(List urls) { Set checkedSitemaps = new HashSet<>(); - for (var url : urls) { + for (var rawUrl : urls) { + Optional parsedUrl = EdgeUrl.parse(rawUrl); + if (parsedUrl.isEmpty()) { + continue; + } + + EdgeUrl url = parsedUrl.get(); + // Let's not download sitemaps from other domains for now if (!crawlFrontier.isSameDomain(url)) { continue; diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java index 1d7006d9..ab2a6624 100644 --- 
a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java @@ -1,11 +1,15 @@ package nu.marginalia.io; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; import nu.marginalia.model.crawldata.SerializableCrawlData; import org.jetbrains.annotations.Nullable; import java.io.IOException; import java.nio.file.Path; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; /** Closable iterator exceptional over serialized crawl data * The data may appear in any order, and the iterator must be closed. @@ -26,6 +30,37 @@ public interface SerializableCrawlDataStream extends AutoCloseable { @Nullable default Path path() { return null; } + /** For tests */ + default List asList() throws IOException { + List data = new ArrayList<>(); + while (hasNext()) { + data.add(next()); + } + return data; + } + + /** For tests */ + default List docsAsList() throws IOException { + List data = new ArrayList<>(); + while (hasNext()) { + if (next() instanceof CrawledDocument doc) { + data.add(doc); + } + } + return data; + } + + /** For tests */ + default List domainsAsList() throws IOException { + List data = new ArrayList<>(); + while (hasNext()) { + if (next() instanceof CrawledDomain domain) { + data.add(domain); + } + } + return data; + } + // Dummy iterator over nothing static SerializableCrawlDataStream empty() { return new SerializableCrawlDataStream() { diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java index c38bcb3b..8d33fe00 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java @@ -18,6 +18,7 @@ public class ContentTypeLogic { "application/xhtml", "application/xml", "application/atom+xml", + "application/atomsvc+xml", "application/rss+xml", "application/x-rss+xml", "application/rdf+xml", diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java index a29e7093..1248ecba 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java @@ -23,6 +23,10 @@ public sealed interface DocumentBodyResult { return mapper.apply(contentType, body); } + public Optional getBody() { + return Optional.of(body); + } + @Override public void ifPresent(ExConsumer consumer) throws Exception { consumer.accept(contentType, body); @@ -41,6 +45,11 @@ public sealed interface DocumentBodyResult { return (DocumentBodyResult) this; } + @Override + public Optional getBody() { + return Optional.empty(); + } + @Override public void ifPresent(ExConsumer consumer) throws Exception { } @@ -49,6 +58,7 @@ public sealed interface DocumentBodyResult { Optional mapOpt(BiFunction mapper); Optional flatMapOpt(BiFunction> mapper); DocumentBodyResult flatMap(BiFunction> mapper); + Optional getBody(); void ifPresent(ExConsumer consumer) throws Exception; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java 
b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java new file mode 100644 index 00000000..156f6f6d --- /dev/null +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/DomainStateDbTest.java @@ -0,0 +1,66 @@ +package nu.marginalia.crawl; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.time.Instant; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class DomainStateDbTest { + + Path tempFile; + @BeforeEach + void setUp() throws IOException { + tempFile = Files.createTempFile(getClass().getSimpleName(), ".db"); + } + + @AfterEach + void tearDown() throws IOException { + Files.deleteIfExists(tempFile); + } + + @Test + public void testSunnyDay() throws SQLException { + try (var db = new DomainStateDb(tempFile)) { + var allFields = new DomainStateDb.SummaryRecord( + "all.marginalia.nu", + Instant.now(), + "OK", + "Bad address", + "https://www.marginalia.nu/atom.xml" + ); + + var minFields = new DomainStateDb.SummaryRecord( + "min.marginalia.nu", + Instant.now(), + "OK", + null, + null + ); + + db.save(allFields); + db.save(minFields); + + assertEquals(allFields, db.get("all.marginalia.nu").orElseThrow()); + assertEquals(minFields, db.get("min.marginalia.nu").orElseThrow()); + + var updatedAllFields = new DomainStateDb.SummaryRecord( + "all.marginalia.nu", + Instant.now(), + "BAD", + null, + null + ); + + db.save(updatedAllFields); + assertEquals(updatedAllFields, db.get("all.marginalia.nu").orElseThrow()); + } + } + +} \ No newline at end of file diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java index 486a4550..300c1d0c 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/ContentTypeProberTest.java @@ -42,24 +42,24 @@ class ContentTypeProberTest { port = r.nextInt(10000) + 8000; server = HttpServer.create(new InetSocketAddress("127.0.0.1", port), 10); - server.createContext("/html", exchange -> { + server.createContext("/html.gz", exchange -> { exchange.getResponseHeaders().add("Content-Type", "text/html"); exchange.sendResponseHeaders(200, -1); exchange.close(); }); - server.createContext("/redir", exchange -> { - exchange.getResponseHeaders().add("Location", "/html"); + server.createContext("/redir.gz", exchange -> { + exchange.getResponseHeaders().add("Location", "/html.gz"); exchange.sendResponseHeaders(301, -1); exchange.close(); }); - server.createContext("/bin", exchange -> { + server.createContext("/bin.gz", exchange -> { exchange.getResponseHeaders().add("Content-Type", "application/binary"); exchange.sendResponseHeaders(200, -1); exchange.close(); }); - server.createContext("/timeout", exchange -> { + server.createContext("/timeout.gz", exchange -> { try { Thread.sleep(15_000); } catch (InterruptedException e) { @@ -73,10 +73,10 @@ class ContentTypeProberTest { server.start(); - htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html").get(); - binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin").get(); - timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout").get(); - 
htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir").get(); + htmlEndpoint = EdgeUrl.parse("http://localhost:" + port + "/html.gz").get(); + binaryEndpoint = EdgeUrl.parse("http://localhost:" + port + "/bin.gz").get(); + timeoutEndpoint = EdgeUrl.parse("http://localhost:" + port + "/timeout.gz").get(); + htmlRedirEndpoint = EdgeUrl.parse("http://localhost:" + port + "/redir.gz").get(); fetcher = new HttpFetcherImpl("test"); recorder = new WarcRecorder(warcFile); diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index ea9bcf60..aacc0e52 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -2,6 +2,7 @@ package nu.marginalia.crawling.retreival; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.CrawlerMain; +import nu.marginalia.crawl.DomainStateDb; import nu.marginalia.crawl.fetcher.ContentTags; import nu.marginalia.crawl.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; @@ -18,6 +19,7 @@ import nu.marginalia.model.crawldata.SerializableCrawlData; import nu.marginalia.test.CommonTestData; import okhttp3.Headers; import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.mockito.Mockito; import org.slf4j.Logger; @@ -25,6 +27,9 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -36,9 +41,14 @@ public class CrawlerMockFetcherTest { Map mockData = new HashMap<>(); HttpFetcher fetcherMock = new MockFetcher(); - + private Path dbTempFile; + @BeforeEach + public void setUp() throws IOException { + dbTempFile = Files.createTempFile("domains","db"); + } @AfterEach - public void tearDown() { + public void tearDown() throws IOException { + Files.deleteIfExists(dbTempFile); mockData.clear(); } @@ -66,15 +76,17 @@ public class CrawlerMockFetcherTest { } - void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException { - try (var recorder = new WarcRecorder()) { - new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, recorder) + void crawl(CrawlerMain.CrawlSpecRecord spec) throws IOException, SQLException { + try (var recorder = new WarcRecorder(); + var db = new DomainStateDb(dbTempFile) + ) { + new CrawlerRetreiver(fetcherMock, new DomainProber(d -> true), spec, db, recorder) .crawlDomain(); } } @Test - public void testLemmy() throws URISyntaxException, IOException { + public void testLemmy() throws Exception { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://startrek.website/"), "mock-crawl-data/lemmy/index.html"); @@ -85,7 +97,7 @@ public class CrawlerMockFetcherTest { } @Test - public void testMediawiki() throws URISyntaxException, IOException { + public void testMediawiki() throws Exception { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://en.wikipedia.org/"), "mock-crawl-data/mediawiki/index.html"); @@ -94,7 +106,7 @@ public class CrawlerMockFetcherTest { } @Test - public void testDiscourse() throws URISyntaxException, IOException { + public void testDiscourse() 
throws Exception { List out = new ArrayList<>(); registerUrlClasspathData(new EdgeUrl("https://community.tt-rss.org/"), "mock-crawl-data/discourse/index.html"); diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index edd0de78..01cf8339 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -4,6 +4,7 @@ import nu.marginalia.UserAgent; import nu.marginalia.WmsaHome; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.crawl.CrawlerMain; +import nu.marginalia.crawl.DomainStateDb; import nu.marginalia.crawl.fetcher.HttpFetcher; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.fetcher.warc.WarcRecorder; @@ -25,6 +26,7 @@ import java.io.RandomAccessFile; import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; +import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; @@ -39,11 +41,13 @@ class CrawlerRetreiverTest { Path tempFileWarc2; Path tempFileParquet2; Path tempFileWarc3; + Path tempFileDb; @BeforeEach public void setUp() throws IOException { httpFetcher = new HttpFetcherImpl("search.marginalia.nu; testing a bit :D"); tempFileParquet1 = Files.createTempFile("crawling-process", ".parquet"); tempFileParquet2 = Files.createTempFile("crawling-process", ".parquet"); + tempFileDb = Files.createTempFile("crawling-process", ".db"); } @@ -505,22 +509,26 @@ class CrawlerRetreiverTest { } private void doCrawlWithReferenceStream(CrawlerMain.CrawlSpecRecord specs, SerializableCrawlDataStream stream) { - try (var recorder = new WarcRecorder(tempFileWarc2)) { - new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder).crawlDomain(new DomainLinks(), + try (var recorder = new WarcRecorder(tempFileWarc2); + var db = new DomainStateDb(tempFileDb) + ) { + new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder).crawlDomain(new DomainLinks(), new CrawlDataReference(stream)); } - catch (IOException ex) { + catch (IOException | SQLException ex) { Assertions.fail(ex); } } @NotNull private DomainCrawlFrontier doCrawl(Path tempFileWarc1, CrawlerMain.CrawlSpecRecord specs) { - try (var recorder = new WarcRecorder(tempFileWarc1)) { - var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, recorder); + try (var recorder = new WarcRecorder(tempFileWarc1); + var db = new DomainStateDb(tempFileDb) + ) { + var crawler = new CrawlerRetreiver(httpFetcher, new DomainProber(d -> true), specs, db, recorder); crawler.crawlDomain(); return crawler.getCrawlFrontier(); - } catch (IOException ex) { + } catch (IOException| SQLException ex) { Assertions.fail(ex); return null; // unreachable } diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java index f8af9267..bb193e51 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java @@ -179,6 +179,9 @@ public class LiveCrawlerMain extends ProcessMainClass { EdgeDomain domain = new EdgeDomain(entry.getKey()); List urls = 
entry.getValue(); + if (urls.isEmpty()) + continue; + fetcher.scheduleRetrieval(domain, urls); } } diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java index 5253c042..48f379a3 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java @@ -3,6 +3,8 @@ package nu.marginalia.livecrawler; import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRulesParser; import nu.marginalia.WmsaHome; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.crawl.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.logic.DomainLocks; import nu.marginalia.crawl.retreival.CrawlDelayTimer; @@ -16,6 +18,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import javax.annotation.Nullable; +import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.URISyntaxException; import java.net.http.HttpClient; @@ -23,10 +26,12 @@ import java.net.http.HttpHeaders; import java.net.http.HttpRequest; import java.net.http.HttpResponse; import java.time.Duration; +import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; +import java.util.zip.GZIPInputStream; /** A simple link scraper that fetches URLs and stores them in a database, * with no concept of a crawl frontier, WARC output, or other advanced features @@ -43,6 +48,8 @@ public class SimpleLinkScraper implements AutoCloseable { private final Duration readTimeout = Duration.ofSeconds(10); private final DomainLocks domainLocks = new DomainLocks(); + private final static int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024); + public SimpleLinkScraper(LiveCrawlDataSet dataSet, DbDomainQueries domainQueries, DomainBlacklist domainBlacklist) { @@ -61,52 +68,68 @@ public class SimpleLinkScraper implements AutoCloseable { pool.submitQuietly(() -> retrieveNow(domain, id.getAsInt(), urls)); } - public void retrieveNow(EdgeDomain domain, int domainId, List urls) throws Exception { + public int retrieveNow(EdgeDomain domain, int domainId, List urls) throws Exception { + + EdgeUrl rootUrl = domain.toRootUrlHttps(); + + List relevantUrls = new ArrayList<>(); + + for (var url : urls) { + Optional optParsedUrl = lp.parseLink(rootUrl, url); + if (optParsedUrl.isEmpty()) { + continue; + } + if (dataSet.hasUrl(optParsedUrl.get())) { + continue; + } + relevantUrls.add(optParsedUrl.get()); + } + + if (relevantUrls.isEmpty()) { + return 0; + } + + int fetched = 0; + try (HttpClient client = HttpClient .newBuilder() .connectTimeout(connectTimeout) .followRedirects(HttpClient.Redirect.NEVER) .version(HttpClient.Version.HTTP_2) .build(); - DomainLocks.DomainLock lock = domainLocks.lockDomain(domain) // throttle concurrent access per domain; do not remove + // throttle concurrent access per domain; IDE will complain it's not used, but it holds a semaphore -- do not remove: + DomainLocks.DomainLock lock = domainLocks.lockDomain(domain) ) { - - EdgeUrl rootUrl = domain.toRootUrlHttps(); - SimpleRobotRules rules = fetchRobotsRules(rootUrl, client); if (rules == null) { // I/O error fetching robots.txt // If we can't fetch the robots.txt, - for (var url : urls) { - 
lp.parseLink(rootUrl, url).ifPresent(this::maybeFlagAsBad); + for (var url : relevantUrls) { + maybeFlagAsBad(url); } - return; + return fetched; } CrawlDelayTimer timer = new CrawlDelayTimer(rules.getCrawlDelay()); - for (var url : urls) { - Optional optParsedUrl = lp.parseLink(rootUrl, url); - if (optParsedUrl.isEmpty()) { - continue; - } - if (dataSet.hasUrl(optParsedUrl.get())) { - continue; - } + for (var parsedUrl : relevantUrls) { - EdgeUrl parsedUrl = optParsedUrl.get(); - if (!rules.isAllowed(url)) { + if (!rules.isAllowed(parsedUrl.toString())) { maybeFlagAsBad(parsedUrl); continue; } switch (fetchUrl(domainId, parsedUrl, timer, client)) { - case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) - -> dataSet.saveDocument(id, docUrl, body, headers, ""); + case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers) -> { + dataSet.saveDocument(id, docUrl, body, headers, ""); + fetched++; + } case FetchResult.Error(EdgeUrl docUrl) -> maybeFlagAsBad(docUrl); } } } + + return fetched; } private void maybeFlagAsBad(EdgeUrl url) { @@ -128,6 +151,7 @@ public class SimpleLinkScraper implements AutoCloseable { var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI()) .GET() .header("User-Agent", WmsaHome.getUserAgent().uaString()) + .header("Accept-Encoding","gzip") .timeout(readTimeout); // Fetch the robots.txt @@ -135,9 +159,10 @@ public class SimpleLinkScraper implements AutoCloseable { try { SimpleRobotRulesParser parser = new SimpleRobotRulesParser(); HttpResponse robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray()); + if (robotsTxt.statusCode() == 200) { return parser.parseContent(rootUrl.toString(), - robotsTxt.body(), + getResponseData(robotsTxt), robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"), WmsaHome.getUserAgent().uaIdentifier()); } @@ -161,18 +186,19 @@ public class SimpleLinkScraper implements AutoCloseable { .GET() .header("User-Agent", WmsaHome.getUserAgent().uaString()) .header("Accept", "text/html") + .header("Accept-Encoding", "gzip") .timeout(readTimeout) .build(); try { - HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofString()); + HttpResponse response = client.send(request, HttpResponse.BodyHandlers.ofByteArray()); // Handle rate limiting by waiting and retrying once if (response.statusCode() == 429) { timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException( response.headers().firstValue("Retry-After").orElse("5") )); - response = client.send(request, HttpResponse.BodyHandlers.ofString()); + response = client.send(request, HttpResponse.BodyHandlers.ofByteArray()); } String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase(); @@ -182,12 +208,14 @@ public class SimpleLinkScraper implements AutoCloseable { return new FetchResult.Error(parsedUrl); } - String body = response.body(); - if (body.length() > 1024 * 1024) { + byte[] body = getResponseData(response); + if (body.length > MAX_SIZE) { return new FetchResult.Error(parsedUrl); } - return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers())); + String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body); + + return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers())); } } catch (IOException ex) { @@ -198,6 +226,19 @@ public class SimpleLinkScraper implements AutoCloseable { return new FetchResult.Error(parsedUrl); } + 
private byte[] getResponseData(HttpResponse response) throws IOException { + String encoding = response.headers().firstValue("Content-Encoding").orElse(""); + + if ("gzip".equals(encoding)) { + try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) { + return stream.readAllBytes(); + } + } + else { + return response.body(); + } + } + sealed interface FetchResult { record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {} record Error(EdgeUrl url) implements FetchResult {} diff --git a/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java new file mode 100644 index 00000000..74aa51f2 --- /dev/null +++ b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java @@ -0,0 +1,66 @@ +package nu.marginalia.livecrawler; + +import nu.marginalia.db.DomainBlacklistImpl; +import nu.marginalia.io.SerializableCrawlDataStream; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; +import org.apache.commons.io.FileUtils; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.List; + +class SimpleLinkScraperTest { + private Path tempDir; + private LiveCrawlDataSet dataSet; + + @BeforeEach + public void setUp() throws IOException, SQLException { + tempDir = Files.createTempDirectory(getClass().getSimpleName()); + dataSet = new LiveCrawlDataSet(tempDir); + } + + + @AfterEach + public void tearDown() throws Exception { + dataSet.close(); + FileUtils.deleteDirectory(tempDir.toFile()); + } + + @Test + public void testRetrieveNow() throws Exception { + var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class)); + int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/")); + Assertions.assertEquals(1, fetched); + + var streams = dataSet.getDataStreams(); + Assertions.assertEquals(1, streams.size()); + + SerializableCrawlDataStream firstStream = streams.iterator().next(); + Assertions.assertTrue(firstStream.hasNext()); + + List documents = firstStream.docsAsList(); + Assertions.assertEquals(1, documents.size()); + Assertions.assertTrue(documents.getFirst().documentBody.startsWith("", "", "127.0.0.1"); + var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class)); + + // If the requested URL is already in the dataSet, we retrieveNow should shortcircuit and not fetch anything + int fetched = scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/")); + Assertions.assertEquals(0, fetched); + } +} \ No newline at end of file diff --git a/code/services-application/search-service/resources/templates/search/index/index-redesign.hdb b/code/services-application/search-service/resources/templates/search/index/index-redesign.hdb new file mode 100644 index 00000000..04b688c1 --- /dev/null +++ b/code/services-application/search-service/resources/templates/search/index/index-redesign.hdb @@ -0,0 +1,14 @@ +
+

Public Beta Available

+
+

+ A redesigned version of the search engine UI is available for beta testing. + Feel free to give it a spin; feedback is welcome! + The old one will also remain available if you hate it, + or have compatibility issues.

+

+ Try it out! +

+
+
\ No newline at end of file diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java index b484b50c..d2336c3e 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java @@ -1,39 +1,19 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.io.SerializableCrawlDataStream; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.tools.Experiment; -import java.util.Comparator; - public class SiteStatisticsExperiment extends Experiment { - private final DomainProcessor domainProcessor; - @Inject - public SiteStatisticsExperiment(DomainProcessor domainProcessor) { - this.domainProcessor = domainProcessor; + public SiteStatisticsExperiment() { } @Override public boolean process(SerializableCrawlDataStream stream) { - var ret = domainProcessor.fullProcessing(stream); - - ret.documents.stream() - .filter(ProcessedDocument::isProcessedFully) - .sorted(Comparator.comparing(doc -> doc.details.metadata.topology())) - .flatMap(doc -> doc.details.feedLinks.stream()) - .map(EdgeUrl::toString) - .min(Comparator.comparing(String::length)) - .ifPresent(url -> { - System.out.printf("\"%s\",\"%s\"\n", ret.domain, url); - }); - return true; } diff --git a/tools/deployment/deployment.py b/tools/deployment/deployment.py old mode 100644 new mode 100755 index f5233d61..deb7e4df --- a/tools/deployment/deployment.py +++ b/tools/deployment/deployment.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + from dataclasses import dataclass import subprocess, os from typing import List, Set, Dict, Optional @@ -220,6 +222,31 @@ def run_gradle_build(targets: str) -> None: if return_code != 0: raise BuildError(service, return_code) + +def find_free_tag() -> str: + cmd = ['git', 'tag'] + result = subprocess.run(cmd, capture_output=True, text=True) + + if result.returncode != 0: + raise RuntimeError(f"Git command failed: {result.stderr}") + + existing_tags = set(result.stdout.splitlines()) + + for i in range(1, 100000): + tag = f'deploy-{i:04d}' + if not tag in existing_tags: + return tag + raise RuntimeError(f"Failed to find a free deployment tag") + +def add_tags(tags: str) -> None: + new_tag = find_free_tag() + + cmd = ['git', 'tag', new_tag, '-am', tags] + result = subprocess.run(cmd) + + if result.returncode != 0: + raise RuntimeError(f"Git command failed: {result.stderr}") + # Example usage: if __name__ == '__main__': # Define service configuration @@ -293,7 +320,9 @@ if __name__ == '__main__': parser = argparse.ArgumentParser( prog='deployment.py', description='Continuous Deployment helper') + parser.add_argument('-v', '--verify', help='Verify the tags are valid, if present', action='store_true') + parser.add_argument('-a', '--add', help='Add the tags provided as a new deployment tag, usually combined with -t', action='store_true') parser.add_argument('-t', '--tag', help='Use the specified tag value instead of the head git tag starting with deploy-') args = parser.parse_args() @@ -314,7 +343,10 @@ if __name__ == '__main__': print("Services to build:", plan.services_to_build) print("Instances to deploy:", [container.name for container in plan.instances_to_deploy]) - if not 
args.verify: + if args.verify: + if args.add: + add_tags(args.tag) + else: print("\nExecution Plan:") build_and_deploy(plan, SERVICE_CONFIG)
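
For orientation, here is a rough usage sketch of the DomainStateDb introduced above, mirroring what DomainStateDbTest exercises; the temp-file location and domain names are illustrative only, not taken from the patch:

```java
import nu.marginalia.crawl.DomainStateDb;

import java.nio.file.Files;
import java.nio.file.Path;

class DomainStateDbUsageSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative location; in the crawler one database lives alongside each crawl data set
        Path dbFile = Files.createTempFile("domains", ".db");

        try (var db = new DomainStateDb(dbFile)) {
            // Record a successful crawl together with the feed URL sniffed from the root document
            db.save(DomainStateDb.SummaryRecord.forSuccess(
                    "www.example.com", "https://www.example.com/feed.xml"));

            // Record a failed domain probe for another domain
            db.save(DomainStateDb.SummaryRecord.forError(
                    "broken.example.com", "Redirect", "www.example.org"));

            // INSERT OR REPLACE semantics: saving the same domain again overwrites the earlier record
            db.get("www.example.com").ifPresent(record ->
                    System.out.println(record.domainName() + " -> " + record.feedUrl()));
        }
    }
}
```

The guessFeedUrl step in CrawlerRetreiver reads the previous crawl's record through the same get call, which is how a previously discovered feed URL can be revalidated instead of re-guessed on the next crawl.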
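
The WarcRecorder and SimpleLinkScraper hunks above both replace the hard-coded 10 MB limit with a crawler.maxFetchSize system property; a small sketch of how that value is read and overridden (the 20 MiB figure below is an arbitrary example):

```java
class FetchSizeConfigSketch {
    // Same pattern as in the patch: defaults to 10 MiB when the property is unset
    static final int MAX_SIZE = Integer.getInteger("crawler.maxFetchSize", 10 * 1024 * 1024);

    public static void main(String[] args) {
        // Launch the process with e.g. -Dcrawler.maxFetchSize=20971520 to raise the cap to 20 MiB
        System.out.println("Effective max fetch size: " + MAX_SIZE + " bytes");
    }
}
```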