(live-crawler) Keep track of bad URLs

To avoid hammering the same invalid URLs for up to two months, URLs that fail to fetch correctly are, on a dice roll, added to a bad URLs table that prevents further attempts at fetching them.
Viktor Lofgren 2024-11-22 00:55:46 +01:00
parent 4d23fe6261
commit 52eb5bc84f
3 changed files with 113 additions and 18 deletions
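
In practice the dice roll means a persistently broken URL is expected to be attempted about 1 / 0.2 = 5 times before it lands in the bad URLs table, while a one-off transient failure has an 80% chance of staying eligible for re-fetching. A minimal sketch of the roll, outside the diff itself (the 20% probability and ThreadLocalRandom usage mirror the committed code; the surrounding class is illustrative):

import java.util.concurrent.ThreadLocalRandom;

// Minimal sketch of the dice roll described above; the 20% probability and
// ThreadLocalRandom usage mirror the committed crawler code, the rest is illustrative.
class BadUrlDiceRollSketch {
    private static final double FLAG_PROBABILITY = 0.2;

    /** True if this failed fetch should flag the URL as bad. */
    static boolean shouldFlagAsBad() {
        return ThreadLocalRandom.current().nextDouble(0, 1) < FLAG_PROBABILITY;
    }

    public static void main(String[] args) {
        // A URL that always fails is expected to survive about 1 / 0.2 = 5 attempts
        // before being flagged; count the attempts for one simulated run.
        int attempts = 0;
        do { attempts++; } while (!shouldFlagAsBad());
        System.out.println("Flagged after " + attempts + " failed fetches");
    }
}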

LiveCrawlDataSet.java

@@ -34,6 +34,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
try (var stmt = connection.createStatement()) {
stmt.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY, domainId LONG, body BLOB, headers BLOB, ip TEXT, timestamp long)");
stmt.execute("CREATE INDEX IF NOT EXISTS domainIdIndex ON urls (domainId)");
stmt.execute("CREATE TABLE IF NOT EXISTS badUrls (url TEXT PRIMARY KEY, timestamp long)");
}
}
@@ -47,12 +48,24 @@ public class LiveCrawlDataSet implements AutoCloseable {
stmt.setLong(1, cutoff.toEpochMilli());
stmt.executeUpdate();
}
try (var stmt = connection.prepareStatement("DELETE FROM badUrls WHERE timestamp < ?")) {
stmt.setLong(1, cutoff.toEpochMilli());
stmt.executeUpdate();
}
}
/** Check if the given URL is already in the database */
public boolean hasUrl(String url) throws SQLException {
try (var stmt = connection.prepareStatement("SELECT 1 FROM urls WHERE url = ?")) {
try (var stmt = connection.prepareStatement("""
SELECT 1 FROM urls WHERE urls.url = ?
UNION
SELECT 1 FROM badUrls WHERE badUrls.url = ?
""");
) {
stmt.setString(1, url);
stmt.setString(2, url);
return stmt.executeQuery().next();
}
}
@@ -79,6 +92,22 @@ public class LiveCrawlDataSet implements AutoCloseable {
}
}
/** Flag a URL as bad, i.e. it should not be revisited */
public void flagAsBad(EdgeUrl url) {
try (var stmt = connection.prepareStatement("""
INSERT OR IGNORE INTO badUrls (url, timestamp)
VALUES (?, ?)
"""))
{
stmt.setString(1, url.toString());
stmt.setLong(2, Instant.now().toEpochMilli());
stmt.executeUpdate();
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
}
private byte[] compress(String data) throws IOException {
// gzip compression
try (var bos = new ByteArrayOutputStream();
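
Taken together, hasUrl() now answers "have we stored or given up on this URL?" in a single call, since the query unions the urls and badUrls tables. A hypothetical caller, not part of this commit, might combine the three methods like this (the hasUrl/saveDocument/flagAsBad signatures follow the diff above; LiveCrawlDataSet is assumed to be on the classpath):

import nu.marginalia.model.EdgeUrl;
import java.util.concurrent.ThreadLocalRandom;

// Illustrative caller only; the dataSet method signatures come from the diff above.
class LiveCrawlDataSetUsageSketch {
    void storeOrFlag(LiveCrawlDataSet dataSet, int domainId, EdgeUrl url,
                     String body, String headers) throws Exception {
        // hasUrl() matches both stored documents and flagged bad URLs
        if (dataSet.hasUrl(url.toString()))
            return;

        if (body != null) {
            dataSet.saveDocument(domainId, url, body, headers, "");
        }
        else if (ThreadLocalRandom.current().nextDouble(0, 1) < 0.2) {
            // Failed fetch: flag as bad only on a 20% dice roll
            dataSet.flagAsBad(url);
        }
    }
}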

SimpleLinkScraper.java

@@ -11,6 +11,8 @@ import nu.marginalia.link_parser.LinkParser;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
@@ -21,12 +23,15 @@ import java.net.http.HttpResponse;
import java.time.Duration;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
/** A simple link scraper that fetches URLs and stores them in a database,
* with no concept of a crawl frontier, WARC output, or other advanced features
*/
public class SimpleLinkScraper implements AutoCloseable {
private static final Logger logger = LoggerFactory.getLogger(SimpleLinkScraper.class);
private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("LiveCrawler", 32, 10);
private final LinkParser lp = new LinkParser();
private final LiveCrawlDataSet dataSet;
@@ -81,7 +86,24 @@ public class SimpleLinkScraper implements AutoCloseable {
continue;
}
fetchUrl(domainId, parsedUrl, timer, client);
switch (fetchUrl(domainId, parsedUrl, timer, client)) {
case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers)
-> dataSet.saveDocument(id, docUrl, body, headers, "");
case FetchResult.Error(EdgeUrl docUrl) ->
{
// To give bad URLs a chance to be re-fetched, we only flag them as bad
// with a 20% probability. This will prevent the same bad URL being
// re-fetched over and over again for several months, but still allow
// us to *mostly* re-fetch it if it was just a transient error.
// There's of course the chance we immediately flag it as bad on an
// unlucky roll, but you know, that's xcom baby
if (ThreadLocalRandom.current().nextDouble(0, 1) < 0.2) {
dataSet.flagAsBad(docUrl);
}
}
}
}
}
}
@@ -107,36 +129,56 @@ public class SimpleLinkScraper implements AutoCloseable {
return rules;
}
private void fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
/** Fetch a URL and store it in the database
*/
private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
timer.waitFetchDelay();
// Loop for HTTP 429 retries
for (int i = 0; i < 2; i++) {
HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
.GET()
.header("User-Agent", WmsaHome.getUserAgent().uaString())
.header("Accept", "text/html")
.timeout(readTimeout)
.build();
HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
.GET()
.header("User-Agent", WmsaHome.getUserAgent().uaString())
.header("Accept", "text/html")
.timeout(readTimeout)
.build();
try {
HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
// Handle rate limiting by waiting and retrying once
if (response.statusCode() == 429) {
timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
response.headers().firstValue("Retry-After").orElse("5")
));
continue;
response = client.send(request, HttpResponse.BodyHandlers.ofString());
}
String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
if (response.statusCode() == 200 && contentType.startsWith("text/html")) {
dataSet.saveDocument(domainId, parsedUrl, response.body(), headersToString(response.headers()), "");
}
if (response.statusCode() == 200) {
if (!contentType.toLowerCase().startsWith("text/html")) {
return new FetchResult.Error(parsedUrl);
}
break;
String body = response.body();
if (body.length() > 1024 * 1024) {
return new FetchResult.Error(parsedUrl);
}
return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
}
}
catch (IOException ex) {
// We don't want a full stack trace on every error, as it's quite common and very noisy
logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
}
return new FetchResult.Error(parsedUrl);
}
sealed interface FetchResult {
record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
record Error(EdgeUrl url) implements FetchResult {}
}
private String headersToString(HttpHeaders headers) {

LiveCrawlDataSetTest.java

@@ -18,8 +18,7 @@ public class LiveCrawlDataSetTest {
@Test
public void testGetDataSet() throws Exception {
Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
try {
LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir);
try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {
Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
dataSet.saveDocument(
@@ -65,4 +64,29 @@ public class LiveCrawlDataSetTest {
}
}
@Test
public void testHasUrl() throws Exception {
Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {
Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
dataSet.saveDocument(
1,
new EdgeUrl("https://www.example.com/saved"),
"test",
"test",
"test"
);
Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/saved"));
dataSet.flagAsBad(new EdgeUrl("https://www.example.com/bad"));
Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/bad"));
Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/notPresent"));
}
finally {
FileUtils.deleteDirectory(tempDir.toFile());
}
}
}