(live-crawler) Add Accept-Encoding: gzip to outbound requests

This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, along with the corresponding logic for decoding the compressed response data.
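For context, the JDK's `java.net.http.HttpClient` does not decompress response bodies on its own, so advertising gzip means the client must also check `Content-Encoding` and decode the body itself. A minimal self-contained sketch of that pattern (the class and method names here are illustrative, not code from this commit):

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.zip.GZIPInputStream;

// Illustrative sketch only: ask for gzip, then decompress manually if the
// server actually used it, since java.net.http leaves Content-Encoding to us.
class GzipFetchSketch {
    static byte[] fetch(HttpClient client, String url) throws IOException, InterruptedException {
        HttpRequest request = HttpRequest.newBuilder(URI.create(url))
                .GET()
                .header("Accept-Encoding", "gzip") // opt in; the server may still send an uncompressed body
                .build();

        HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

        // Only decode when the server says the body is compressed
        if ("gzip".equals(response.headers().firstValue("Content-Encoding").orElse(""))) {
            try (var in = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
                return in.readAllBytes();
            }
        }
        return response.body();
    }
}
```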

The change addresses issue #136, save for making the fetcher's requests conditional.
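For reference, making those requests conditional (the part of #136 left out here) would mean replaying cache validators from the previous fetch so the server can answer 304 Not Modified. A hypothetical sketch, not the project's actual API:

```java
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.Optional;

// Hypothetical sketch of a conditional fetch; nothing below is in this commit.
class ConditionalFetchSketch {
    static HttpResponse<byte[]> fetch(HttpClient client, URI uri,
                                      Optional<String> etag, Optional<String> lastModified)
            throws IOException, InterruptedException {
        var builder = HttpRequest.newBuilder(uri)
                .GET()
                .header("Accept-Encoding", "gzip");

        // Replay validators saved from the previous fetch so the server
        // can answer 304 Not Modified instead of resending the full feed.
        etag.ifPresent(v -> builder.header("If-None-Match", v));
        lastModified.ifPresent(v -> builder.header("If-Modified-Since", v));

        // On a 304 the body is empty; the caller would reuse its cached copy.
        return client.send(builder.build(), HttpResponse.BodyHandlers.ofByteArray());
    }
}
```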
Viktor Lofgren 2024-12-27 03:59:34 +01:00
parent d968801dc1
commit 927bc0b63c
5 changed files with 123 additions and 8 deletions

build.gradle

@@ -24,6 +24,7 @@ dependencies {
     implementation project(':code:libraries:message-queue')
     implementation project(':code:execution:api')
+    implementation project(':code:processes:crawling-process:ft-content-type')

     implementation libs.jsoup
     implementation libs.rssreader

FeedFetcherService.java

@@ -5,6 +5,8 @@ import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.executor.client.ExecutorClient;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.nodecfg.NodeConfigurationService;
@@ -220,17 +222,23 @@ public class FeedFetcherService {
                     .GET()
                     .uri(uri)
                     .header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
+                    .header("Accept-Encoding", "gzip")
                     .header("Accept", "text/*, */*;q=0.9")
                     .timeout(Duration.ofSeconds(15))
                     .build();

             for (int i = 0; i < 3; i++) {
-                var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofString());
+                var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
                 if (429 == rs.statusCode()) {
                     int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
                     Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
                 } else if (200 == rs.statusCode()) {
-                    return new FetchResult.Success(rs.body());
+                    byte[] responseData = getResponseData(rs);
+
+                    String contentType = rs.headers().firstValue("Content-Type").orElse("");
+                    String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
+
+                    return new FetchResult.Success(bodyText);
                 } else if (404 == rs.statusCode()) {
                     return new FetchResult.PermanentError(); // never try again
                 } else {
@@ -245,6 +253,19 @@ public class FeedFetcherService {
             return new FetchResult.TransientError();
         }

+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     public sealed interface FetchResult {
         record Success(String value) implements FetchResult {}
         record TransientError() implements FetchResult {}

LiveCrawlerMain.java

@@ -179,6 +179,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
                         EdgeDomain domain = new EdgeDomain(entry.getKey());
                         List<String> urls = entry.getValue();

+                        if (urls.isEmpty())
+                            continue;
+
                         fetcher.scheduleRetrieval(domain, urls);
                     }
                 }

SimpleLinkScraper.java

@@ -3,6 +3,8 @@ package nu.marginalia.livecrawler;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.logic.DomainLocks;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -16,6 +18,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import javax.annotation.Nullable;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.net.http.HttpClient;
@@ -27,6 +30,7 @@ import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPInputStream;

 /** A simple link scraper that fetches URLs and stores them in a database,
  * with no concept of a crawl frontier, WARC output, or other advanced features
@@ -128,6 +132,7 @@ public class SimpleLinkScraper implements AutoCloseable {
         var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
                 .GET()
                 .header("User-Agent", WmsaHome.getUserAgent().uaString())
+                .header("Accept-Encoding","gzip")
                 .timeout(readTimeout);

         // Fetch the robots.txt
@@ -135,9 +140,10 @@ public class SimpleLinkScraper implements AutoCloseable {
         try {
             SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
             HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
+
             if (robotsTxt.statusCode() == 200) {
                 return parser.parseContent(rootUrl.toString(),
-                        robotsTxt.body(),
+                        getResponseData(robotsTxt),
                         robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
                         WmsaHome.getUserAgent().uaIdentifier());
             }
@@ -161,18 +167,19 @@ public class SimpleLinkScraper implements AutoCloseable {
                 .GET()
                 .header("User-Agent", WmsaHome.getUserAgent().uaString())
                 .header("Accept", "text/html")
+                .header("Accept-Encoding", "gzip")
                 .timeout(readTimeout)
                 .build();

         try {
-            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
+            HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

             // Handle rate limiting by waiting and retrying once
             if (response.statusCode() == 429) {
                 timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
                         response.headers().firstValue("Retry-After").orElse("5")
                 ));
-                response = client.send(request, HttpResponse.BodyHandlers.ofString());
+                response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
             }

             String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
@@ -182,12 +189,14 @@ public class SimpleLinkScraper implements AutoCloseable {
                 return new FetchResult.Error(parsedUrl);
             }

-            String body = response.body();
-            if (body.length() > 1024 * 1024) {
+            byte[] body = getResponseData(response);
+            if (body.length > 1024 * 1024) {
                 return new FetchResult.Error(parsedUrl);
             }

-            return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
+            String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
+
+            return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
         }
     }
     catch (IOException ex) {
@@ -198,6 +207,19 @@ public class SimpleLinkScraper implements AutoCloseable {
         return new FetchResult.Error(parsedUrl);
     }

+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     sealed interface FetchResult {
         record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
         record Error(EdgeUrl url) implements FetchResult {}

SimpleLinkScraperTest.java

@@ -0,0 +1,68 @@
+package nu.marginalia.livecrawler;
+
+import nu.marginalia.db.DomainBlacklistImpl;
+import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.List;
+
+class SimpleLinkScraperTest {
+    private Path tempDir;
+    private LiveCrawlDataSet dataSet;
+
+    @BeforeEach
+    public void setUp() throws IOException, SQLException {
+        tempDir = Files.createTempDirectory(getClass().getSimpleName());
+        dataSet = new LiveCrawlDataSet(tempDir);
+    }
+
+    @AfterEach
+    public void tearDown() throws Exception {
+        dataSet.close();
+        FileUtils.deleteDirectory(tempDir.toFile());
+    }
+
+    @Test
+    public void testRetrieveNow() throws Exception {
+        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
+        scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+
+        var streams = dataSet.getDataStreams();
+        Assertions.assertEquals(1, streams.size());
+
+        SerializableCrawlDataStream firstStream = streams.iterator().next();
+        Assertions.assertTrue(firstStream.hasNext());
+
+        if (firstStream.next() instanceof CrawledDomain domain) {
+            Assertions.assertEquals("www.marginalia.nu", domain.getDomain());
+        }
+        else {
+            Assertions.fail();
+        }
+
+        Assertions.assertTrue(firstStream.hasNext());
+
+        if ((firstStream.next() instanceof CrawledDocument document)) {
+            // verify we decompress the body string
+            Assertions.assertTrue(document.documentBody.startsWith("<!doctype"));
+        }
+        else {
+            Assertions.fail();
+        }
+
+        Assertions.assertFalse(firstStream.hasNext());
+    }
+}
}