diff --git a/code/functions/live-capture/build.gradle b/code/functions/live-capture/build.gradle
index 40e33e69..714684c9 100644
--- a/code/functions/live-capture/build.gradle
+++ b/code/functions/live-capture/build.gradle
@@ -24,6 +24,7 @@ dependencies {
 
     implementation project(':code:libraries:message-queue')
     implementation project(':code:execution:api')
+    implementation project(':code:processes:crawling-process:ft-content-type')
 
     implementation libs.jsoup
     implementation libs.rssreader
diff --git a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
index 646c2788..00f9663c 100644
--- a/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
+++ b/code/functions/live-capture/java/nu/marginalia/rss/svc/FeedFetcherService.java
@@ -5,6 +5,8 @@ import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.executor.client.ExecutorClient;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.nodecfg.NodeConfigurationService;
@@ -220,17 +222,23 @@ public class FeedFetcherService {
                 .GET()
                 .uri(uri)
                 .header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
+                .header("Accept-Encoding", "gzip")
                 .header("Accept", "text/*, */*;q=0.9")
                 .timeout(Duration.ofSeconds(15))
                 .build();
 
         for (int i = 0; i < 3; i++) {
-            var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofString());
+            var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
             if (429 == rs.statusCode()) {
                 int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
                 Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
             } else if (200 == rs.statusCode()) {
-                return new FetchResult.Success(rs.body());
+                byte[] responseData = getResponseData(rs);
+
+                String contentType = rs.headers().firstValue("Content-Type").orElse("");
+                String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
+
+                return new FetchResult.Success(bodyText);
             } else if (404 == rs.statusCode()) {
                 return new FetchResult.PermanentError(); // never try again
             } else {
@@ -245,6 +253,19 @@ public class FeedFetcherService {
         return new FetchResult.TransientError();
     }
 
+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     public sealed interface FetchResult {
         record Success(String value) implements FetchResult {}
         record TransientError() implements FetchResult {}
diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
index f8af9267..bb193e51 100644
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java
@@ -179,6 +179,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
                     EdgeDomain domain = new EdgeDomain(entry.getKey());
                     List<String> urls = entry.getValue();
 
+                    if (urls.isEmpty())
+                        continue;
+
                     fetcher.scheduleRetrieval(domain, urls);
                 }
             }
diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java
index 5253c042..8c11d1a0 100644
--- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java
+++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/SimpleLinkScraper.java
@@ -3,6 +3,8 @@ package nu.marginalia.livecrawler;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.logic.DomainLocks;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -16,6 +18,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.net.http.HttpClient;
@@ -27,6 +30,7 @@ import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPInputStream;
 
 /** A simple link scraper that fetches URLs and stores them in a database,
  * with no concept of a crawl frontier, WARC output, or other advanced features
@@ -128,6 +132,7 @@ public class SimpleLinkScraper implements AutoCloseable {
         var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
                 .GET()
                 .header("User-Agent", WmsaHome.getUserAgent().uaString())
+                .header("Accept-Encoding","gzip")
                 .timeout(readTimeout);
 
         // Fetch the robots.txt
@@ -135,9 +140,10 @@ public class SimpleLinkScraper implements AutoCloseable {
         try {
             SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
             HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
+
             if (robotsTxt.statusCode() == 200) {
                 return parser.parseContent(rootUrl.toString(),
-                        robotsTxt.body(),
+                        getResponseData(robotsTxt),
                         robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
                         WmsaHome.getUserAgent().uaIdentifier());
             }
@@ -161,18 +167,19 @@ public class SimpleLinkScraper implements AutoCloseable {
                 .GET()
                 .header("User-Agent", WmsaHome.getUserAgent().uaString())
                 .header("Accept", "text/html")
+                .header("Accept-Encoding", "gzip")
                 .timeout(readTimeout)
                 .build();
 
         try {
-            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
+            HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
 
             // Handle rate limiting by waiting and retrying once
             if (response.statusCode() == 429) {
                 timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
                         response.headers().firstValue("Retry-After").orElse("5")
                 ));
-                response = client.send(request, HttpResponse.BodyHandlers.ofString());
+                response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
             }
 
             String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
@@ -182,12 +189,14 @@ public class SimpleLinkScraper implements AutoCloseable {
                     return new FetchResult.Error(parsedUrl);
                 }
 
-                String body = response.body();
-                if (body.length() > 1024 * 1024) {
+                byte[] body = getResponseData(response);
+                if (body.length > 1024 * 1024) {
                     return new FetchResult.Error(parsedUrl);
                 }
 
-                return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
+                String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
+
+                return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
             }
         }
         catch (IOException ex) {
@@ -198,6 +207,19 @@ public class SimpleLinkScraper implements AutoCloseable {
         return new FetchResult.Error(parsedUrl);
     }
 
+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     sealed interface FetchResult {
         record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
         record Error(EdgeUrl url) implements FetchResult {}
diff --git a/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java
new file mode 100644
index 00000000..29b618ef
--- /dev/null
+++ b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/SimpleLinkScraperTest.java
@@ -0,0 +1,68 @@
+package nu.marginalia.livecrawler;
+
+import nu.marginalia.db.DomainBlacklistImpl;
+import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.List;
+
+class SimpleLinkScraperTest {
+    private Path tempDir;
+    private LiveCrawlDataSet dataSet;
+
+    @BeforeEach
+    public void setUp() throws IOException, SQLException {
+        tempDir = Files.createTempDirectory(getClass().getSimpleName());
+        dataSet = new LiveCrawlDataSet(tempDir);
+    }
+
+
+    @AfterEach
+    public void tearDown() throws Exception {
+        dataSet.close();
+        FileUtils.deleteDirectory(tempDir.toFile());
+    }
+
+    @Test
+    public void testRetrieveNow() throws Exception {
+        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
+        scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+
+        var streams = dataSet.getDataStreams();
+        Assertions.assertEquals(1, streams.size());
+
+        SerializableCrawlDataStream firstStream = streams.iterator().next();
+        Assertions.assertTrue(firstStream.hasNext());
+
+        if (firstStream.next() instanceof CrawledDomain domain) {
+            Assertions.assertEquals("www.marginalia.nu",domain.getDomain());
+        }
+        else {
+            Assertions.fail();
+        }
+
+        Assertions.assertTrue(firstStream.hasNext());
+
+        if ((firstStream.next() instanceof CrawledDocument document)) {
+            // verify we decompress the body string
+            Assertions.assertTrue(document.documentBody.startsWith("
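
For reference, the gist of the change can be exercised outside the codebase with nothing but the JDK. The sketch below is illustrative only (the class name and URL are made up, not part of the patch, and charset handling is simplified to UTF-8 where the patch uses DocumentBodyToString): it sends an Accept-Encoding: gzip request, reads the body as bytes, and inflates it when the server honoured the encoding, mirroring the getResponseData() helpers added above.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;

class GzipFetchSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();
        HttpRequest request = HttpRequest.newBuilder(URI.create("https://www.marginalia.nu/"))
                .GET()
                .header("Accept-Encoding", "gzip")   // ask for a compressed transfer
                .build();

        // Read raw bytes; BodyHandlers.ofString() would mangle a gzip-compressed body.
        HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

        byte[] body = decode(response);
        // Simplification: decode as UTF-8 here instead of sniffing the Content-Type charset.
        System.out.println(new String(body, StandardCharsets.UTF_8));
    }

    // Same shape as the getResponseData() helpers in the patch:
    // inflate gzip bodies, pass everything else through untouched.
    static byte[] decode(HttpResponse<byte[]> response) throws IOException {
        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
        if ("gzip".equals(encoding)) {
            try (var in = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
                return in.readAllBytes();
            }
        }
        return response.body();
    }
}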