Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(live-crawler) Add Accept-Encoding: gzip to outbound requests
This change adds `Accept-Encoding: gzip` to all outbound requests from the live crawler and feed fetcher, and the corresponding decoding logic for the compressed response data. The change addresses issue #136, save for making the fetcher's requests conditional.
Commit: 927bc0b63c
Parent: d968801dc1
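The commit message above boils down to two pieces: advertise gzip support on the outbound request, and transparently decompress the response before any string handling. A minimal, self-contained sketch of that pattern with Java's built-in java.net.http.HttpClient follows; the class and helper names are illustrative only, the real code lives in FeedFetcherService and SimpleLinkScraper as shown in the hunks below.

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.net.URI;
    import java.net.http.HttpClient;
    import java.net.http.HttpRequest;
    import java.net.http.HttpResponse;
    import java.util.zip.GZIPInputStream;

    class GzipFetchSketch {
        /** Fetch a URI with gzip negotiation and return the decompressed body bytes.
         *  Illustrative sketch only; the real logic is in FeedFetcherService and SimpleLinkScraper. */
        static byte[] fetchCompressed(HttpClient client, URI uri) throws IOException, InterruptedException {
            var request = HttpRequest.newBuilder(uri)
                    .GET()
                    .header("Accept-Encoding", "gzip")   // advertise gzip support to the server
                    .build();

            // Keep the body as raw bytes -- it may be compressed
            var response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());

            // Only decompress if the server actually answered with gzip
            if ("gzip".equals(response.headers().firstValue("Content-Encoding").orElse(""))) {
                try (var gz = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
                    return gz.readAllBytes();
                }
            }
            return response.body();
        }
    }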
@@ -24,6 +24,7 @@ dependencies {
     implementation project(':code:libraries:message-queue')
 
     implementation project(':code:execution:api')
+    implementation project(':code:processes:crawling-process:ft-content-type')
 
     implementation libs.jsoup
     implementation libs.rssreader
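The new ':code:processes:crawling-process:ft-content-type' dependency supplies the nu.marginalia.contenttype.ContentType and DocumentBodyToString classes imported in the Java hunks below, which are used to turn the (possibly decompressed) raw response bytes back into a String, presumably using the charset information carried by the Content-Type header.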
@@ -5,6 +5,8 @@ import com.apptasticsoftware.rssreader.RssReader;
 import com.google.inject.Inject;
 import com.opencsv.CSVReader;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.executor.client.ExecutorClient;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.nodecfg.NodeConfigurationService;
@@ -220,17 +222,23 @@ public class FeedFetcherService {
                 .GET()
                 .uri(uri)
                 .header("User-Agent", WmsaHome.getUserAgent().uaIdentifier())
+                .header("Accept-Encoding", "gzip")
                 .header("Accept", "text/*, */*;q=0.9")
                 .timeout(Duration.ofSeconds(15))
                 .build();
 
         for (int i = 0; i < 3; i++) {
-            var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofString());
+            var rs = client.send(getRequest, HttpResponse.BodyHandlers.ofByteArray());
             if (429 == rs.statusCode()) {
                 int retryAfter = Integer.parseInt(rs.headers().firstValue("Retry-After").orElse("2"));
                 Thread.sleep(Duration.ofSeconds(Math.clamp(retryAfter, 1, 5)));
             } else if (200 == rs.statusCode()) {
-                return new FetchResult.Success(rs.body());
+                byte[] responseData = getResponseData(rs);
+
+                String contentType = rs.headers().firstValue("Content-Type").orElse("");
+                String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), responseData);
+
+                return new FetchResult.Success(bodyText);
             } else if (404 == rs.statusCode()) {
                 return new FetchResult.PermanentError(); // never try again
             } else {
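Note that the body handler switches from BodyHandlers.ofString() to BodyHandlers.ofByteArray(): a gzip-compressed body has to stay raw bytes until it has been decompressed, and only afterwards is it decoded to a String via DocumentBodyToString using the Content-Type header.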
@@ -245,6 +253,19 @@ public class FeedFetcherService {
         return new FetchResult.TransientError();
     }
 
+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     public sealed interface FetchResult {
         record Success(String value) implements FetchResult {}
         record TransientError() implements FetchResult {}
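The getResponseData() helper above (and its twin added to SimpleLinkScraper further down) only decompresses when the server actually sets Content-Encoding: gzip. A quick way to sanity-check that decode path in isolation is a gzip round trip; this is a hypothetical standalone snippet, not part of the commit:

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.nio.charset.StandardCharsets;
    import java.util.zip.GZIPInputStream;
    import java.util.zip.GZIPOutputStream;

    class GzipRoundTrip {
        public static void main(String[] args) throws Exception {
            byte[] original = "<!doctype html><html></html>".getBytes(StandardCharsets.UTF_8);

            // Compress, playing the role of a server that honours Accept-Encoding: gzip
            var compressed = new ByteArrayOutputStream();
            try (var gz = new GZIPOutputStream(compressed)) {
                gz.write(original);
            }

            // Decompress the same way getResponseData() does
            byte[] decoded;
            try (var in = new GZIPInputStream(new ByteArrayInputStream(compressed.toByteArray()))) {
                decoded = in.readAllBytes();
            }

            String text = new String(decoded, StandardCharsets.UTF_8);
            if (!text.startsWith("<!doctype")) {
                throw new AssertionError("gzip round trip failed: " + text);
            }
            System.out.println("gzip round trip OK");
        }
    }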
@@ -179,6 +179,9 @@ public class LiveCrawlerMain extends ProcessMainClass {
                 EdgeDomain domain = new EdgeDomain(entry.getKey());
                 List<String> urls = entry.getValue();
 
+                if (urls.isEmpty())
+                    continue;
+
                 fetcher.scheduleRetrieval(domain, urls);
             }
         }
@@ -3,6 +3,8 @@ package nu.marginalia.livecrawler;
 import crawlercommons.robots.SimpleRobotRules;
 import crawlercommons.robots.SimpleRobotRulesParser;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.contenttype.ContentType;
+import nu.marginalia.contenttype.DocumentBodyToString;
 import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.logic.DomainLocks;
 import nu.marginalia.crawl.retreival.CrawlDelayTimer;
@@ -16,6 +18,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import javax.annotation.Nullable;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.net.http.HttpClient;
@@ -27,6 +30,7 @@ import java.util.List;
 import java.util.Optional;
 import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
+import java.util.zip.GZIPInputStream;
 
 /** A simple link scraper that fetches URLs and stores them in a database,
  * with no concept of a crawl frontier, WARC output, or other advanced features
@@ -128,6 +132,7 @@ public class SimpleLinkScraper implements AutoCloseable {
         var robotsRequest = HttpRequest.newBuilder(rootUrl.withPathAndParam("/robots.txt", null).asURI())
                 .GET()
                 .header("User-Agent", WmsaHome.getUserAgent().uaString())
+                .header("Accept-Encoding","gzip")
                 .timeout(readTimeout);
 
         // Fetch the robots.txt
@@ -135,9 +140,10 @@ public class SimpleLinkScraper implements AutoCloseable {
         try {
             SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
             HttpResponse<byte[]> robotsTxt = client.send(robotsRequest.build(), HttpResponse.BodyHandlers.ofByteArray());
+
             if (robotsTxt.statusCode() == 200) {
                 return parser.parseContent(rootUrl.toString(),
-                        robotsTxt.body(),
+                        getResponseData(robotsTxt),
                         robotsTxt.headers().firstValue("Content-Type").orElse("text/plain"),
                         WmsaHome.getUserAgent().uaIdentifier());
             }
@@ -161,18 +167,19 @@ public class SimpleLinkScraper implements AutoCloseable {
                 .GET()
                 .header("User-Agent", WmsaHome.getUserAgent().uaString())
                 .header("Accept", "text/html")
+                .header("Accept-Encoding", "gzip")
                 .timeout(readTimeout)
                 .build();
 
         try {
-            HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());
+            HttpResponse<byte[]> response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
 
             // Handle rate limiting by waiting and retrying once
             if (response.statusCode() == 429) {
                 timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
                         response.headers().firstValue("Retry-After").orElse("5")
                 ));
-                response = client.send(request, HttpResponse.BodyHandlers.ofString());
+                response = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
             }
 
             String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();
@@ -182,12 +189,14 @@ public class SimpleLinkScraper implements AutoCloseable {
                 return new FetchResult.Error(parsedUrl);
             }
 
-            String body = response.body();
-            if (body.length() > 1024 * 1024) {
+            byte[] body = getResponseData(response);
+            if (body.length > 1024 * 1024) {
                 return new FetchResult.Error(parsedUrl);
             }
 
-            return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
+            String bodyText = DocumentBodyToString.getStringData(ContentType.parse(contentType), body);
+
+            return new FetchResult.Success(domainId, parsedUrl, bodyText, headersToString(response.headers()));
         }
     }
     catch (IOException ex) {
@@ -198,6 +207,19 @@ public class SimpleLinkScraper implements AutoCloseable {
         return new FetchResult.Error(parsedUrl);
     }
 
+    private byte[] getResponseData(HttpResponse<byte[]> response) throws IOException {
+        String encoding = response.headers().firstValue("Content-Encoding").orElse("");
+
+        if ("gzip".equals(encoding)) {
+            try (var stream = new GZIPInputStream(new ByteArrayInputStream(response.body()))) {
+                return stream.readAllBytes();
+            }
+        }
+        else {
+            return response.body();
+        }
+    }
+
     sealed interface FetchResult {
         record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
         record Error(EdgeUrl url) implements FetchResult {}
@@ -0,0 +1,68 @@
+package nu.marginalia.livecrawler;
+
+import nu.marginalia.db.DomainBlacklistImpl;
+import nu.marginalia.io.SerializableCrawlDataStream;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import org.apache.commons.io.FileUtils;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.List;
+
+class SimpleLinkScraperTest {
+    private Path tempDir;
+    private LiveCrawlDataSet dataSet;
+
+    @BeforeEach
+    public void setUp() throws IOException, SQLException {
+        tempDir = Files.createTempDirectory(getClass().getSimpleName());
+        dataSet = new LiveCrawlDataSet(tempDir);
+    }
+
+
+    @AfterEach
+    public void tearDown() throws Exception {
+        dataSet.close();
+        FileUtils.deleteDirectory(tempDir.toFile());
+    }
+
+    @Test
+    public void testRetrieveNow() throws Exception {
+        var scraper = new SimpleLinkScraper(dataSet, null, Mockito.mock(DomainBlacklistImpl.class));
+        scraper.retrieveNow(new EdgeDomain("www.marginalia.nu"), 1, List.of("https://www.marginalia.nu/"));
+
+        var streams = dataSet.getDataStreams();
+        Assertions.assertEquals(1, streams.size());
+
+        SerializableCrawlDataStream firstStream = streams.iterator().next();
+        Assertions.assertTrue(firstStream.hasNext());
+
+        if (firstStream.next() instanceof CrawledDomain domain) {
+            Assertions.assertEquals("www.marginalia.nu", domain.getDomain());
+        }
+        else {
+            Assertions.fail();
+        }
+
+        Assertions.assertTrue(firstStream.hasNext());
+
+        if ((firstStream.next() instanceof CrawledDocument document)) {
+            // verify we decompress the body string
+            Assertions.assertTrue(document.documentBody.startsWith("<!doctype"));
+        }
+        else {
+            Assertions.fail();
+        }
+
+        Assertions.assertFalse(firstStream.hasNext());
+    }
+}
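The new test fetches https://www.marginalia.nu/ and asserts that the stored document body starts with "<!doctype", which exercises the decompression path end to end; it presumably needs network access (and a gzip-capable response from that server) to pass.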