Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(live-crawler) Keep track of bad URLs
To avoid hammering the same invalid URLs for up to two months, URLs that fail to fetch correctly are, on a dice roll, added to a bad-URLs table that prevents further attempts at fetching them.
parent 4d23fe6261
commit 52eb5bc84f
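For intuition, the dice roll is a geometric trial: with a 20% chance of flagging per failure, a URL that keeps failing is expected to be fetched about five times (1 / 0.2) before it lands in badUrls, while a one-off transient failure will usually not be flagged at all. Below is a minimal sketch of that decision, using assumed names (dataSet, docUrl) standing in for the crawler state changed in the diff that follows:

    // Sketch only: decide whether to suppress a URL that just failed to fetch.
    // With p = 0.2, a persistently bad URL survives ~5 attempts on average
    // before it is recorded in the badUrls table and skipped thereafter.
    // (ThreadLocalRandom is java.util.concurrent.ThreadLocalRandom.)
    double flagProbability = 0.2;
    if (ThreadLocalRandom.current().nextDouble(0, 1) < flagProbability) {
        dataSet.flagAsBad(docUrl);
    }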
@@ -34,6 +34,7 @@ public class LiveCrawlDataSet implements AutoCloseable {
         try (var stmt = connection.createStatement()) {
             stmt.execute("CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY, domainId LONG, body BLOB, headers BLOB, ip TEXT, timestamp long)");
             stmt.execute("CREATE INDEX IF NOT EXISTS domainIdIndex ON urls (domainId)");
+            stmt.execute("CREATE TABLE IF NOT EXISTS badUrls (url TEXT PRIMARY KEY, timestamp long)");
         }
     }

@@ -47,12 +48,24 @@ public class LiveCrawlDataSet implements AutoCloseable {
             stmt.setLong(1, cutoff.toEpochMilli());
             stmt.executeUpdate();
         }
+
+        try (var stmt = connection.prepareStatement("DELETE FROM badUrls WHERE timestamp < ?")) {
+            stmt.setLong(1, cutoff.toEpochMilli());
+            stmt.executeUpdate();
+        }
     }

     /** Check if the given URL is already in the database */
     public boolean hasUrl(String url) throws SQLException {
-        try (var stmt = connection.prepareStatement("SELECT 1 FROM urls WHERE url = ?")) {
+        try (var stmt = connection.prepareStatement("""
+                SELECT 1 FROM urls WHERE urls.url = ?
+                UNION
+                SELECT 1 FROM badUrls WHERE badUrls.url = ?
+                """);
+        ) {
             stmt.setString(1, url);
+            stmt.setString(2, url);
+
             return stmt.executeQuery().next();
         }
     }
@@ -79,6 +92,22 @@ public class LiveCrawlDataSet implements AutoCloseable {
         }
     }

+    /** Flag a URL as bad, i.e. it should not be revisited */
+    public void flagAsBad(EdgeUrl url) {
+        try (var stmt = connection.prepareStatement("""
+                INSERT OR IGNORE INTO badUrls (url, timestamp)
+                VALUES (?, ?)
+                """))
+        {
+            stmt.setString(1, url.toString());
+            stmt.setLong(2, Instant.now().toEpochMilli());
+            stmt.executeUpdate();
+        }
+        catch (SQLException ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
     private byte[] compress(String data) throws IOException {
         // gzip compression
         try (var bos = new ByteArrayOutputStream();
@@ -11,6 +11,8 @@ import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.util.SimpleBlockingThreadPool;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.net.URISyntaxException;
@@ -21,12 +23,15 @@ import java.net.http.HttpResponse;
 import java.time.Duration;
 import java.util.List;
 import java.util.Optional;
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;

 /** A simple link scraper that fetches URLs and stores them in a database,
  * with no concept of a crawl frontier, WARC output, or other advanced features
  */
 public class SimpleLinkScraper implements AutoCloseable {
+    private static final Logger logger = LoggerFactory.getLogger(SimpleLinkScraper.class);
+
     private final SimpleBlockingThreadPool pool = new SimpleBlockingThreadPool("LiveCrawler", 32, 10);
     private final LinkParser lp = new LinkParser();
     private final LiveCrawlDataSet dataSet;
@@ -81,7 +86,24 @@ public class SimpleLinkScraper implements AutoCloseable {
                     continue;
                 }

-                fetchUrl(domainId, parsedUrl, timer, client);
+                switch (fetchUrl(domainId, parsedUrl, timer, client)) {
+                    case FetchResult.Success(int id, EdgeUrl docUrl, String body, String headers)
+                            -> dataSet.saveDocument(id, docUrl, body, headers, "");
+                    case FetchResult.Error(EdgeUrl docUrl) ->
+                    {
+                        // To give bad URLs a chance to be re-fetched, we only flag them as bad
+                        // with a 20% probability.  This will prevent the same bad URL being
+                        // re-fetched over and over again for several months, but still allow
+                        // us to *mostly* re-fetch it if it was just a transient error.
+
+                        // There's of course the chance we immediately flag it as bad on an
+                        // unlucky roll, but you know, that's xcom baby
+                        if (ThreadLocalRandom.current().nextDouble(0, 1) < 0.2) {
+                            dataSet.flagAsBad(docUrl);
+                        }
+                    }
+                }
+
             }
         }
     }
@@ -107,36 +129,56 @@ public class SimpleLinkScraper implements AutoCloseable {
         return rules;
     }

-    private void fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {
+    /** Fetch a URL and store it in the database
+     */
+    private FetchResult fetchUrl(int domainId, EdgeUrl parsedUrl, CrawlDelayTimer timer, HttpClient client) throws Exception {

         timer.waitFetchDelay();

-        // Loop for HTTP 429 retries
-        for (int i = 0; i < 2; i++) {
-            HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
-                    .GET()
-                    .header("User-Agent", WmsaHome.getUserAgent().uaString())
-                    .header("Accept", "text/html")
-                    .timeout(readTimeout)
-                    .build();
+        HttpRequest request = HttpRequest.newBuilder(parsedUrl.asURI())
+                .GET()
+                .header("User-Agent", WmsaHome.getUserAgent().uaString())
+                .header("Accept", "text/html")
+                .timeout(readTimeout)
+                .build();

+        try {
             HttpResponse<String> response = client.send(request, HttpResponse.BodyHandlers.ofString());

+            // Handle rate limiting by waiting and retrying once
             if (response.statusCode() == 429) {
                 timer.waitRetryDelay(new HttpFetcherImpl.RateLimitException(
                         response.headers().firstValue("Retry-After").orElse("5")
                 ));
-                continue;
+                response = client.send(request, HttpResponse.BodyHandlers.ofString());
             }

             String contentType = response.headers().firstValue("Content-Type").orElse("").toLowerCase();

-            if (response.statusCode() == 200 && contentType.startsWith("text/html")) {
-                dataSet.saveDocument(domainId, parsedUrl, response.body(), headersToString(response.headers()), "");
-            }
+            if (response.statusCode() == 200) {
+                if (!contentType.toLowerCase().startsWith("text/html")) {
+                    return new FetchResult.Error(parsedUrl);
+                }

-            break;
+                String body = response.body();
+                if (body.length() > 1024 * 1024) {
+                    return new FetchResult.Error(parsedUrl);
+                }
+
+                return new FetchResult.Success(domainId, parsedUrl, body, headersToString(response.headers()));
+            }
         }
+        catch (IOException ex) {
+            // We don't want a full stack trace on every error, as it's quite common and very noisy
+            logger.error("Error fetching URL {}: {} {}", parsedUrl, ex.getClass().getSimpleName(), ex.getMessage());
+        }
+
+        return new FetchResult.Error(parsedUrl);
+    }
+
+    sealed interface FetchResult {
+        record Success(int domainId, EdgeUrl url, String body, String headers) implements FetchResult {}
+        record Error(EdgeUrl url) implements FetchResult {}
     }

     private String headersToString(HttpHeaders headers) {
@@ -18,8 +18,7 @@ public class LiveCrawlDataSetTest {
     @Test
     public void testGetDataSet() throws Exception {
         Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
-        try {
-            LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir);
+        try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {

             Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
             dataSet.saveDocument(
@@ -65,4 +64,29 @@ public class LiveCrawlDataSetTest {
         }
     }

+    @Test
+    public void testHasUrl() throws Exception {
+        Path tempDir = Files.createTempDirectory("live-crawl-data-set-test");
+        try (LiveCrawlDataSet dataSet = new LiveCrawlDataSet(tempDir)) {
+            Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/"));
+            dataSet.saveDocument(
+                    1,
+                    new EdgeUrl("https://www.example.com/saved"),
+                    "test",
+                    "test",
+                    "test"
+            );
+            Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/saved"));
+
+            dataSet.flagAsBad(new EdgeUrl("https://www.example.com/bad"));
+
+            Assertions.assertTrue(dataSet.hasUrl("https://www.example.com/bad"));
+
+            Assertions.assertFalse(dataSet.hasUrl("https://www.example.com/notPresent"));
+        }
+        finally {
+            FileUtils.deleteDirectory(tempDir.toFile());
+        }
+    }
+
 }