diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index 8c6e92d2..f3c6227d 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -65,8 +65,7 @@ public class SideloadSourceFactory { public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException { return sideload(pathToDbFiles, new PathSuffixPredicate(".db"), - (List<Path> paths) -> new RedditSideloader(paths, - anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing)); + (List<Path> paths) -> new RedditSideloader(paths, anchorTextKeywords, sideloaderProcessing)); } public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index b0b2c014..98133bcf 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival; +import nu.marginalia.ContentTypes; import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.lsh.EasyLSH; import nu.marginalia.model.crawldata.CrawledDocument; @@ -43,6 +44,9 @@ public class CrawlDataReference implements AutoCloseable { try { while (data.hasNext()) { if (data.next() instanceof CrawledDocument doc) { + if (!ContentTypes.isAccepted(doc.contentType)) + continue; + return doc; } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java 
b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index c6b426b3..ace2059b 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -317,26 +317,24 @@ public class CrawlerRetreiver implements AutoCloseable { long probeStart = System.currentTimeMillis(); - /* - probing is on probation for now while we evaluate how much the added delays slows down the crawler - if (probeType == HttpFetcher.ProbeType.FULL) { + retryLoop: for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { try { var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags); - if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Ok ok) { - url = ok.resolvedUrl(); // If we were redirected while probing, use the final URL for fetching - break; - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType badContentType) { - return new HttpFetchResult.ResultNone(); - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout) { - return new HttpFetchResult.ResultException(timeout.ex()); - } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Exception exception) { - return new HttpFetchResult.ResultException(exception.ex()); - } - else { // should be unreachable - throw new IllegalStateException("Unknown probe result"); + switch (probeResult) { + case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl): + url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching + break retryLoop; + case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType: + return new HttpFetchResult.ResultNone(); + case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout: + return new HttpFetchResult.ResultException(timeout.ex()); + case HttpFetcher.ContentTypeProbeResult.Exception exception: + 
return new HttpFetchResult.ResultException(exception.ex()); + default: // should be unreachable + throw new IllegalStateException("Unknown probe result"); } } catch (HttpFetcherImpl.RateLimitException ex) { @@ -348,8 +346,8 @@ public class CrawlerRetreiver implements AutoCloseable { } } - timer.waitFetchDelay(System.currentTimeMillis() - probeStart); - }*/ + timer.waitFetchDelay(System.currentTimeMillis() - probeStart); + } for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) { diff --git a/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java new file mode 100644 index 00000000..dbc1989c --- /dev/null +++ b/code/processes/crawling-process/model/java/nu/marginalia/ContentTypes.java @@ -0,0 +1,22 @@ +package nu.marginalia; + +import java.util.Set; + +public class ContentTypes { + public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml", + "application/xhtml", + "text/html", + "image/x-icon", + "text/plain"); + + public static boolean isAccepted(String contentTypeHeader) { + String lcHeader = contentTypeHeader.toLowerCase(); + for (var type : acceptedContentTypes) { + if (lcHeader.startsWith(type)) { + return true; + } + } + return false; + } + +} diff --git a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java index f231c703..9474c2ff 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java @@ -1,6 +1,7 @@ package nu.marginalia.parquet.crawldata; import blue.strategic.parquet.ParquetWriter; +import nu.marginalia.ContentTypes; import nu.marginalia.UserAgent; import 
nu.marginalia.model.body.DocumentBodyExtractor; import nu.marginalia.model.body.DocumentBodyResult; @@ -62,6 +63,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { } } + + /** Return true if the WarcResponse should be excluded from conversion */ private static boolean filterResponse(String uaString, WarcResponse response) throws IOException { @@ -74,14 +77,25 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return false; } - var robotsTags = response.http().headers().all("X-Robots-Tag"); + var headers = response.http().headers(); + var robotsTags = headers.all("X-Robots-Tag"); + if (!isXRobotsTagsPermitted(robotsTags, uaString)) { return false; } + // Strip out responses with content types we aren't interested in + // (though ideally we wouldn't download these at all) + String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase(); + + if (!ContentTypes.isAccepted(contentType)) { + return false; + } + return true; } + private void write(String domain, WarcXEntityRefused refused) throws IOException { URI profile = refused.profile(); diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index d6d407bf..b2a0f2bc 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -157,10 +157,10 @@ class WarcRecorderTest { fileNameParquet); var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList(); - assertEquals(3, urls.size()); + assertEquals(2, urls.size()); assertEquals("https://www.marginalia.nu/", urls.get(0)); assertEquals("https://www.marginalia.nu/log/", urls.get(1)); - assertEquals("https://www.marginalia.nu/sanic.png", 
urls.get(2)); + // sanic.png gets filtered out for its bad mime type }