Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 05:18:58 +00:00

commit e65d75a0f9 (parent 3b99cffb3d)

(crawler) Reintroduce content type probing and clean out bad content type data from the existing crawl sets
SideloadSourceFactory:

@@ -65,8 +65,7 @@ public class SideloadSourceFactory {
     public SideloadSource sideloadReddit(Path pathToDbFiles) throws IOException {
         return sideload(pathToDbFiles,
                 new PathSuffixPredicate(".db"),
-                (List<Path> paths) -> new RedditSideloader(paths,
-                        anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing));
+                (List<Path> paths) -> new RedditSideloader(paths, anchorTextKeywords, sideloaderProcessing));
     }
 
     public Collection<? extends SideloadSource> sideloadStackexchange(Path pathToDbFileRoot) throws IOException {
CrawlDataReference:

@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival;
 
+import nu.marginalia.ContentTypes;
 import nu.marginalia.io.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
 import nu.marginalia.model.crawldata.CrawledDocument;
@@ -43,6 +44,9 @@ public class CrawlDataReference implements AutoCloseable {
         try {
             while (data.hasNext()) {
                 if (data.next() instanceof CrawledDocument doc) {
+                    if (!ContentTypes.isAccepted(doc.contentType))
+                        continue;
+
                     return doc;
                 }
             }
CrawlerRetreiver:

@@ -317,26 +317,24 @@ public class CrawlerRetreiver implements AutoCloseable {
 
         long probeStart = System.currentTimeMillis();
 
-        /*
-         probing is on probation for now while we evaluate how much the added delays slows down the crawler
-
         if (probeType == HttpFetcher.ProbeType.FULL) {
+            retryLoop:
             for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
                 try {
                     var probeResult = fetcher.probeContentType(url, warcRecorder, contentTags);
 
-                    if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Ok ok) {
-                        url = ok.resolvedUrl(); // If we were redirected while probing, use the final URL for fetching
-                        break;
-                    } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType badContentType) {
-                        return new HttpFetchResult.ResultNone();
-                    } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout) {
-                        return new HttpFetchResult.ResultException(timeout.ex());
-                    } else if (probeResult instanceof HttpFetcher.ContentTypeProbeResult.Exception exception) {
-                        return new HttpFetchResult.ResultException(exception.ex());
-                    }
-                    else { // should be unreachable
-                        throw new IllegalStateException("Unknown probe result");
+                    switch (probeResult) {
+                        case HttpFetcher.ContentTypeProbeResult.Ok(EdgeUrl resolvedUrl):
+                            url = resolvedUrl; // If we were redirected while probing, use the final URL for fetching
+                            break retryLoop;
+                        case HttpFetcher.ContentTypeProbeResult.BadContentType badContentType:
+                            return new HttpFetchResult.ResultNone();
+                        case HttpFetcher.ContentTypeProbeResult.BadContentType.Timeout timeout:
+                            return new HttpFetchResult.ResultException(timeout.ex());
+                        case HttpFetcher.ContentTypeProbeResult.Exception exception:
+                            return new HttpFetchResult.ResultException(exception.ex());
+                        default: // should be unreachable
+                            throw new IllegalStateException("Unknown probe result");
                     }
                 }
                 catch (HttpFetcherImpl.RateLimitException ex) {
@@ -348,8 +346,8 @@ public class CrawlerRetreiver implements AutoCloseable {
                 }
             }
 
             timer.waitFetchDelay(System.currentTimeMillis() - probeStart);
-        }*/
+        }
 
 
         for (int i = 0; i <= HTTP_429_RETRY_LIMIT; i++) {
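The reinstated probe handling above dispatches on the probe result with a pattern-matching switch and uses a labeled break so that a successful probe exits the whole retry loop rather than just the switch. A minimal, self-contained sketch of that idiom follows; the ProbeResult, Ok and BadContentType types here are hypothetical stand-ins for illustration, not Marginalia's actual classes.

    // Sketch only: hypothetical types illustrating the labeled-break + pattern-switch idiom (Java 21+).
    sealed interface ProbeResult permits Ok, BadContentType {}
    record Ok(String resolvedUrl) implements ProbeResult {}
    record BadContentType(String contentType) implements ProbeResult {}

    class ProbeRetryDemo {
        /** Returns the resolved URL of the first successful probe, or null on a bad content type. */
        static String resolve(java.util.List<ProbeResult> attempts, String initialUrl) {
            String url = initialUrl;
            retryLoop:
            for (ProbeResult result : attempts) {
                switch (result) {
                    case Ok(String resolvedUrl):
                        url = resolvedUrl;  // keep the post-redirect URL
                        break retryLoop;    // leave the retry loop, not just the switch
                    case BadContentType bad:
                        return null;        // unwanted content type: give up entirely
                }
            }
            return url;
        }
    }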
ContentTypes (new file):

@@ -0,0 +1,22 @@
+package nu.marginalia;
+
+import java.util.Set;
+
+public class ContentTypes {
+    public static final Set<String> acceptedContentTypes = Set.of("application/xhtml+xml",
+            "application/xhtml",
+            "text/html",
+            "image/x-icon",
+            "text/plain");
+
+    public static boolean isAccepted(String contentTypeHeader) {
+        String lcHeader = contentTypeHeader.toLowerCase();
+        for (var type : acceptedContentTypes) {
+            if (lcHeader.startsWith(type)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+}
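Because isAccepted lower-cases the header and matches by prefix, Content-Type values carrying charset or other parameters still pass the filter, while anything outside the whitelist is rejected. A small usage sketch (not part of the commit) of the class as defined above:

    // Usage sketch, not part of the commit: prefix matching means parameterized
    // headers are accepted, while types outside the whitelist are rejected.
    class ContentTypesDemo {
        public static void main(String[] args) {
            System.out.println(nu.marginalia.ContentTypes.isAccepted("text/html; charset=UTF-8")); // true
            System.out.println(nu.marginalia.ContentTypes.isAccepted("TEXT/PLAIN"));               // true
            System.out.println(nu.marginalia.ContentTypes.isAccepted("image/png"));                // false
        }
    }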
CrawledDocumentParquetRecordFileWriter:

@@ -1,6 +1,7 @@
 package nu.marginalia.parquet.crawldata;
 
 import blue.strategic.parquet.ParquetWriter;
+import nu.marginalia.ContentTypes;
 import nu.marginalia.UserAgent;
 import nu.marginalia.model.body.DocumentBodyExtractor;
 import nu.marginalia.model.body.DocumentBodyResult;
@@ -62,6 +63,8 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
         }
     }
 
+
+
     /** Return true if the WarcResponse should be excluded from conversion */
     private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
 
@@ -74,14 +77,25 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return false;
         }
 
-        var robotsTags = response.http().headers().all("X-Robots-Tag");
+        var headers = response.http().headers();
+        var robotsTags = headers.all("X-Robots-Tag");
+
         if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
             return false;
         }
 
+        // Strip out responses with content types we aren't interested in
+        // (though ideally we wouldn't download these at all)
+        String contentType = headers.first("Content-Type").orElse("text/plain").toLowerCase();
+
+        if (!ContentTypes.isAccepted(contentType)) {
+            return false;
+        }
+
         return true;
     }
 
+
     private void write(String domain, WarcXEntityRefused refused) throws IOException {
         URI profile = refused.profile();
 
WarcRecorderTest:

@@ -157,10 +157,10 @@ class WarcRecorderTest {
                 fileNameParquet);
 
         var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
-        assertEquals(3, urls.size());
+        assertEquals(2, urls.size());
         assertEquals("https://www.marginalia.nu/", urls.get(0));
         assertEquals("https://www.marginalia.nu/log/", urls.get(1));
-        assertEquals("https://www.marginalia.nu/sanic.png", urls.get(2));
+        // sanic.jpg gets filtered out for its bad mime type
 
     }
 