(warc) Filter WarcResponses based on X-Robots-Tags

There really is no ideal place to put this logic, but we need to remove entries whose X-Robots-Tag header indicates that the page should not be indexed by Marginalia.
This commit is contained in:
Viktor Lofgren 2023-12-16 15:57:10 +01:00
parent 54ed3b86ba
commit 3113b5a551
7 changed files with 117 additions and 81 deletions
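To make the new filtering rule concrete, here is a small illustrative sketch (not part of the commit) of how the relocated isXRobotsTagsPermitted check treats a few representative X-Robots-Tag values for the search.marginalia.nu user agent. The header values are made up for illustration; the expected results mirror the unit tests included in this commit.

    import java.util.List;

    import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;

    class XRobotsTagFilteringSketch {
        public static void main(String[] args) {
            String ua = "search.marginalia.nu";

            // An untargeted noindex/none forbids indexing for every crawler, including ours.
            System.out.println(CrawledDocumentParquetRecordFileWriter
                    .isXRobotsTagsPermitted(List.of("noindex"), ua));             // false

            // Directives aimed at another crawler are ignored.
            System.out.println(CrawledDocumentParquetRecordFileWriter
                    .isXRobotsTagsPermitted(List.of("googlebot: noindex"), ua));  // true

            // An agent-specific "all" overrides a general noindex ...
            System.out.println(CrawledDocumentParquetRecordFileWriter
                    .isXRobotsTagsPermitted(List.of("noindex", ua + ": all"), ua)); // true

            // ... while an agent-specific noindex/none always forbids indexing.
            System.out.println(CrawledDocumentParquetRecordFileWriter
                    .isXRobotsTagsPermitted(List.of(ua + ": noindex"), ua));      // false
        }
    }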

View File

@@ -15,6 +15,7 @@ java {
 dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
+    implementation project(':code:common:config')
     implementation project(':code:common:process')
     implementation project(':code:libraries:big-string')
     implementation project(':code:api:index-api')
@@ -33,6 +34,7 @@ dependencies {
     implementation libs.jwarc
     implementation libs.gson
     implementation libs.commons.io
+    implementation libs.commons.lang3
     implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.snakeyaml

View File

@@ -1,9 +1,11 @@
 package nu.marginalia.crawling.parquet;

 import blue.strategic.parquet.ParquetWriter;
+import nu.marginalia.UserAgent;
 import nu.marginalia.crawling.body.DocumentBodyExtractor;
 import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.crawling.body.HttpFetchResult;
+import org.apache.commons.lang3.StringUtils;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -12,24 +14,35 @@ import java.io.IOException;
 import java.net.URI;
 import java.nio.file.Path;
 import java.time.Instant;
+import java.util.List;
+import java.util.Objects;

 public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     private final ParquetWriter<CrawledDocumentParquetRecord> writer;
     private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);

-    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) {
+    public static void convertWarc(String domain,
+                                   UserAgent userAgent,
+                                   Path warcInputFile,
+                                   Path parquetOutputFile) {
         try (var warcReader = new WarcReader(warcInputFile);
              var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
         ) {
             WarcXResponseReference.register(warcReader);
             WarcXEntityRefused.register(warcReader);
+            String uaString = userAgent.uaString();

             for (var record : warcReader) {
                 if (record instanceof WarcResponse response) {
                     // this also captures WarcXResponseReference, which inherits from WarcResponse
                     // and is used to store old responses from previous crawls; in this part of the logic
                     // we treat them the same as a normal response
+                    if (!filterResponse(uaString, response)) {
+                        continue;
+                    }
                     parquetWriter.write(domain, response);
                 }
                 else if (record instanceof WarcXEntityRefused refused) {
@@ -45,6 +58,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
         }
     }

+    /** Return true if the WarcResponse should be kept in the conversion, false if it should be skipped */
+    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+
+        // We don't want to store robots.txt files, as they are not
+        // interesting for the analysis we want to do.  This is important
+        // since txt-files in general are interesting, and we don't want to
+        // exclude them as a class.
+        if (response.targetURI().getPath().equals("/robots.txt")) {
+            return false;
+        }
+
+        var robotsTags = response.http().headers().all("X-Robots-Tag");
+        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
+            return false;
+        }
+
+        return true;
+    }
+
     private void write(String domain, WarcXEntityRefused refused) throws IOException {
         URI profile = refused.profile();
@@ -98,15 +131,6 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return;
         }

-        // We don't want to store robots.txt files, as they are not
-        // interesting for the analysis we want to do. This is important
-        // since txt-files in general are interesting, and we don't want to
-        // exclude them as a class.
-        if (fetchOk.uri().getPath().equals("/robots.txt")) {
-            return;
-        }
-
         byte[] bodyBytes;
         String contentType;
@@ -172,4 +196,52 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 new byte[0]
         );
     }

+    /** Check the X-Robots-Tag header to see if we are allowed to index this page.
+     * <p>
+     * Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
+     *
+     * @param xRobotsHeaderTags List of X-Robots-Tag values
+     * @param userAgent User agent string
+     * @return true if we are allowed to index this page
+     */
+    // Visible for tests
+    public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
+        boolean isPermittedGeneral = true;
+        boolean isPermittedMarginalia = false;
+        boolean isForbiddenMarginalia = false;
+
+        for (String header : xRobotsHeaderTags) {
+            if (header.indexOf(':') >= 0) {
+                String[] parts = StringUtils.split(header, ":", 2);
+
+                if (parts.length < 2)
+                    continue;
+
+                // Is this relevant to us?
+                if (!Objects.equals(parts[0].trim(), userAgent))
+                    continue;
+
+                if (parts[1].contains("noindex"))
+                    isForbiddenMarginalia = true;
+                else if (parts[1].contains("none"))
+                    isForbiddenMarginalia = true;
+                else if (parts[1].contains("all"))
+                    isPermittedMarginalia = true;
+            }
+            else {
+                if (header.contains("noindex"))
+                    isPermittedGeneral = false;
+                if (header.contains("none"))
+                    isPermittedGeneral = false;
+            }
+        }
+
+        if (isPermittedMarginalia)
+            return true;
+        if (isForbiddenMarginalia)
+            return false;
+        return isPermittedGeneral;
+    }
 }
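For orientation, here is a minimal usage sketch (not part of the commit) of the updated convertWarc entry point. The domain and file paths are placeholders, and the UserAgent string is assumed to be the crawler's configured agent name, matching how CrawlerMain invokes this method further down.

    import java.nio.file.Path;

    import nu.marginalia.UserAgent;
    import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;

    class ConvertWarcSketch {
        public static void main(String[] args) {
            // Placeholder input/output paths for a single crawled domain.
            Path warcFile = Path.of("/tmp/www.example.com.warc.gz");
            Path parquetFile = Path.of("/tmp/www.example.com.parquet");

            // The UserAgent is passed through so that responses carrying an
            // X-Robots-Tag header addressed to this agent (or a blanket noindex/none)
            // are skipped during conversion, as are robots.txt fetches.
            CrawledDocumentParquetRecordFileWriter.convertWarc(
                    "www.example.com",
                    new UserAgent("search.marginalia.nu"),
                    warcFile,
                    parquetFile);
        }
    }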

View File

@@ -3,6 +3,7 @@ package nu.marginalia.converting;
 import com.google.inject.Guice;
 import com.google.inject.Injector;
 import lombok.SneakyThrows;
+import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
@@ -268,7 +269,9 @@
             new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
         }

-        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
+        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
+                new UserAgent("test"),
+                fileName, fileName2);

         try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
             while (reader.hasNext()) {

View File

@@ -51,6 +51,7 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
 public class CrawlerMain {
     private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);

+    private final UserAgent userAgent;
     private final ProcessHeartbeatImpl heartbeat;
     private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
@@ -78,6 +79,7 @@ public class CrawlerMain {
                        DbCrawlSpecProvider dbCrawlSpecProvider,
                        AnchorTagsSourceFactory anchorTagsSourceFactory,
                        Gson gson) {
+        this.userAgent = userAgent;
         this.heartbeat = heartbeat;
         this.messageQueueFactory = messageQueueFactory;
         this.domainProber = domainProber;
@@ -245,7 +247,7 @@
             reference.delete();

             CrawledDocumentParquetRecordFileWriter
-                    .convertWarc(domain, newWarcFile, parquetFile);
+                    .convertWarc(domain, userAgent, newWarcFile, parquetFile);

             workLog.setJobToFinished(domain, parquetFile.toString(), size);
             heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);

View File

@@ -192,54 +192,6 @@ public class HttpFetcherImpl implements HttpFetcher {
         return new HttpFetchResult.ResultNone();
     }

-    /** Check X-Robots-Tag header tag to see if we are allowed to index this page.
-     * <p>
-     * Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
-     *
-     * @param xRobotsHeaderTags List of X-Robots-Tag values
-     * @param userAgent User agent string
-     * @return true if we are allowed to index this page
-     */
-    // Visible for tests
-    public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
-        boolean isPermittedGeneral = true;
-        boolean isPermittedMarginalia = false;
-        boolean isForbiddenMarginalia = false;
-
-        for (String header : xRobotsHeaderTags) {
-            if (header.indexOf(':') >= 0) {
-                String[] parts = StringUtils.split(header, ":", 2);
-
-                if (parts.length < 2)
-                    continue;
-
-                // Is this relevant to us?
-                if (!Objects.equals(parts[0].trim(), userAgent))
-                    continue;
-
-                if (parts[1].contains("noindex"))
-                    isForbiddenMarginalia = true;
-                else if (parts[1].contains("none"))
-                    isForbiddenMarginalia = true;
-                else if (parts[1].contains("all"))
-                    isPermittedMarginalia = true;
-            }
-            else {
-                if (header.contains("noindex"))
-                    isPermittedGeneral = false;
-                if (header.contains("none"))
-                    isPermittedGeneral = false;
-            }
-        }
-
-        if (isPermittedMarginalia)
-            return true;
-        if (isForbiddenMarginalia)
-            return false;
-        return isPermittedGeneral;
-    }
-
     @Override
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
         return fetchRobotsForProto("https", recorder, domain)

View File

@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival.fetcher;

+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.Test;

 import java.util.List;
@@ -7,30 +8,30 @@ import java.util.List;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;

-class HttpFetcherImplTest {
+class CrawledDocumentParquetRecordFileWriterTest {

     @Test
     public void testXRobotsTag() {
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
     }
 }

View File

@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival.fetcher;

+import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
@@ -19,7 +20,6 @@ import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.zip.GZIPInputStream;

 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -130,7 +130,11 @@
                 .get().build());
         client.close();

-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu", fileNameWarc, fileNameParquet);
+        CrawledDocumentParquetRecordFileWriter.convertWarc(
+                "www.marginalia.nu",
+                new UserAgent("test"),
+                fileNameWarc,
+                fileNameParquet);

         var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
         assertEquals(3, urls.size());