mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 13:19:02 +00:00
(warc) Filter WarcResponses based on X-Robots-Tags
There really is no fantastic place to put this logic, but we need to drop entries whose X-Robots-Tag header indicates the page doesn't want to be crawled by Marginalia.
This commit is contained in:
parent 54ed3b86ba
commit 3113b5a551
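The filtering added here turns on how X-Robots-Tag response headers are interpreted: a bare directive such as noindex or none forbids every crawler, a "user-agent: directive" form only applies to the named crawler, and an agent-scoped "all" can re-permit indexing even when a general directive forbids it. The sketch below mirrors the isXRobotsTagsPermitted helper introduced by this commit; the class name, the isPermitted method, and the sample header values are illustrative only and are not part of the change.

import java.util.List;
import java.util.Objects;

// Illustrative sketch of the X-Robots-Tag precedence check; the committed helper is
// CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List<String>, String).
class XRobotsTagSketch {

    static boolean isPermitted(List<String> xRobotsHeaderTags, String userAgent) {
        boolean permittedGeneral = true;    // bare directives apply to every crawler
        boolean permittedForAgent = false;  // "<agent>: all" explicitly re-permits us
        boolean forbiddenForAgent = false;  // "<agent>: noindex" or "<agent>: none" explicitly forbids us

        for (String header : xRobotsHeaderTags) {
            int colon = header.indexOf(':');
            if (colon >= 0) {
                String agent = header.substring(0, colon).trim();
                String directive = header.substring(colon + 1);

                // directives scoped to some other crawler are ignored
                if (!Objects.equals(agent, userAgent))
                    continue;

                if (directive.contains("noindex") || directive.contains("none"))
                    forbiddenForAgent = true;
                else if (directive.contains("all"))
                    permittedForAgent = true;
            }
            else if (header.contains("noindex") || header.contains("none")) {
                permittedGeneral = false;
            }
        }

        // agent-specific verdicts take precedence over the general one
        if (permittedForAgent) return true;
        if (forbiddenForAgent) return false;
        return permittedGeneral;
    }

    public static void main(String[] args) {
        String ua = "search.marginalia.nu";
        System.out.println(isPermitted(List.of("noindex"), ua));                              // false
        System.out.println(isPermitted(List.of("googlebot: noindex"), ua));                   // true, scoped to another crawler
        System.out.println(isPermitted(List.of("noindex", "search.marginalia.nu: all"), ua)); // true, agent-scoped "all" wins
        System.out.println(isPermitted(List.of("search.marginalia.nu: none"), ua));           // false
    }
}
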
@@ -15,6 +15,7 @@ java {
 dependencies {
     implementation project(':code:common:model')
     implementation project(':code:common:db')
+    implementation project(':code:common:config')
     implementation project(':code:common:process')
     implementation project(':code:libraries:big-string')
     implementation project(':code:api:index-api')
@@ -33,6 +34,7 @@ dependencies {
     implementation libs.jwarc
     implementation libs.gson
     implementation libs.commons.io
+    implementation libs.commons.lang3
     implementation libs.okhttp3
     implementation libs.jsoup
     implementation libs.snakeyaml
@@ -1,9 +1,11 @@
 package nu.marginalia.crawling.parquet;
 
 import blue.strategic.parquet.ParquetWriter;
+import nu.marginalia.UserAgent;
 import nu.marginalia.crawling.body.DocumentBodyExtractor;
 import nu.marginalia.crawling.body.DocumentBodyResult;
 import nu.marginalia.crawling.body.HttpFetchResult;
+import org.apache.commons.lang3.StringUtils;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -12,24 +14,35 @@ import java.io.IOException;
 import java.net.URI;
 import java.nio.file.Path;
 import java.time.Instant;
+import java.util.List;
+import java.util.Objects;
 
 public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
     private final ParquetWriter<CrawledDocumentParquetRecord> writer;
     private static final Logger logger = LoggerFactory.getLogger(CrawledDocumentParquetRecordFileWriter.class);
 
-    public static void convertWarc(String domain, Path warcInputFile, Path parquetOutputFile) {
+    public static void convertWarc(String domain,
+                                   UserAgent userAgent,
+                                   Path warcInputFile,
+                                   Path parquetOutputFile) {
         try (var warcReader = new WarcReader(warcInputFile);
              var parquetWriter = new CrawledDocumentParquetRecordFileWriter(parquetOutputFile)
         ) {
            WarcXResponseReference.register(warcReader);
            WarcXEntityRefused.register(warcReader);
 
+           String uaString = userAgent.uaString();
+
            for (var record : warcReader) {
                if (record instanceof WarcResponse response) {
                    // this also captures WarcXResponseReference, which inherits from WarcResponse
                    // and is used to store old responses from previous crawls; in this part of the logic
                    // we treat them the same as a normal response
 
+                   if (!filterResponse(uaString, response)) {
+                       continue;
+                   }
+
                    parquetWriter.write(domain, response);
                }
                else if (record instanceof WarcXEntityRefused refused) {
@@ -45,6 +58,26 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
         }
     }
 
+    /** Return true if the WarcResponse should be kept and converted, false if it should be skipped */
+    private static boolean filterResponse(String uaString, WarcResponse response) throws IOException {
+
+        // We don't want to store robots.txt files, as they are not
+        // interesting for the analysis we want to do. This is important
+        // since txt-files in general are interesting, and we don't want to
+        // exclude them as a class.
+
+        if (response.targetURI().getPath().equals("/robots.txt")) {
+            return false;
+        }
+
+        var robotsTags = response.http().headers().all("X-Robots-Tag");
+        if (!isXRobotsTagsPermitted(robotsTags, uaString)) {
+            return false;
+        }
+
+        return true;
+    }
+
     private void write(String domain, WarcXEntityRefused refused) throws IOException {
         URI profile = refused.profile();
 
@@ -98,15 +131,6 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
             return;
         }
 
-        // We don't want to store robots.txt files, as they are not
-        // interesting for the analysis we want to do. This is important
-        // since txt-files in general are interesting, and we don't want to
-        // exclude them as a class.
-
-        if (fetchOk.uri().getPath().equals("/robots.txt")) {
-            return;
-        }
-
         byte[] bodyBytes;
         String contentType;
 
@@ -172,4 +196,52 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable {
                 new byte[0]
         );
     }
+
+
+    /** Check X-Robots-Tag header tag to see if we are allowed to index this page.
+     * <p>
+     * Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
+     *
+     * @param xRobotsHeaderTags List of X-Robots-Tag values
+     * @param userAgent User agent string
+     * @return true if we are allowed to index this page
+     */
+    // Visible for tests
+    public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
+        boolean isPermittedGeneral = true;
+        boolean isPermittedMarginalia = false;
+        boolean isForbiddenMarginalia = false;
+
+        for (String header : xRobotsHeaderTags) {
+            if (header.indexOf(':') >= 0) {
+                String[] parts = StringUtils.split(header, ":", 2);
+
+                if (parts.length < 2)
+                    continue;
+
+                // Is this relevant to us?
+                if (!Objects.equals(parts[0].trim(), userAgent))
+                    continue;
+
+                if (parts[1].contains("noindex"))
+                    isForbiddenMarginalia = true;
+                else if (parts[1].contains("none"))
+                    isForbiddenMarginalia = true;
+                else if (parts[1].contains("all"))
+                    isPermittedMarginalia = true;
+            }
+            else {
+                if (header.contains("noindex"))
+                    isPermittedGeneral = false;
+                if (header.contains("none"))
+                    isPermittedGeneral = false;
+            }
+        }
+
+        if (isPermittedMarginalia)
+            return true;
+        if (isForbiddenMarginalia)
+            return false;
+        return isPermittedGeneral;
+    }
 }
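Because the filter now runs inside the parquet conversion, every convertWarc caller has to supply the crawler's UserAgent so agent-scoped X-Robots-Tag directives can be matched against the right token, which is what the CrawlerMain and test changes below do. A small usage sketch follows; the ConvertWarcExample class, the file paths, and the user-agent string are illustrative placeholders rather than values from this commit.

import java.nio.file.Path;

import nu.marginalia.UserAgent;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;

class ConvertWarcExample {
    public static void main(String[] args) {
        // placeholder paths; in the crawler these point at the per-domain work files
        Path warcFile = Path.of("crawl-data/www.marginalia.nu.warc.gz");
        Path parquetFile = Path.of("crawl-data/www.marginalia.nu.parquet");

        // the UserAgent travels with the conversion so filterResponse() can match
        // agent-scoped X-Robots-Tag directives against its uaString()
        CrawledDocumentParquetRecordFileWriter.convertWarc(
                "www.marginalia.nu",
                new UserAgent("search.marginalia.nu"),
                warcFile,
                parquetFile);
    }
}
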
@@ -3,6 +3,7 @@ package nu.marginalia.converting;
 import com.google.inject.Guice;
 import com.google.inject.Injector;
 import lombok.SneakyThrows;
+import nu.marginalia.UserAgent;
 import nu.marginalia.WmsaHome;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.converting.processor.DomainProcessor;
@@ -268,7 +269,9 @@ public class CrawlingThenConvertingIntegrationTest {
             new CrawlerRetreiver(httpFetcher, new DomainProber(domainBlacklist), specs, recorder).fetch();
         }
 
-        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain, fileName, fileName2);
+        CrawledDocumentParquetRecordFileWriter.convertWarc(specs.domain,
+                new UserAgent("test"),
+                fileName, fileName2);
 
         try (var reader = new ParquetSerializableCrawlDataStream(fileName2)) {
             while (reader.hasNext()) {
@@ -51,6 +51,7 @@ import static nu.marginalia.mqapi.ProcessInboxNames.CRAWLER_INBOX;
 public class CrawlerMain {
     private final static Logger logger = LoggerFactory.getLogger(CrawlerMain.class);
 
+    private final UserAgent userAgent;
     private final ProcessHeartbeatImpl heartbeat;
     private final MessageQueueFactory messageQueueFactory;
     private final DomainProber domainProber;
@@ -78,6 +79,7 @@ public class CrawlerMain {
                       DbCrawlSpecProvider dbCrawlSpecProvider,
                       AnchorTagsSourceFactory anchorTagsSourceFactory,
                       Gson gson) {
+        this.userAgent = userAgent;
         this.heartbeat = heartbeat;
         this.messageQueueFactory = messageQueueFactory;
         this.domainProber = domainProber;
@@ -245,7 +247,7 @@ public class CrawlerMain {
             reference.delete();
 
             CrawledDocumentParquetRecordFileWriter
-                    .convertWarc(domain, newWarcFile, parquetFile);
+                    .convertWarc(domain, userAgent, newWarcFile, parquetFile);
 
             workLog.setJobToFinished(domain, parquetFile.toString(), size);
             heartbeat.setProgress(tasksDone.incrementAndGet() / (double) totalTasks);
@@ -192,54 +192,6 @@ public class HttpFetcherImpl implements HttpFetcher {
         return new HttpFetchResult.ResultNone();
     }
 
-    /** Check X-Robots-Tag header tag to see if we are allowed to index this page.
-     * <p>
-     * Reference: <a href="https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag">https://developers.google.com/search/docs/crawling-indexing/robots-meta-tag</a>
-     *
-     * @param xRobotsHeaderTags List of X-Robots-Tag values
-     * @param userAgent User agent string
-     * @return true if we are allowed to index this page
-     */
-    // Visible for tests
-    public static boolean isXRobotsTagsPermitted(List<String> xRobotsHeaderTags, String userAgent) {
-        boolean isPermittedGeneral = true;
-        boolean isPermittedMarginalia = false;
-        boolean isForbiddenMarginalia = false;
-
-        for (String header : xRobotsHeaderTags) {
-            if (header.indexOf(':') >= 0) {
-                String[] parts = StringUtils.split(header, ":", 2);
-
-                if (parts.length < 2)
-                    continue;
-
-                // Is this relevant to us?
-                if (!Objects.equals(parts[0].trim(), userAgent))
-                    continue;
-
-                if (parts[1].contains("noindex"))
-                    isForbiddenMarginalia = true;
-                else if (parts[1].contains("none"))
-                    isForbiddenMarginalia = true;
-                else if (parts[1].contains("all"))
-                    isPermittedMarginalia = true;
-            }
-            else {
-                if (header.contains("noindex"))
-                    isPermittedGeneral = false;
-                if (header.contains("none"))
-                    isPermittedGeneral = false;
-            }
-        }
-
-        if (isPermittedMarginalia)
-            return true;
-        if (isForbiddenMarginalia)
-            return false;
-        return isPermittedGeneral;
-    }
-
-
     @Override
     public SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder) {
         return fetchRobotsForProto("https", recorder, domain)
@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
+import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.Test;
 
 import java.util.List;
@@ -7,30 +8,30 @@ import java.util.List;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
-class HttpFetcherImplTest {
+class CrawledDocumentParquetRecordFileWriterTest {
 
     @Test
     public void testXRobotsTag() {
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("foo:"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":bar"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(":"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of(""), "search.marginalia.nu"));
 
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("doindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: noindex"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("googlebot: noindex"), "search.marginalia.nu"));
 
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
-        assertTrue(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
-        assertFalse(HttpFetcherImpl.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("noindex", "search.marginalia.nu: all"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: all"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("none", "search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("all", "search.marginalia.nu: none"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "noindex"), "search.marginalia.nu"));
+        assertTrue(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: all", "none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "none"), "search.marginalia.nu"));
+        assertFalse(CrawledDocumentParquetRecordFileWriter.isXRobotsTagsPermitted(List.of("search.marginalia.nu: none", "all"), "search.marginalia.nu"));
     }
 
 }
@@ -1,5 +1,6 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
+import nu.marginalia.UserAgent;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
@@ -19,7 +20,6 @@ import java.nio.file.Path;
 import java.security.NoSuchAlgorithmException;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.zip.GZIPInputStream;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
@@ -130,7 +130,11 @@ class WarcRecorderTest {
                 .get().build());
         client.close();
 
-        CrawledDocumentParquetRecordFileWriter.convertWarc("www.marginalia.nu", fileNameWarc, fileNameParquet);
+        CrawledDocumentParquetRecordFileWriter.convertWarc(
+                "www.marginalia.nu",
+                new UserAgent("test"),
+                fileNameWarc,
+                fileNameParquet);
 
         var urls = CrawledDocumentParquetRecordFileReader.stream(fileNameParquet).map(doc -> doc.url).toList();
         assertEquals(3, urls.size());