(warc) Integrate the crawler's content type parsing and charset logic into the WarcSideloader

This commit is contained in:
Viktor Lofgren 2023-12-07 15:26:01 +01:00
parent 064265b0b9
commit fabffa80f0
3 changed files with 51 additions and 15 deletions

View File

@ -58,6 +58,7 @@ dependencies {
implementation project(':code:features-crawl:crawl-blocklist') implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser') implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-crawl:content-type')
testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:process-models:crawl-spec') testImplementation project(':code:process-models:crawl-spec')

View File

@ -2,6 +2,8 @@ package nu.marginalia.converting.sideload.warc;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
@ -11,31 +13,32 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import org.netpreserve.jwarc.*; import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.StreamSupport;
public class WarcSideloader implements SideloadSource, AutoCloseable { public class WarcSideloader implements SideloadSource, AutoCloseable {
private final Path warcFile; private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class);
private final SideloaderProcessing sideloaderProcessing; private final SideloaderProcessing sideloaderProcessing;
private final WarcReader reader; private final WarcReader reader;
private final EdgeDomain domain; private final EdgeDomain domain;
public WarcSideloader(Path warcFile, public WarcSideloader(Path warcFile,
SideloaderProcessing sideloaderProcessing) SideloaderProcessing sideloaderProcessing)
throws IOException throws IOException
{ {
this.warcFile = warcFile;
this.sideloaderProcessing = sideloaderProcessing; this.sideloaderProcessing = sideloaderProcessing;
this.reader = new WarcReader(warcFile); this.reader = new WarcReader(warcFile);
this.domain = sniffDomainFromWarc() this.domain = sniffDomainFromWarc()
@ -82,6 +85,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
.map(WarcResponse.class::cast) .map(WarcResponse.class::cast)
.filter(this::isRelevantResponse) .filter(this::isRelevantResponse)
.map(this::process) .map(this::process)
.filter(Optional::isPresent)
.map(Optional::get)
.iterator(); .iterator();
} }
@ -109,8 +114,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
} }
@SneakyThrows @SneakyThrows
private ProcessedDocument process(WarcResponse response) { private Optional<ProcessedDocument> process(WarcResponse response) {
String body = getBody(response); Optional<String> body = getBody(response);
String url = response.target(); String url = response.target();
// We trim "/index.html"-suffixes from the index if they are present, // We trim "/index.html"-suffixes from the index if they are present,
@ -119,18 +124,32 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
url = url.substring(0, url.length() - "index.html".length()); url = url.substring(0, url.length() - "index.html".length());
} }
return sideloaderProcessing if (body.isEmpty()) {
.processDocument(url, body, List.of(), new DomainLinks(), return Optional.empty();
}
return Optional.of(sideloaderProcessing
.processDocument(url, body.get(), List.of(), new DomainLinks(),
GeneratorType.DOCS, GeneratorType.DOCS,
10_000); 10_000));
} }
@SneakyThrows @SneakyThrows
private String getBody(WarcResponse response) { private Optional<String> getBody(WarcResponse response) {
var http = response.http(); var http = response.http();
// TODO: We should support additional encodings here // TODO: We should support additional encodings here
return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8); try (var body = http.body()) {
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();
var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
}
catch (Exception ex) {
logger.info("Failed to parse body", ex);
}
return Optional.empty();
} }
@Override @Override

View File

@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.warc;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import com.google.inject.Guice; import com.google.inject.Guice;
import nu.marginalia.converting.ConverterModule; import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.ConverterDomainTypes; import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.sideload.SideloaderProcessing; import nu.marginalia.converting.sideload.SideloaderProcessing;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
@ -16,7 +18,11 @@ import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.when; import static org.mockito.Mockito.when;
class WarcSideloaderTest extends AbstractModule { class WarcSideloaderTest extends AbstractModule {
@ -53,13 +59,23 @@ class WarcSideloaderTest extends AbstractModule {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
try (var sideloader = new WarcSideloader(warcFile, processing)) { ProcessedDomain domain;
List<ProcessedDocument> docs = new ArrayList<>();
var domain = sideloader.getDomain(); try (var sideloader = new WarcSideloader(warcFile, processing)) {
System.out.println(domain); domain = sideloader.getDomain();
sideloader.getDocumentsStream().forEachRemaining(System.out::println); sideloader.getDocumentsStream().forEachRemaining(docs::add);
} catch (Exception e) { } catch (Exception e) {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
assertNotNull(domain);
assertEquals(3, docs.size());
List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList();
assertEquals(List.of(
"https://www.marginalia.nu/",
"https://www.marginalia.nu/log/93_atags/",
"https://www.marginalia.nu/links/"),
fetchedUrls);
} }
} }