(warc) Integrate the crawler's content type parsing and charset logic into the WarcSideloader

This commit is contained in:
Viktor Lofgren 2023-12-07 15:26:01 +01:00
parent 064265b0b9
commit fabffa80f0
3 changed files with 51 additions and 15 deletions

View File

@ -58,6 +58,7 @@ dependencies {
implementation project(':code:features-crawl:crawl-blocklist')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-crawl:content-type')
testImplementation project(':code:libraries:term-frequency-dict')
testImplementation project(':code:process-models:crawl-spec')

View File

@ -2,6 +2,8 @@ package nu.marginalia.converting.sideload.warc;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
@ -11,31 +13,32 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import org.netpreserve.jwarc.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.StreamSupport;
public class WarcSideloader implements SideloadSource, AutoCloseable {
private final Path warcFile;
private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class);
private final SideloaderProcessing sideloaderProcessing;
private final WarcReader reader;
private final EdgeDomain domain;
public WarcSideloader(Path warcFile,
SideloaderProcessing sideloaderProcessing)
throws IOException
{
this.warcFile = warcFile;
this.sideloaderProcessing = sideloaderProcessing;
this.reader = new WarcReader(warcFile);
this.domain = sniffDomainFromWarc()
@ -82,6 +85,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
.map(WarcResponse.class::cast)
.filter(this::isRelevantResponse)
.map(this::process)
.filter(Optional::isPresent)
.map(Optional::get)
.iterator();
}
@ -109,8 +114,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
}
@SneakyThrows
private ProcessedDocument process(WarcResponse response) {
String body = getBody(response);
private Optional<ProcessedDocument> process(WarcResponse response) {
Optional<String> body = getBody(response);
String url = response.target();
// We trim "/index.html"-suffixes from the index if they are present,
@ -119,18 +124,32 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
url = url.substring(0, url.length() - "index.html".length());
}
return sideloaderProcessing
.processDocument(url, body, List.of(), new DomainLinks(),
if (body.isEmpty()) {
return Optional.empty();
}
return Optional.of(sideloaderProcessing
.processDocument(url, body.get(), List.of(), new DomainLinks(),
GeneratorType.DOCS,
10_000);
10_000));
}
// Reads the HTTP body of a WARC response and returns it as text.
// NOTE(review): this span looks like an unmarked diff hunk -- two signatures and two
// bodies for getBody are listed back to back. The comments below describe each line
// as shown; confirm against the real file which lines are current.
@SneakyThrows
private String getBody(WarcResponse response) {
private Optional<String> getBody(WarcResponse response) {
var http = response.http();
// TODO: We should support additional encodings here
// This variant decodes the whole body as UTF-8 without looking at the Content-Type header.
return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8);
// This variant opens the body in try-with-resources, so it is closed after the read.
try (var body = http.body()) {
// contentType is null when the response carries no Content-Type header.
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();
// The header value and the raw bytes are both handed to the content type parser;
// its result is then used to turn the bytes into a String.
// NOTE(review): null handling inside parseContentType is not visible here -- confirm.
var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
}
catch (Exception ex) {
// Any exception from reading or decoding is logged at info level and not rethrown.
logger.info("Failed to parse body", ex);
}
// Reached only after the catch block has run: an empty Optional replaces the body.
return Optional.empty();
}
@Override

View File

@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.warc;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import nu.marginalia.converting.ConverterModule;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.processor.ConverterDomainTypes;
import nu.marginalia.converting.sideload.SideloaderProcessing;
import org.junit.jupiter.api.AfterEach;
@ -16,7 +18,11 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.when;
class WarcSideloaderTest extends AbstractModule {
@ -53,13 +59,23 @@ class WarcSideloaderTest extends AbstractModule {
throw new RuntimeException(e);
}
try (var sideloader = new WarcSideloader(warcFile, processing)) {
ProcessedDomain domain;
List<ProcessedDocument> docs = new ArrayList<>();
var domain = sideloader.getDomain();
System.out.println(domain);
sideloader.getDocumentsStream().forEachRemaining(System.out::println);
try (var sideloader = new WarcSideloader(warcFile, processing)) {
domain = sideloader.getDomain();
sideloader.getDocumentsStream().forEachRemaining(docs::add);
} catch (Exception e) {
throw new RuntimeException(e);
}
assertNotNull(domain);
assertEquals(3, docs.size());
List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList();
assertEquals(List.of(
"https://www.marginalia.nu/",
"https://www.marginalia.nu/log/93_atags/",
"https://www.marginalia.nu/links/"),
fetchedUrls);
}
}