mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 21:18:58 +00:00
(warc) Integrate the crawler's content type parsing and charset logic into the WarcSideloader
This commit is contained in:
parent
064265b0b9
commit
fabffa80f0
@ -58,6 +58,7 @@ dependencies {
|
||||
|
||||
implementation project(':code:features-crawl:crawl-blocklist')
|
||||
implementation project(':code:features-crawl:link-parser')
|
||||
+ implementation project(':code:features-crawl:content-type')
|
||||
|
||||
testImplementation project(':code:libraries:term-frequency-dict')
|
||||
testImplementation project(':code:process-models:crawl-spec')
|
||||
|
@ -2,6 +2,8 @@ package nu.marginalia.converting.sideload.warc;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
+ import nu.marginalia.contenttype.ContentTypeParser;
|
||||
+ import nu.marginalia.contenttype.DocumentBodyToString;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.converting.model.ProcessedDomain;
|
||||
@ -11,31 +13,32 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import org.netpreserve.jwarc.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||
|
||||
private final Path warcFile;
|
||||
private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class);
|
||||
|
||||
private final SideloaderProcessing sideloaderProcessing;
|
||||
|
||||
private final WarcReader reader;
|
||||
|
||||
private final EdgeDomain domain;
|
||||
|
||||
|
||||
public WarcSideloader(Path warcFile,
|
||||
SideloaderProcessing sideloaderProcessing)
|
||||
throws IOException
|
||||
{
|
||||
this.warcFile = warcFile;
|
||||
this.sideloaderProcessing = sideloaderProcessing;
|
||||
this.reader = new WarcReader(warcFile);
|
||||
this.domain = sniffDomainFromWarc()
|
||||
@ -82,6 +85,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||
.map(WarcResponse.class::cast)
|
||||
.filter(this::isRelevantResponse)
|
||||
.map(this::process)
|
||||
+ .filter(Optional::isPresent)
|
||||
+ .map(Optional::get)
|
||||
.iterator();
|
||||
}
|
||||
|
||||
@ -109,8 +114,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
- private ProcessedDocument process(WarcResponse response) {
|
||||
- String body = getBody(response);
|
||||
+ private Optional<ProcessedDocument> process(WarcResponse response) {
|
||||
+ Optional<String> body = getBody(response);
|
||||
String url = response.target();
|
||||
|
||||
// We trim "/index.html"-suffixes from the index if they are present,
|
||||
@ -119,18 +124,32 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||
url = url.substring(0, url.length() - "index.html".length());
|
||||
}
|
||||
|
||||
- return sideloaderProcessing
|
||||
- .processDocument(url, body, List.of(), new DomainLinks(),
|
||||
+ if (body.isEmpty()) {
|
||||
+ return Optional.empty();
|
||||
+ }
|
||||
|
||||
+ return Optional.of(sideloaderProcessing
|
||||
+ .processDocument(url, body.get(), List.of(), new DomainLinks(),
|
||||
GeneratorType.DOCS,
|
||||
- 10_000);
|
||||
+ 10_000));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
- private String getBody(WarcResponse response) {
|
||||
+ private Optional<String> getBody(WarcResponse response) {
|
||||
var http = response.http();
|
||||
|
||||
- // TODO: We should support additional encodings here
|
||||
- return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8);
|
||||
+ try (var body = http.body()) {
|
||||
+ String contentType = http.headers().first("Content-Type").orElse(null);
|
||||
+ byte[] bytes = body.stream().readAllBytes();
|
||||
|
||||
+ var ct = ContentTypeParser.parseContentType(contentType, bytes);
|
||||
+ return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
|
||||
+ }
|
||||
+ catch (Exception ex) {
|
||||
+ logger.info("Failed to parse body", ex);
|
||||
+ }
|
||||
+ return Optional.empty();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.warc;
|
||||
import com.google.inject.AbstractModule;
|
||||
import com.google.inject.Guice;
|
||||
import nu.marginalia.converting.ConverterModule;
|
||||
+ import nu.marginalia.converting.model.ProcessedDocument;
|
||||
+ import nu.marginalia.converting.model.ProcessedDomain;
|
||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
@ -16,7 +18,11 @@ import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
+ import java.util.ArrayList;
|
||||
+ import java.util.List;
|
||||
|
||||
+ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
+ import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
class WarcSideloaderTest extends AbstractModule {
|
||||
@ -53,13 +59,23 @@ class WarcSideloaderTest extends AbstractModule {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
- try (var sideloader = new WarcSideloader(warcFile, processing)) {
|
||||
+ ProcessedDomain domain;
|
||||
+ List<ProcessedDocument> docs = new ArrayList<>();
|
||||
|
||||
- var domain = sideloader.getDomain();
|
||||
- System.out.println(domain);
|
||||
- sideloader.getDocumentsStream().forEachRemaining(System.out::println);
|
||||
+ try (var sideloader = new WarcSideloader(warcFile, processing)) {
|
||||
+ domain = sideloader.getDomain();
|
||||
+ sideloader.getDocumentsStream().forEachRemaining(docs::add);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
+ assertNotNull(domain);
|
||||
+ assertEquals(3, docs.size());
|
||||
+ List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList();
|
||||
+ assertEquals(List.of(
|
||||
+ "https://www.marginalia.nu/",
|
||||
+ "https://www.marginalia.nu/log/93_atags/",
|
||||
+ "https://www.marginalia.nu/links/"),
|
||||
+ fetchedUrls);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user