mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(warc) Integrate the crawler's content type parsing and charset logic into the WarcSideloader
This commit is contained in:
parent
064265b0b9
commit
fabffa80f0
@ -58,6 +58,7 @@ dependencies {
|
|||||||
|
|
||||||
implementation project(':code:features-crawl:crawl-blocklist')
|
implementation project(':code:features-crawl:crawl-blocklist')
|
||||||
implementation project(':code:features-crawl:link-parser')
|
implementation project(':code:features-crawl:link-parser')
|
||||||
|
implementation project(':code:features-crawl:content-type')
|
||||||
|
|
||||||
testImplementation project(':code:libraries:term-frequency-dict')
|
testImplementation project(':code:libraries:term-frequency-dict')
|
||||||
testImplementation project(':code:process-models:crawl-spec')
|
testImplementation project(':code:process-models:crawl-spec')
|
||||||
|
@ -2,6 +2,8 @@ package nu.marginalia.converting.sideload.warc;
|
|||||||
|
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
import nu.marginalia.atags.model.DomainLinks;
|
import nu.marginalia.atags.model.DomainLinks;
|
||||||
|
import nu.marginalia.contenttype.ContentTypeParser;
|
||||||
|
import nu.marginalia.contenttype.DocumentBodyToString;
|
||||||
import nu.marginalia.converting.model.GeneratorType;
|
import nu.marginalia.converting.model.GeneratorType;
|
||||||
import nu.marginalia.converting.model.ProcessedDocument;
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
import nu.marginalia.converting.model.ProcessedDomain;
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
@ -11,31 +13,32 @@ import nu.marginalia.model.EdgeDomain;
|
|||||||
import nu.marginalia.model.EdgeUrl;
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||||
import org.netpreserve.jwarc.*;
|
import org.netpreserve.jwarc.*;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.StreamSupport;
|
|
||||||
|
|
||||||
public class WarcSideloader implements SideloadSource, AutoCloseable {
|
public class WarcSideloader implements SideloadSource, AutoCloseable {
|
||||||
|
|
||||||
private final Path warcFile;
|
private static final Logger logger = LoggerFactory.getLogger(WarcSideloader.class);
|
||||||
|
|
||||||
private final SideloaderProcessing sideloaderProcessing;
|
private final SideloaderProcessing sideloaderProcessing;
|
||||||
|
|
||||||
private final WarcReader reader;
|
private final WarcReader reader;
|
||||||
|
|
||||||
private final EdgeDomain domain;
|
private final EdgeDomain domain;
|
||||||
|
|
||||||
|
|
||||||
public WarcSideloader(Path warcFile,
|
public WarcSideloader(Path warcFile,
|
||||||
SideloaderProcessing sideloaderProcessing)
|
SideloaderProcessing sideloaderProcessing)
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
this.warcFile = warcFile;
|
|
||||||
this.sideloaderProcessing = sideloaderProcessing;
|
this.sideloaderProcessing = sideloaderProcessing;
|
||||||
this.reader = new WarcReader(warcFile);
|
this.reader = new WarcReader(warcFile);
|
||||||
this.domain = sniffDomainFromWarc()
|
this.domain = sniffDomainFromWarc()
|
||||||
@ -82,6 +85,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
|||||||
.map(WarcResponse.class::cast)
|
.map(WarcResponse.class::cast)
|
||||||
.filter(this::isRelevantResponse)
|
.filter(this::isRelevantResponse)
|
||||||
.map(this::process)
|
.map(this::process)
|
||||||
|
.filter(Optional::isPresent)
|
||||||
|
.map(Optional::get)
|
||||||
.iterator();
|
.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -109,8 +114,8 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private ProcessedDocument process(WarcResponse response) {
|
private Optional<ProcessedDocument> process(WarcResponse response) {
|
||||||
String body = getBody(response);
|
Optional<String> body = getBody(response);
|
||||||
String url = response.target();
|
String url = response.target();
|
||||||
|
|
||||||
// We trim "/index.html"-suffixes from the index if they are present,
|
// We trim "/index.html"-suffixes from the index if they are present,
|
||||||
@ -119,18 +124,32 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
|
|||||||
url = url.substring(0, url.length() - "index.html".length());
|
url = url.substring(0, url.length() - "index.html".length());
|
||||||
}
|
}
|
||||||
|
|
||||||
return sideloaderProcessing
|
if (body.isEmpty()) {
|
||||||
.processDocument(url, body, List.of(), new DomainLinks(),
|
return Optional.empty();
|
||||||
|
}
|
||||||
|
|
||||||
|
return Optional.of(sideloaderProcessing
|
||||||
|
.processDocument(url, body.get(), List.of(), new DomainLinks(),
|
||||||
GeneratorType.DOCS,
|
GeneratorType.DOCS,
|
||||||
10_000);
|
10_000));
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
@SneakyThrows
|
||||||
private String getBody(WarcResponse response) {
|
private Optional<String> getBody(WarcResponse response) {
|
||||||
var http = response.http();
|
var http = response.http();
|
||||||
|
|
||||||
// TODO: We should support additional encodings here
|
// TODO: We should support additional encodings here
|
||||||
return new String(http.body().stream().readAllBytes(), StandardCharsets.UTF_8);
|
try (var body = http.body()) {
|
||||||
|
String contentType = http.headers().first("Content-Type").orElse(null);
|
||||||
|
byte[] bytes = body.stream().readAllBytes();
|
||||||
|
|
||||||
|
var ct = ContentTypeParser.parseContentType(contentType, bytes);
|
||||||
|
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
|
||||||
|
}
|
||||||
|
catch (Exception ex) {
|
||||||
|
logger.info("Failed to parse body", ex);
|
||||||
|
}
|
||||||
|
return Optional.empty();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -3,6 +3,8 @@ package nu.marginalia.converting.sideload.warc;
|
|||||||
import com.google.inject.AbstractModule;
|
import com.google.inject.AbstractModule;
|
||||||
import com.google.inject.Guice;
|
import com.google.inject.Guice;
|
||||||
import nu.marginalia.converting.ConverterModule;
|
import nu.marginalia.converting.ConverterModule;
|
||||||
|
import nu.marginalia.converting.model.ProcessedDocument;
|
||||||
|
import nu.marginalia.converting.model.ProcessedDomain;
|
||||||
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
import nu.marginalia.converting.processor.ConverterDomainTypes;
|
||||||
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
import nu.marginalia.converting.sideload.SideloaderProcessing;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
@ -16,7 +18,11 @@ import java.net.URI;
|
|||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
class WarcSideloaderTest extends AbstractModule {
|
class WarcSideloaderTest extends AbstractModule {
|
||||||
@ -53,13 +59,23 @@ class WarcSideloaderTest extends AbstractModule {
|
|||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
try (var sideloader = new WarcSideloader(warcFile, processing)) {
|
ProcessedDomain domain;
|
||||||
|
List<ProcessedDocument> docs = new ArrayList<>();
|
||||||
|
|
||||||
var domain = sideloader.getDomain();
|
try (var sideloader = new WarcSideloader(warcFile, processing)) {
|
||||||
System.out.println(domain);
|
domain = sideloader.getDomain();
|
||||||
sideloader.getDocumentsStream().forEachRemaining(System.out::println);
|
sideloader.getDocumentsStream().forEachRemaining(docs::add);
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
assertNotNull(domain);
|
||||||
|
assertEquals(3, docs.size());
|
||||||
|
List<String> fetchedUrls = docs.stream().map(doc -> doc.url).map(Object::toString).toList();
|
||||||
|
assertEquals(List.of(
|
||||||
|
"https://www.marginalia.nu/",
|
||||||
|
"https://www.marginalia.nu/log/93_atags/",
|
||||||
|
"https://www.marginalia.nu/links/"),
|
||||||
|
fetchedUrls);
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue
Block a user