MarginaliaSearch/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java
Viktor Lofgren aebb2652e8 (wip) Extract and encode spans data
Refactoring keyword extraction to extract spans information.

Modifying the intermediate storage of converted data to use the new slop library, which allows for easier storage of ad-hoc binary data like spans and positions.

This is a bit of a Katamari Damacy commit that ended up dragging along a bunch of other fairly tangentially related changes that are hard to break out into separate commits after the fact.  Will push as-is to get back to being able to do more isolated work.
2024-07-27 11:44:13 +02:00
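To make the idea concrete, here is a minimal sketch of what flattening span data into a binary-friendly layout can look like. This is illustrative only: the record and method names below are made up for the example and are not the slop library's actual API.

import java.util.List;

// Illustrative sketch only: these names are hypothetical, not the slop library's API.
// A span marks a tagged region of a document as a (kind, start, end) triple; flattening a
// document's spans into parallel primitive arrays gives a compact, binary-friendly layout.
record EncodedSpans(byte[] kinds, int[] starts, int[] ends) {

    record Span(byte kind, int start, int end) {}

    static EncodedSpans encode(List<Span> spans) {
        byte[] kinds = new byte[spans.size()];
        int[] starts = new int[spans.size()];
        int[] ends = new int[spans.size()];
        for (int i = 0; i < spans.size(); i++) {
            Span s = spans.get(i);
            kinds[i] = s.kind();
            starts[i] = s.start();
            ends[i] = s.end();
        }
        return new EncodedSpans(kinds, starts, ends);
    }
}

Parallel primitive arrays like these can be written and read back as fixed-width columns, which is the kind of ad-hoc binary data (spans and positions) the commit message refers to.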

package nu.marginalia.converting;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.model.html.HtmlStandard;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.file.Path;
import java.time.LocalTime;
import java.util.*;
import static org.junit.jupiter.api.Assertions.*;

public class ConvertingIntegrationTest {

    private DomainProcessor domainProcessor;

    @BeforeEach
    public void setUp() {
        Injector injector = Guice.createInjector(
                new ConvertingIntegrationTestModule()
        );
        domainProcessor = injector.getInstance(DomainProcessor.class);
    }
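
    // An empty crawl should still produce an ACTIVE domain record with no documents.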
    @Test
    public void testEmptyDomain() {
        var docs = new ArrayList<CrawledDocument>();
        var domain = new CrawledDomain("memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
                docs, Collections.emptyList());
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(domain));

        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
        assertTrue(ret.documents.isEmpty());
    }
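
    // The publication year packed into the document metadata should agree with details.pubYear.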
    @Test
    public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));

        ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
            int year = PubDate.fromYearByte(doc.details.metadata.year());
            Integer yearMeta = doc.details.pubYear;
            if (yearMeta != null) {
                assertEquals(year, (int) yearMeta, doc.url.toString());
            }
        });
    }
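
    // Full processing of the memex.marginalia.nu fixture set should yield a healthy number of
    // OK documents with non-trivial titles and descriptions, all identified as HTML5.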
    @Test
    public void testMemexMarginaliaNuFullProcessing() throws IOException {
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
        assertNotNull(ret);

        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
        assertFalse(ret.documents.isEmpty());

        Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
        ret.documents.forEach(doc -> {
            resultsByStatusCount.merge(doc.state, 1, Integer::sum);
        });
        assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);

        for (var doc : ret.documents) {
            if (!doc.isProcessedFully()) {
                continue;
            }

            var details = doc.details;
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
            assertEquals(HtmlStandard.HTML5, details.standard);
        }
    }
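
    // Sideload processing exposes the documents as a stream rather than a fully materialized
    // list; the results should otherwise look much like those from full processing.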
    @Test
    public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
        var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100);
        assertNotNull(ret);

        assertEquals("memex.marginalia.nu", ret.id());

        var domain = ret.getDomain();
        assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));

        List<ProcessedDocument> docsAll = new ArrayList<>();
        Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
        ret.getDocumentsStream().forEachRemaining(docsAll::add);
        assertTrue(docsAll.size() > 25);

        docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));
        assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);

        for (var doc : docsAll) {
            if (!doc.isProcessedFully()) {
                continue;
            }

            var details = doc.details;
            assertTrue(details.metadata.size() > 0);
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
            assertEquals(HtmlStandard.HTML5, details.standard);
        }
    }
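
    // Reads the memex-marginalia fixture index from the classpath and builds a CrawledDomain
    // from the documents it lists.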
    private CrawledDomain readMarginaliaWorkingSet() throws IOException {
        String index = readClassPathFile("memex-marginalia/index");
        String[] files = index.split("\n");

        var docs = new ArrayList<CrawledDocument>();

        for (String file : files) {
            Path p = Path.of("memex-marginalia/").resolve(file);

            var doc = new CrawledDocument("1",
                    "https://memex.marginalia.nu/" + file,
                    "text/html",
                    LocalTime.now().toString(),
                    200,
                    "OK",
                    "",
                    "",
                    readClassPathFile(p.toString()),
                    Double.toString(Math.random()),
                    "https://memex.marginalia.nu/" + file,
                    null,
                    "",
                    false,
                    null,
                    null
            );
            docs.add(doc);
        }

        return new CrawledDomain(
                "memex.marginalia.nu",
                null,
                "OK",
                "",
                "127.0.0.1",
                docs, Collections.emptyList());
    }
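
    // Reads a test resource from the classpath into a String.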
    private String readClassPathFile(String s) throws IOException {
        return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());
    }
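
    // Wraps the domain record and its documents in the stream format the DomainProcessor consumes.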
    private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) {
        List<SerializableCrawlData> data = new ArrayList<>();

        data.add(domain);
        if (domain.doc != null) {
            data.addAll(domain.doc);
        }

        return SerializableCrawlDataStream.fromIterator(data.iterator());
    }
}