mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 13:19:02 +00:00)
package nu.marginalia.converting;

import com.google.inject.Guice;
import com.google.inject.Injector;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;
import nu.marginalia.model.html.HtmlStandard;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Path;
import java.time.LocalTime;
import java.util.*;

import static org.junit.jupiter.api.Assertions.*;
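
/**
 * Integration tests for {@link DomainProcessor}: a snapshot of memex.marginalia.nu is read
 * from the test classpath and fed through full and sideload processing, and the converter's
 * output is sanity-checked.
 */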
@Tag("slow")
public class ConvertingIntegrationTest {

    private DomainProcessor domainProcessor;

    @BeforeEach
    public void setUp() {
        Injector injector = Guice.createInjector(
                new ConvertingIntegrationTestModule()
        );

        domainProcessor = injector.getInstance(DomainProcessor.class);
    }
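
    // A domain with no documents should still convert to an ACTIVE result for the correct
    // domain, with an empty document list.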
    @Test
    public void testEmptyDomain() {
        var docs = new ArrayList<CrawledDocument>();

        var domain = new CrawledDomain("memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
                docs, Collections.emptyList());
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(domain));

        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
        assertTrue(ret.documents.isEmpty());
    }
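
    // For every fully processed document, the publication year packed into the document
    // metadata must agree with the separately stored pubYear value.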
    @Test
    public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
        ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
            int year = PubDate.fromYearByte(doc.details.metadata.year());
            Integer yearMeta = doc.details.pubYear;
            if (yearMeta != null) {
                assertEquals(year, (int) yearMeta, doc.url.toString());
            }
        });
    }
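
    // Full processing of the working set should produce an ACTIVE domain with more than 25
    // OK documents, each with a non-trivial title and description and detected as HTML5.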
    @Test
    public void testMemexMarginaliaNuFullProcessing() throws IOException {
        var ret = domainProcessor.fullProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()));
        assertNotNull(ret);
        assertEquals(ret.state, DomainIndexingState.ACTIVE);
        assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));

        assertFalse(ret.documents.isEmpty());

        Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();

        ret.documents.forEach(doc -> {
            resultsByStatusCount.merge(doc.state, 1, Integer::sum);
        });

        assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);

        for (var doc : ret.documents) {

            if (!doc.isProcessedFully()) {
                continue;
            }

            var details = doc.details;

            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
            assertEquals(HtmlStandard.HTML5, details.standard);
        }
    }
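
    // Sideload processing of the same working set should give equivalent results: more than 25
    // OK documents, each with populated metadata, a non-trivial title and description, and HTML5.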
    @Test
    public void testMemexMarginaliaNuSideloadProcessing() throws IOException {
        var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100);
        assertNotNull(ret);
        assertEquals("memex.marginalia.nu", ret.id());

        var domain = ret.getDomain();
        assertEquals(domain.domain, new EdgeDomain("memex.marginalia.nu"));

        List<ProcessedDocument> docsAll = new ArrayList<>();
        Map<UrlIndexingState, Integer> resultsByStatusCount = new HashMap<>();
        ret.getDocumentsStream().forEachRemaining(docsAll::add);
        assertTrue(docsAll.size() > 25);

        docsAll.forEach(doc -> resultsByStatusCount.merge(doc.state, 1, Integer::sum));

        assertTrue(resultsByStatusCount.get(UrlIndexingState.OK) > 25);

        for (var doc : docsAll) {

            if (!doc.isProcessedFully()) {
                continue;
            }

            var details = doc.details;

            assertTrue(details.metadata.size() > 0);
            assertTrue(details.title.length() > 4);
            assertTrue(details.description.length() > 4);
            assertEquals(HtmlStandard.HTML5, details.standard);
        }
    }
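
    // Builds a CrawledDomain from the memex-marginalia test resources: the index file lists one
    // relative path per line, and each listed file is read from the classpath as the body of a
    // synthetic 200 OK CrawledDocument.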
    private CrawledDomain readMarginaliaWorkingSet() throws IOException {
        String index = readClassPathFile("memex-marginalia/index");
        String[] files = index.split("\n");

        var docs = new ArrayList<CrawledDocument>();

        for (String file : files) {
            Path p = Path.of("memex-marginalia/").resolve(file);

            var doc = new CrawledDocument("1",
                    "https://memex.marginalia.nu/" + file,
                    "text/html",
                    LocalTime.now().toString(),
                    200,
                    "OK",
                    "",
                    "",
                    readClassPathFile(p.toString()),
                    false,
                    null,
                    null
            );
            docs.add(doc);
        }

        return new CrawledDomain(
                "memex.marginalia.nu",
                null,
                "OK",
                "",
                "127.0.0.1",
                docs, Collections.emptyList());
    }
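
    // Reads a test resource from the system classpath into a String.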
    private String readClassPathFile(String s) throws IOException {
        return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());
    }
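
    // Wraps a CrawledDomain and its documents in the SerializableCrawlDataStream format that
    // DomainProcessor consumes, with the domain record first followed by its documents.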
    private SerializableCrawlDataStream asSerializableCrawlData(CrawledDomain domain) {
        List<SerializableCrawlData> data = new ArrayList<>();

        data.add(domain);

        if (domain.doc != null) {
            data.addAll(domain.doc);
        }

        return SerializableCrawlDataStream.fromIterator(data.iterator());
    }

}